diff --git a/.claude/rules/coding-style.md b/.claude/rules/coding-style.md new file mode 100644 index 00000000..a5febc56 --- /dev/null +++ b/.claude/rules/coding-style.md @@ -0,0 +1,153 @@ +--- +paths: + - '**/*.rs' +--- + +# Rust Coding Style + +> This file extends [common/coding-style.md](../common/coding-style.md) with Rust-specific content. + +## Formatting + +- **rustfmt** for enforcement — always run `cargo fmt` before committing +- **clippy** for lints — `cargo clippy -- -D warnings` (treat warnings as errors) +- 4-space indent (rustfmt default) +- Max line width: 100 characters (rustfmt default) + +## Immutability + +Rust variables are immutable by default — embrace this: + +- Use `let` by default; only use `let mut` when mutation is required +- Prefer returning new values over mutating in place +- Use `Cow<'_, T>` when a function may or may not need to allocate + +```rust +use std::borrow::Cow; + +// GOOD — immutable by default, new value returned +fn normalize(input: &str) -> Cow<'_, str> { + if input.contains(' ') { + Cow::Owned(input.replace(' ', "_")) + } else { + Cow::Borrowed(input) + } +} + +// BAD — unnecessary mutation +fn normalize_bad(input: &mut String) { + *input = input.replace(' ', "_"); +} +``` + +## Naming + +Follow standard Rust conventions: + +- `snake_case` for functions, methods, variables, modules, crates +- `PascalCase` (UpperCamelCase) for types, traits, enums, type parameters +- `SCREAMING_SNAKE_CASE` for constants and statics +- Lifetimes: short lowercase (`'a`, `'de`) — descriptive names for complex cases (`'input`) + +## Ownership and Borrowing + +- Borrow (`&T`) by default; take ownership only when you need to store or consume +- Never clone to satisfy the borrow checker without understanding the root cause +- Accept `&str` over `String`, `&[T]` over `Vec<T>` in function parameters +- Use `impl Into<String>` for constructors that need to own a `String` + +```rust +// GOOD — borrows when ownership isn't needed +fn word_count(text: &str) -> 
usize { + text.split_whitespace().count() +} + +// GOOD — takes ownership in constructor via Into +fn new(name: impl Into<String>) -> Self { + Self { name: name.into() } +} + +// BAD — takes String when &str suffices +fn word_count_bad(text: String) -> usize { + text.split_whitespace().count() +} +``` + +## Error Handling + +- Use `Result<T, E>` and `?` for propagation — never `unwrap()` in production code +- **Libraries**: define typed errors with `thiserror` +- **Applications**: use `anyhow` for flexible error context +- Add context with `.with_context(|| format!("failed to ..."))?` +- Reserve `unwrap()` / `expect()` for tests and truly unreachable states + +```rust +// GOOD — library error with thiserror +#[derive(Debug, thiserror::Error)] +pub enum ConfigError { + #[error("failed to read config: {0}")] + Io(#[from] std::io::Error), + #[error("invalid config format: {0}")] + Parse(String), +} + +// GOOD — application error with anyhow +use anyhow::Context; + +fn load_config(path: &str) -> anyhow::Result<Config> { + let content = std::fs::read_to_string(path) + .with_context(|| format!("failed to read {path}"))?; + toml::from_str(&content) + .with_context(|| format!("failed to parse {path}")) +} +``` + +## Iterators Over Loops + +Prefer iterator chains for transformations; use loops for complex control flow: + +```rust +// GOOD — declarative and composable +let active_emails: Vec<&str> = users.iter() + .filter(|u| u.is_active) + .map(|u| u.email.as_str()) + .collect(); + +// GOOD — loop for complex logic with early returns +for user in &users { + if let Some(verified) = verify_email(&user.email)? 
{ + send_welcome(&verified)?; + } +} +``` + +## Module Organization + +Organize by domain, not by type: + +```text +src/ +├── main.rs +├── lib.rs +├── auth/ # Domain module +│ ├── mod.rs +│ ├── token.rs +│ └── middleware.rs +├── orders/ # Domain module +│ ├── mod.rs +│ ├── model.rs +│ └── service.rs +└── db/ # Infrastructure + ├── mod.rs + └── pool.rs +``` + +## Visibility + +- Default to private; use `pub(crate)` for internal sharing +- Only mark `pub` what is part of the crate's public API +- Re-export public API from `lib.rs` + +## References + +See skill: `rust-patterns` for comprehensive Rust idioms and patterns. diff --git a/.claude/rules/hooks.md b/.claude/rules/hooks.md new file mode 100644 index 00000000..37edca90 --- /dev/null +++ b/.claude/rules/hooks.md @@ -0,0 +1,17 @@ +--- +paths: + - '**/*.rs' + - '**/Cargo.toml' +--- + +# Rust Hooks + +> This file extends [common/hooks.md](../common/hooks.md) with Rust-specific content. + +## PostToolUse Hooks + +Configure in `~/.claude/settings.json`: + +- **cargo fmt**: Auto-format `.rs` files after edit +- **cargo clippy**: Run lint checks after editing Rust files +- **cargo check**: Verify compilation after changes (faster than `cargo build`) diff --git a/.claude/rules/patterns.md b/.claude/rules/patterns.md new file mode 100644 index 00000000..e0f74cc7 --- /dev/null +++ b/.claude/rules/patterns.md @@ -0,0 +1,169 @@ +--- +paths: + - '**/*.rs' +--- + +# Rust Patterns + +> This file extends [common/patterns.md](../common/patterns.md) with Rust-specific content. + +## Repository Pattern with Traits + +Encapsulate data access behind a trait: + +```rust +pub trait OrderRepository: Send + Sync { + fn find_by_id(&self, id: u64) -> Result<Option<Order>, StorageError>; + fn find_all(&self) -> Result<Vec<Order>, StorageError>; + fn save(&self, order: &Order) -> Result<Order, StorageError>; + fn delete(&self, id: u64) -> Result<(), StorageError>; +} +``` + +Concrete implementations handle storage details (Postgres, SQLite, in-memory for tests). 
+ +## Service Layer + +Business logic in service structs; inject dependencies via constructor: + +```rust +pub struct OrderService { + repo: Box<dyn OrderRepository>, + payment: Box<dyn PaymentGateway>, +} + +impl OrderService { + pub fn new(repo: Box<dyn OrderRepository>, payment: Box<dyn PaymentGateway>) -> Self { + Self { repo, payment } + } + + pub fn place_order(&self, request: CreateOrderRequest) -> anyhow::Result<OrderSummary> { + let order = Order::from(request); + self.payment.charge(order.total())?; + let saved = self.repo.save(&order)?; + Ok(OrderSummary::from(saved)) + } +} +``` + +## Newtype Pattern for Type Safety + +Prevent argument mix-ups with distinct wrapper types: + +```rust +struct UserId(u64); +struct OrderId(u64); + +fn get_order(user: UserId, order: OrderId) -> anyhow::Result<Order> { + // Can't accidentally swap user and order IDs at call sites + todo!() +} +``` + +## Enum State Machines + +Model states as enums — make illegal states unrepresentable: + +```rust +enum ConnectionState { + Disconnected, + Connecting { attempt: u32 }, + Connected { session_id: String }, + Failed { reason: String, retries: u32 }, +} + +fn handle(state: &ConnectionState) { + match state { + ConnectionState::Disconnected => connect(), + ConnectionState::Connecting { attempt } if *attempt > 3 => abort(), + ConnectionState::Connecting { .. } => wait(), + ConnectionState::Connected { session_id } => use_session(session_id), + ConnectionState::Failed { retries, .. } if *retries < 5 => retry(), + ConnectionState::Failed { reason, .. } => log_failure(reason), + } +} +``` + +Always match exhaustively — no wildcard `_` for business-critical enums. 
+ +## Builder Pattern + +Use for structs with many optional parameters: + +```rust +pub struct ServerConfig { + host: String, + port: u16, + max_connections: usize, +} + +impl ServerConfig { + pub fn builder(host: impl Into<String>, port: u16) -> ServerConfigBuilder { + ServerConfigBuilder { + host: host.into(), + port, + max_connections: 100, + } + } +} + +pub struct ServerConfigBuilder { + host: String, + port: u16, + max_connections: usize, +} + +impl ServerConfigBuilder { + pub fn max_connections(mut self, n: usize) -> Self { + self.max_connections = n; + self + } + + pub fn build(self) -> ServerConfig { + ServerConfig { + host: self.host, + port: self.port, + max_connections: self.max_connections, + } + } +} +``` + +## Sealed Traits for Extensibility Control + +Use a private module to seal a trait, preventing external implementations: + +```rust +mod private { + pub trait Sealed {} +} + +pub trait Format: private::Sealed { + fn encode(&self, data: &[u8]) -> Vec<u8>; +} + +pub struct Json; +impl private::Sealed for Json {} +impl Format for Json { + fn encode(&self, data: &[u8]) -> Vec<u8> { todo!() } +} +``` + +## API Response Envelope + +Consistent API responses using a generic enum: + +```rust +#[derive(Debug, serde::Serialize)] +#[serde(tag = "status")] +pub enum ApiResponse<T> { + #[serde(rename = "ok")] + Ok { data: T }, + #[serde(rename = "error")] + Error { message: String }, +} +``` + +## References + +See skill: `rust-patterns` for comprehensive patterns including ownership, traits, generics, concurrency, and async. diff --git a/.claude/rules/security.md b/.claude/rules/security.md new file mode 100644 index 00000000..43e971ca --- /dev/null +++ b/.claude/rules/security.md @@ -0,0 +1,141 @@ +--- +paths: + - '**/*.rs' +--- + +# Rust Security + +> This file extends [common/security.md](../common/security.md) with Rust-specific content. 
+ +## Secrets Management + +- Never hardcode API keys, tokens, or credentials in source code +- Use environment variables: `std::env::var("API_KEY")` +- Fail fast if required secrets are missing at startup +- Keep `.env` files in `.gitignore` + +```rust +// BAD +const API_KEY: &str = "sk-abc123..."; + +// GOOD — environment variable with early validation +fn load_api_key() -> anyhow::Result<String> { + std::env::var("PAYMENT_API_KEY") + .context("PAYMENT_API_KEY must be set") +} +``` + +## SQL Injection Prevention + +- Always use parameterized queries — never format user input into SQL strings +- Use query builder or ORM (sqlx, diesel, sea-orm) with bind parameters + +```rust +// BAD — SQL injection via format string +let query = format!("SELECT * FROM users WHERE name = '{name}'"); +sqlx::query(&query).fetch_one(&pool).await?; + +// GOOD — parameterized query with sqlx +// Placeholder syntax varies by backend: Postgres: $1 | MySQL: ? | SQLite: $1 +sqlx::query("SELECT * FROM users WHERE name = $1") + .bind(&name) + .fetch_one(&pool) + .await?; +``` + +## Input Validation + +- Validate all user input at system boundaries before processing +- Use the type system to enforce invariants (newtype pattern) +- Parse, don't validate — convert unstructured data to typed structs at the boundary +- Reject invalid input with clear error messages + +```rust +// Parse, don't validate — invalid states are unrepresentable +pub struct Email(String); + +impl Email { + pub fn parse(input: &str) -> Result<Self, ValidationError> { + let trimmed = input.trim(); + let at_pos = trimmed.find('@') + .filter(|&p| p > 0 && p < trimmed.len() - 1) + .ok_or_else(|| ValidationError::InvalidEmail(input.to_string()))?; + let domain = &trimmed[at_pos + 1..]; + if trimmed.len() > 254 || !domain.contains('.') { + return Err(ValidationError::InvalidEmail(input.to_string())); + } + // For production use, prefer a validated email crate (e.g., `email_address`) + Ok(Self(trimmed.to_string())) + } + + pub fn as_str(&self) -> &str { + 
&self.0 + } +} +``` + +## Unsafe Code + +- Minimize `unsafe` blocks — prefer safe abstractions +- Every `unsafe` block must have a `// SAFETY:` comment explaining the invariant +- Never use `unsafe` to bypass the borrow checker for convenience +- Audit all `unsafe` code during review — it is a red flag without justification +- Prefer `safe` FFI wrappers around C libraries + +```rust +// GOOD — safety comment documents ALL required invariants +let widget: &Widget = { + // SAFETY: `ptr` is non-null, aligned, points to an initialized Widget, + // and no mutable references or mutations exist for its lifetime. + unsafe { &*ptr } +}; + +// BAD — no safety justification +unsafe { &*ptr } +``` + +## Dependency Security + +- Run `cargo audit` to scan for known CVEs in dependencies +- Run `cargo deny check` for license and advisory compliance +- Use `cargo tree` to audit transitive dependencies +- Keep dependencies updated — set up Dependabot or Renovate +- Minimize dependency count — evaluate before adding new crates + +```bash +# Security audit +cargo audit + +# Deny advisories, duplicate versions, and restricted licenses +cargo deny check + +# Inspect dependency tree +cargo tree +cargo tree -d # Show duplicates only +``` + +## Error Messages + +- Never expose internal paths, stack traces, or database errors in API responses +- Log detailed errors server-side; return generic messages to clients +- Use `tracing` or `log` for structured server-side logging + +```rust +// Map errors to appropriate status codes and generic messages +// (Example uses axum; adapt the response type to your framework) +match order_service.find_by_id(id) { + Ok(order) => Ok((StatusCode::OK, Json(order))), + Err(ServiceError::NotFound(_)) => { + tracing::info!(order_id = id, "order not found"); + Err((StatusCode::NOT_FOUND, "Resource not found")) + } + Err(e) => { + tracing::error!(order_id = id, error = %e, "unexpected error"); + Err((StatusCode::INTERNAL_SERVER_ERROR, "Internal server error")) + } 
+} +``` + +## References + +See skill: `rust-patterns` for unsafe code guidelines and ownership patterns. See skill: `security-review` for general security checklists. diff --git a/.claude/rules/testing.md b/.claude/rules/testing.md new file mode 100644 index 00000000..b27b0e4d --- /dev/null +++ b/.claude/rules/testing.md @@ -0,0 +1,156 @@ +--- +paths: + - '**/*.rs' +--- + +# Rust Testing + +> This file extends [common/testing.md](../common/testing.md) with Rust-specific content. + +## Test Framework + +- **`#[test]`** with `#[cfg(test)]` modules for unit tests +- **rstest** for parameterized tests and fixtures +- **proptest** for property-based testing +- **mockall** for trait-based mocking +- **`#[tokio::test]`** for async tests + +## Test Organization + +```text +my_crate/ +├── src/ +│ ├── lib.rs # Unit tests in #[cfg(test)] modules +│ ├── auth/ +│ │ └── mod.rs # #[cfg(test)] mod tests { ... } +│ └── orders/ +│ └── service.rs # #[cfg(test)] mod tests { ... } +├── tests/ # Integration tests (each file = separate binary) +│ ├── api_test.rs +│ ├── db_test.rs +│ └── common/ # Shared test utilities +│ └── mod.rs +└── benches/ # Criterion benchmarks + └── benchmark.rs +``` + +Unit tests go inside `#[cfg(test)]` modules in the same file. Integration tests go in `tests/`. 
+ +## Unit Test Pattern + +```rust +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn creates_user_with_valid_email() { + let user = User::new("Alice", "alice@example.com").unwrap(); + assert_eq!(user.name, "Alice"); + } + + #[test] + fn rejects_invalid_email() { + let result = User::new("Bob", "not-an-email"); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("invalid email")); + } +} +``` + +## Parameterized Tests + +```rust +use rstest::rstest; + +#[rstest] +#[case("hello", 5)] +#[case("", 0)] +#[case("rust", 4)] +fn test_string_length(#[case] input: &str, #[case] expected: usize) { + assert_eq!(input.len(), expected); +} +``` + +## Async Tests + +```rust +#[tokio::test] +async fn fetches_data_successfully() { + let client = TestClient::new().await; + let result = client.get("/data").await; + assert!(result.is_ok()); +} +``` + +## Mocking with mockall + +Define traits in production code; generate mocks in test modules: + +```rust +// Production trait — pub so integration tests can import it +pub trait UserRepository { + fn find_by_id(&self, id: u64) -> Option<User>; +} + +#[cfg(test)] +mod tests { + use super::*; + use mockall::predicate::eq; + + mockall::mock! 
{ + pub Repo {} + impl UserRepository for Repo { + fn find_by_id(&self, id: u64) -> Option<User>; + } + } + + #[test] + fn service_returns_user_when_found() { + let mut mock = MockRepo::new(); + mock.expect_find_by_id() + .with(eq(42)) + .times(1) + .returning(|_| Some(User { id: 42, name: "Alice".into() })); + + let service = UserService::new(Box::new(mock)); + let user = service.get_user(42).unwrap(); + assert_eq!(user.name, "Alice"); + } +} +``` + +## Test Naming + +Use descriptive names that explain the scenario: + +- `creates_user_with_valid_email()` +- `rejects_order_when_insufficient_stock()` +- `returns_none_when_not_found()` + +## Coverage + +- Target 80%+ line coverage +- Use **cargo-llvm-cov** for coverage reporting +- Focus on business logic — exclude generated code and FFI bindings + +```bash +cargo llvm-cov # Summary +cargo llvm-cov --html # HTML report +cargo llvm-cov --fail-under-lines 80 # Fail if below threshold +``` + +## Testing Commands + +```bash +cargo test # Run all tests +cargo test -- --nocapture # Show println output +cargo test test_name # Run tests matching pattern +cargo test --lib # Unit tests only +cargo test --test api_test # Specific integration test (tests/api_test.rs) +cargo test --doc # Doc tests only +``` + +## References + +See skill: `rust-testing` for comprehensive testing patterns including property-based testing, fixtures, and benchmarking with Criterion. diff --git a/.claude/skills/code-tour/SKILL.md b/.claude/skills/code-tour/SKILL.md new file mode 100644 index 00000000..2523699e --- /dev/null +++ b/.claude/skills/code-tour/SKILL.md @@ -0,0 +1,269 @@ +--- +name: code-tour +description: Create CodeTour `.tour` files — persona-targeted, step-by-step walkthroughs with real file and line anchors. Use for onboarding tours, architecture walkthroughs, PR tours, RCA tours, and structured "explain how this works" requests. 
+origin: ECC +--- + +# Code Tour + +Create **CodeTour** `.tour` files for codebase walkthroughs that open directly to real files and line ranges. Tours live in `.tours/` and are meant for the CodeTour format, not ad hoc Markdown notes. + +A good tour is a narrative for a specific reader: + +- what they are looking at +- why it matters +- what path they should follow next + +Only create `.tour` JSON files. Do not modify source code as part of this skill. + +## When to Use + +Use this skill when: + +- the user asks for a code tour, onboarding tour, architecture walkthrough, or PR tour +- the user says "explain how X works" and wants a reusable guided artifact +- the user wants a ramp-up path for a new engineer or reviewer +- the task is better served by a guided sequence than a flat summary + +Examples: + +- onboarding a new maintainer +- architecture tour for one service or package +- PR-review walk-through anchored to changed files +- RCA tour showing the failure path +- security review tour of trust boundaries and key checks + +## When NOT to Use + +| Instead of code-tour | Use | +| ------------------------------------------------------------- | ------------------------------------------- | +| A one-off explanation in chat is enough | answer directly | +| The user wants prose docs, not a `.tour` artifact | `documentation-lookup` or repo docs editing | +| The task is implementation or refactoring | do the implementation work | +| The task is broad codebase onboarding without a tour artifact | `codebase-onboarding` | + +## Workflow + +### 1. Discover + +Explore the repo before writing anything: + +- README and package/app entry points +- folder structure +- relevant config files +- the changed files if the tour is PR-focused + +Do not start writing steps before you understand the shape of the code. + +### 2. Infer the reader + +Decide the persona and depth from the request. 
+ +| Request shape | Persona | Suggested depth | +| -------------------------------- | ------------------- | --------------- | +| "onboarding", "new joiner" | `new-joiner` | 9-13 steps | +| "quick tour", "vibe check" | `vibecoder` | 5-8 steps | +| "architecture" | `architect` | 14-18 steps | +| "tour this PR" | `pr-reviewer` | 7-11 steps | +| "why did this break" | `rca-investigator` | 7-11 steps | +| "security review" | `security-reviewer` | 7-11 steps | +| "explain how this feature works" | `feature-explainer` | 7-11 steps | +| "debug this path" | `bug-fixer` | 7-11 steps | + +### 3. Read and verify anchors + +Every file path and line anchor must be real: + +- confirm the file exists +- confirm the line numbers are in range +- if using a selection, verify the exact block +- if the file is volatile, prefer a pattern-based anchor + +Never guess line numbers. + +### 4. Write the `.tour` + +Write to: + +```text +.tours/<persona>-<focus>.tour +``` + +Keep the path deterministic and readable. + +### 5. Validate + +Before finishing: + +- every referenced path exists +- every line or selection is valid +- the first step is anchored to a real file or directory +- the tour tells a coherent story rather than listing files + +## Step Types + +### Content + +Use sparingly, usually only for a closing step: + +```json +{ + "title": "Next Steps", + "description": "You can now trace the request path end to end." +} +``` + +Do not make the first step content-only. + +### Directory + +Use to orient the reader to a module: + +```json +{ + "directory": "src/services", + "title": "Service Layer", + "description": "The core orchestration logic lives here." +} +``` + +### File + line + +This is the default step type: + +```json +{ + "file": "src/auth/middleware.ts", + "line": 42, + "title": "Auth Gate", + "description": "Every protected request passes here first." 
+} +``` + +### Selection + +Use when one code block matters more than the whole file: + +```json +{ + "file": "src/core/pipeline.ts", + "selection": { + "start": { + "line": 15, + "character": 0 + }, + "end": { + "line": 34, + "character": 0 + } + }, + "title": "Request Pipeline", + "description": "This block wires validation, auth, and downstream execution." +} +``` + +### Pattern + +Use when exact lines may drift: + +```json +{ + "file": "src/app.ts", + "pattern": "export default class App", + "title": "Application Entry" +} +``` + +### URI + +Use for PRs, issues, or docs when helpful: + +```json +{ + "uri": "https://github.com/org/repo/pull/456", + "title": "The PR" +} +``` + +## Writing Rule: SMIG + +Each description should answer: + +- **Situation**: what the reader is looking at +- **Mechanism**: how it works +- **Implication**: why it matters for this persona +- **Gotcha**: what a smart reader might miss + +Keep descriptions compact, specific, and grounded in the actual code. + +## Narrative Shape + +Use this arc unless the task clearly needs something different: + +1. orientation +2. module map +3. core execution path +4. edge case or gotcha +5. closing / next move + +The tour should feel like a path, not an inventory. + +## Example + +```json +{ + "$schema": "https://aka.ms/codetour-schema", + "title": "API Service Tour", + "description": "Walkthrough of the request path for the payments service.", + "ref": "main", + "steps": [ + { + "directory": "src", + "title": "Source Root", + "description": "All runtime code for the service starts here." + }, + { + "file": "src/server.ts", + "line": 12, + "title": "Entry Point", + "description": "The server boots here and wires middleware before any route is reached." + }, + { + "file": "src/routes/payments.ts", + "line": 8, + "title": "Payment Routes", + "description": "Every payments request enters through this router before hitting service logic." 
+ }, + { + "title": "Next Steps", + "description": "You can now follow any payment request end to end with the main anchors in place." + } + ] +} +``` + +## Anti-Patterns + +| Anti-pattern | Fix | +| ------------------------------- | --------------------------------------------------- | +| Flat file listing | Tell a story with dependency between steps | +| Generic descriptions | Name the concrete code path or pattern | +| Guessed anchors | Verify every file and line first | +| Too many steps for a quick tour | Cut aggressively | +| First step is content-only | Anchor the first step to a real file or directory | +| Persona mismatch | Write for the actual reader, not a generic engineer | + +## Best Practices + +- keep step count proportional to repo size and persona depth +- use directory steps for orientation, file steps for substance +- for PR tours, cover changed files first +- for monorepos, scope to the relevant packages instead of touring everything +- close with what the reader can now do, not a recap + +## Related Skills + +- `codebase-onboarding` +- `coding-standards` +- `council` +- official upstream format: `microsoft/codetour` diff --git a/.claude/skills/codebase-onboarding/SKILL.md b/.claude/skills/codebase-onboarding/SKILL.md new file mode 100644 index 00000000..3670d3ff --- /dev/null +++ b/.claude/skills/codebase-onboarding/SKILL.md @@ -0,0 +1,235 @@ +--- +name: codebase-onboarding +description: Analyze an unfamiliar codebase and generate a structured onboarding guide with architecture map, key entry points, conventions, and a starter CLAUDE.md. Use when joining a new project or setting up Claude Code for the first time in a repo. +origin: ECC +--- + +# Codebase Onboarding + +Systematically analyze an unfamiliar codebase and produce a structured onboarding guide. Designed for developers joining a new project or setting up Claude Code in an existing repo for the first time. 
+ +## When to Use + +- First time opening a project with Claude Code +- Joining a new team or repository +- User asks "help me understand this codebase" +- User asks to generate a CLAUDE.md for a project +- User says "onboard me" or "walk me through this repo" + +## How It Works + +### Phase 1: Reconnaissance + +Gather raw signals about the project without reading every file. Run these checks in parallel: + +``` +1. Package manifest detection + → package.json, go.mod, Cargo.toml, pyproject.toml, pom.xml, build.gradle, + Gemfile, composer.json, mix.exs, pubspec.yaml + +2. Framework fingerprinting + → next.config.*, nuxt.config.*, angular.json, vite.config.*, + django settings, flask app factory, fastapi main, rails config + +3. Entry point identification + → main.*, index.*, app.*, server.*, cmd/, src/main/ + +4. Directory structure snapshot + → Top 2 levels of the directory tree, ignoring node_modules, vendor, + .git, dist, build, __pycache__, .next + +5. Config and tooling detection + → .eslintrc*, .prettierrc*, tsconfig.json, Makefile, Dockerfile, + docker-compose*, .github/workflows/, .env.example, CI configs + +6. 
Test structure detection + → tests/, test/, __tests__/, *_test.go, *.spec.ts, *.test.js, + pytest.ini, jest.config.*, vitest.config.* +``` + +### Phase 2: Architecture Mapping + +From the reconnaissance data, identify: + +**Tech Stack** + +- Language(s) and version constraints +- Framework(s) and major libraries +- Database(s) and ORMs +- Build tools and bundlers +- CI/CD platform + +**Architecture Pattern** + +- Monolith, monorepo, microservices, or serverless +- Frontend/backend split or full-stack +- API style: REST, GraphQL, gRPC, tRPC + +**Key Directories** Map the top-level directories to their purpose: + + + +``` +src/components/ → React UI components +src/api/ → API route handlers +src/lib/ → Shared utilities +src/db/ → Database models and migrations +tests/ → Test suites +scripts/ → Build and deployment scripts +``` + +**Data Flow** Trace one request from entry to response: + +- Where does a request enter? (router, handler, controller) +- How is it validated? (middleware, schemas, guards) +- Where is business logic? (services, models, use cases) +- How does it reach the database? (ORM, raw queries, repositories) + +### Phase 3: Convention Detection + +Identify patterns the codebase already follows: + +**Naming Conventions** + +- File naming: kebab-case, camelCase, PascalCase, snake_case +- Component/class naming patterns +- Test file naming: `*.test.ts`, `*.spec.ts`, `*_test.go` + +**Code Patterns** + +- Error handling style: try/catch, Result types, error codes +- Dependency injection or direct imports +- State management approach +- Async patterns: callbacks, promises, async/await, channels + +**Git Conventions** + +- Branch naming from recent branches +- Commit message style from recent commits +- PR workflow (squash, merge, rebase) +- If the repo has no commits yet or only a shallow history (e.g. 
`git clone --depth 1`), skip this section and note "Git history unavailable or too shallow to detect conventions" + +### Phase 4: Generate Onboarding Artifacts + +Produce two outputs: + +#### Output 1: Onboarding Guide + +```markdown +# Onboarding Guide: [Project Name] + +## Overview +[2-3 sentences: what this project does and who it serves] + +## Tech Stack + +| Layer | Technology | Version | +|-------|-----------|---------| +| Language | TypeScript | 5.x | +| Framework | Next.js | 14.x | +| Database | PostgreSQL | 16 | +| ORM | Prisma | 5.x | +| Testing | Jest + Playwright | - | + +## Architecture +[Diagram or description of how components connect] + +## Key Entry Points + +- **API routes**: `src/app/api/` — Next.js route handlers +- **UI pages**: `src/app/(dashboard)/` — authenticated pages +- **Database**: `prisma/schema.prisma` — data model source of truth +- **Config**: `next.config.ts` — build and runtime config + +## Directory Map +[Top-level directory → purpose mapping] + +## Request Lifecycle +[Trace one API request from entry to response] + +## Conventions +- [File naming pattern] +- [Error handling approach] +- [Testing patterns] +- [Git workflow] + +## Common Tasks + +- **Run dev server**: `npm run dev` +- **Run tests**: `npm test` +- **Run linter**: `npm run lint` +- **Database migrations**: `npx prisma migrate dev` +- **Build for production**: `npm run build` + +## Where to Look + +| I want to... | Look at... | +|--------------|-----------| +| Add an API endpoint | `src/app/api/` | +| Add a UI page | `src/app/(dashboard)/` | +| Add a database table | `prisma/schema.prisma` | +| Add a test | `tests/` matching the source path | +| Change build config | `next.config.ts` | +``` + +#### Output 2: Starter CLAUDE.md + +Generate or update a project-specific CLAUDE.md based on detected conventions. If `CLAUDE.md` already exists, read it first and enhance it — preserve existing project-specific instructions and clearly call out what was added or changed. 
+ +```markdown +# Project Instructions + +## Tech Stack +[Detected stack summary] + +## Code Style +- [Detected naming conventions] +- [Detected patterns to follow] + +## Testing +- Run tests: `[detected test command]` +- Test pattern: [detected test file convention] +- Coverage: [if configured, the coverage command] + +## Build & Run +- Dev: `[detected dev command]` +- Build: `[detected build command]` +- Lint: `[detected lint command]` + +## Project Structure +[Key directory → purpose map] + +## Conventions +- [Commit style if detectable] +- [PR workflow if detectable] +- [Error handling patterns] +``` + +## Best Practices + +1. **Don't read everything** — reconnaissance should use Glob and Grep, not Read on every file. Read selectively only for ambiguous signals. +2. **Verify, don't guess** — if a framework is detected from config but the actual code uses something different, trust the code. +3. **Respect existing CLAUDE.md** — if one already exists, enhance it rather than replacing it. Call out what's new vs existing. +4. **Stay concise** — the onboarding guide should be scannable in 2 minutes. Details belong in the code, not the guide. +5. **Flag unknowns** — if a convention can't be confidently detected, say so rather than guessing. "Could not determine test runner" is better than a wrong answer. 
+ +## Anti-Patterns to Avoid + +- Generating a CLAUDE.md that's longer than 100 lines — keep it focused +- Listing every dependency — highlight only the ones that shape how you write code +- Describing obvious directory names — `src/` doesn't need an explanation +- Copying the README — the onboarding guide adds structural insight the README lacks + +## Examples + +### Example 1: First time in a new repo + +**User**: "Onboard me to this codebase" **Action**: Run full 4-phase workflow → produce Onboarding Guide + Starter CLAUDE.md **Output**: Onboarding Guide printed directly to the conversation, plus a `CLAUDE.md` written to the project root + +### Example 2: Generate CLAUDE.md for existing project + +**User**: "Generate a CLAUDE.md for this project" **Action**: Run Phases 1-3, skip Onboarding Guide, produce only CLAUDE.md **Output**: Project-specific `CLAUDE.md` with detected conventions + +### Example 3: Enhance existing CLAUDE.md + +**User**: "Update the CLAUDE.md with current project conventions" **Action**: Read existing CLAUDE.md, run Phases 1-3, merge new findings **Output**: Updated `CLAUDE.md` with additions clearly marked diff --git a/.claude/skills/database-migrations/SKILL.md b/.claude/skills/database-migrations/SKILL.md new file mode 100644 index 00000000..3923d498 --- /dev/null +++ b/.claude/skills/database-migrations/SKILL.md @@ -0,0 +1,429 @@ +--- +name: database-migrations +description: Database migration best practices for schema changes, data migrations, rollbacks, and zero-downtime deployments across PostgreSQL, MySQL, and common ORMs (Prisma, Drizzle, Kysely, Django, TypeORM, golang-migrate). +origin: ECC +--- + +# Database Migration Patterns + +Safe, reversible database schema changes for production systems. 
+ +## When to Activate + +- Creating or altering database tables +- Adding/removing columns or indexes +- Running data migrations (backfill, transform) +- Planning zero-downtime schema changes +- Setting up migration tooling for a new project + +## Core Principles + +1. **Every change is a migration** — never alter production databases manually +2. **Migrations are forward-only in production** — rollbacks use new forward migrations +3. **Schema and data migrations are separate** — never mix DDL and DML in one migration +4. **Test migrations against production-sized data** — a migration that works on 100 rows may lock on 10M +5. **Migrations are immutable once deployed** — never edit a migration that has run in production + +## Migration Safety Checklist + +Before applying any migration: + +- [ ] Migration has both UP and DOWN (or is explicitly marked irreversible) +- [ ] No full table locks on large tables (use concurrent operations) +- [ ] New columns have defaults or are nullable (never add NOT NULL without default) +- [ ] Indexes created concurrently (not inline with CREATE TABLE for existing tables) +- [ ] Data backfill is a separate migration from schema change +- [ ] Tested against a copy of production data +- [ ] Rollback plan documented + +## PostgreSQL Patterns + +### Adding a Column Safely + +```sql +-- GOOD: Nullable column, no table rewrite (only a brief metadata lock) +ALTER TABLE users ADD COLUMN avatar_url TEXT; + +-- GOOD: Column with default (Postgres 11+ is instant, no rewrite) +ALTER TABLE users ADD COLUMN is_active BOOLEAN NOT NULL DEFAULT true; + +-- BAD: NOT NULL without default on existing table (fails if the table has rows) +ALTER TABLE users ADD COLUMN role TEXT NOT NULL; +-- Existing rows would hold NULL and violate the constraint: add nullable, backfill, then SET NOT NULL +``` + +### Adding an Index Without Downtime + +```sql +-- BAD: Blocks writes on large tables +CREATE INDEX idx_users_email ON users (email); + +-- GOOD: Non-blocking, allows concurrent writes +CREATE INDEX CONCURRENTLY idx_users_email ON users (email); + +-- Note: 
CONCURRENTLY cannot run inside a transaction block +-- Most migration tools need special handling for this +``` + +### Renaming a Column (Zero-Downtime) + +Never rename directly in production. Use the expand-contract pattern: + +```sql +-- Step 1: Add new column (migration 001) +ALTER TABLE users ADD COLUMN display_name TEXT; + +-- Step 2: Backfill data (migration 002, data migration) +UPDATE users SET display_name = username WHERE display_name IS NULL; + +-- Step 3: Update application code to read/write both columns +-- Deploy application changes + +-- Step 4: Stop writing to old column, drop it (migration 003) +ALTER TABLE users DROP COLUMN username; +``` + +### Removing a Column Safely + +```sql +-- Step 1: Remove all application references to the column +-- Step 2: Deploy application without the column reference +-- Step 3: Drop column in next migration +ALTER TABLE orders DROP COLUMN legacy_status; + +-- For Django: use SeparateDatabaseAndState to remove from model +-- without generating DROP COLUMN (then drop in next migration) +``` + +### Large Data Migrations + +```sql +-- BAD: Updates all rows in one transaction (locks table) +UPDATE users SET normalized_email = LOWER(email); + +-- GOOD: Batch update with progress +DO $$ +DECLARE + batch_size INT := 10000; + rows_updated INT; +BEGIN + LOOP + UPDATE users + SET normalized_email = LOWER(email) + WHERE id IN ( + SELECT id FROM users + WHERE normalized_email IS NULL + LIMIT batch_size + FOR UPDATE SKIP LOCKED + ); + GET DIAGNOSTICS rows_updated = ROW_COUNT; + RAISE NOTICE 'Updated % rows', rows_updated; + EXIT WHEN rows_updated = 0; + COMMIT; + END LOOP; +END $$; +``` + +## Prisma (TypeScript/Node.js) + +### Workflow + +```bash +# Create migration from schema changes +npx prisma migrate dev --name add_user_avatar + +# Apply pending migrations in production +npx prisma migrate deploy + +# Reset database (dev only) +npx prisma migrate reset + +# Generate client after schema changes +npx prisma generate +``` + 
+### Schema Example + +```prisma +model User { + id String @id @default(cuid()) + email String @unique + name String? + avatarUrl String? @map("avatar_url") + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + orders Order[] + + @@map("users") + @@index([email]) +} +``` + +### Custom SQL Migration + +For operations Prisma cannot express (concurrent indexes, data backfills): + +```bash +# Create empty migration, then edit the SQL manually +npx prisma migrate dev --create-only --name add_email_index +``` + +```sql +-- migrations/20240115_add_email_index/migration.sql +-- Prisma cannot generate CONCURRENTLY, so we write it manually +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_users_email ON users (email); +``` + +## Drizzle (TypeScript/Node.js) + +### Workflow + +```bash +# Generate migration from schema changes +npx drizzle-kit generate + +# Apply migrations +npx drizzle-kit migrate + +# Push schema directly (dev only, no migration file) +npx drizzle-kit push +``` + +### Schema Example + +```typescript +import { pgTable, text, timestamp, uuid, boolean } from "drizzle-orm/pg-core"; + +export const users = pgTable("users", { + id: uuid("id").primaryKey().defaultRandom(), + email: text("email").notNull().unique(), + name: text("name"), + isActive: boolean("is_active").notNull().default(true), + createdAt: timestamp("created_at").notNull().defaultNow(), + updatedAt: timestamp("updated_at").notNull().defaultNow(), +}); +``` + +## Kysely (TypeScript/Node.js) + +### Workflow (kysely-ctl) + +```bash +# Initialize config file (kysely.config.ts) +kysely init + +# Create a new migration file +kysely migrate make add_user_avatar + +# Apply all pending migrations +kysely migrate latest + +# Rollback last migration +kysely migrate down + +# Show migration status +kysely migrate list +``` + +### Migration File + +```typescript +// migrations/2024_01_15_001_create_user_profile.ts +import { type Kysely, sql } from 'kysely' + +// 
IMPORTANT: Always use Kysely<any>, not your typed DB interface. +// Migrations are frozen in time and must not depend on current schema types. +export async function up(db: Kysely<any>): Promise<void> { + await db.schema + .createTable('user_profile') + .addColumn('id', 'serial', (col) => col.primaryKey()) + .addColumn('email', 'varchar(255)', (col) => col.notNull().unique()) + .addColumn('avatar_url', 'text') + .addColumn('created_at', 'timestamp', (col) => + col.defaultTo(sql`now()`).notNull() + ) + .execute() + + await db.schema + .createIndex('idx_user_profile_avatar') + .on('user_profile') + .column('avatar_url') + .execute() +} + +export async function down(db: Kysely<any>): Promise<void> { + await db.schema.dropTable('user_profile').execute() +} +``` + +### Programmatic Migrator + +```typescript +import { Migrator, FileMigrationProvider } from 'kysely' +import { promises as fs } from 'fs' +import * as path from 'path' +// ESM only — CJS can use __dirname directly +import { fileURLToPath } from 'url' +const migrationFolder = path.join( + path.dirname(fileURLToPath(import.meta.url)), + './migrations', +) + +// `db` is your Kysely database instance +const migrator = new Migrator({ + db, + provider: new FileMigrationProvider({ + fs, + path, + migrationFolder, + }), + // WARNING: Only enable in development. Disables timestamp-ordering + // validation, which can cause schema drift between environments. 
+ // allowUnorderedMigrations: true, +}) + +const { error, results } = await migrator.migrateToLatest() + +results?.forEach((it) => { + if (it.status === 'Success') { + console.log(`migration "${it.migrationName}" executed successfully`) + } else if (it.status === 'Error') { + console.error(`failed to execute migration "${it.migrationName}"`) + } +}) + +if (error) { + console.error('migration failed', error) + process.exit(1) +} +``` + +## Django (Python) + +### Workflow + +```bash +# Generate migration from model changes +python manage.py makemigrations + +# Apply migrations +python manage.py migrate + +# Show migration status +python manage.py showmigrations + +# Generate empty migration for custom SQL +python manage.py makemigrations --empty app_name -n description +``` + +### Data Migration + +```python +from django.db import migrations + +def backfill_display_names(apps, schema_editor): + User = apps.get_model("accounts", "User") + batch_size = 5000 + users = User.objects.filter(display_name="") + while users.exists(): + batch = list(users[:batch_size]) + for user in batch: + user.display_name = user.username + User.objects.bulk_update(batch, ["display_name"], batch_size=batch_size) + +def reverse_backfill(apps, schema_editor): + pass # Data migration, no reverse needed + +class Migration(migrations.Migration): + dependencies = [("accounts", "0015_add_display_name")] + + operations = [ + migrations.RunPython(backfill_display_names, reverse_backfill), + ] +``` + +### SeparateDatabaseAndState + +Remove a column from the Django model without dropping it from the database immediately: + +```python +class Migration(migrations.Migration): + operations = [ + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.RemoveField(model_name="user", name="legacy_field"), + ], + database_operations=[], # Don't touch the DB yet + ), + ] +``` + +## golang-migrate (Go) + +### Workflow + +```bash +# Create migration pair +migrate create -ext sql -dir migrations 
-seq add_user_avatar + +# Apply all pending migrations +migrate -path migrations -database "$DATABASE_URL" up + +# Rollback last migration +migrate -path migrations -database "$DATABASE_URL" down 1 + +# Force version (fix dirty state) +migrate -path migrations -database "$DATABASE_URL" force VERSION +``` + +### Migration Files + +```sql +-- migrations/000003_add_user_avatar.up.sql +ALTER TABLE users ADD COLUMN avatar_url TEXT; +CREATE INDEX CONCURRENTLY idx_users_avatar ON users (avatar_url) WHERE avatar_url IS NOT NULL; + +-- migrations/000003_add_user_avatar.down.sql +DROP INDEX IF EXISTS idx_users_avatar; +ALTER TABLE users DROP COLUMN IF EXISTS avatar_url; +``` + +## Zero-Downtime Migration Strategy + +For critical production changes, follow the expand-contract pattern: + +``` +Phase 1: EXPAND + - Add new column/table (nullable or with default) + - Deploy: app writes to BOTH old and new + - Backfill existing data + +Phase 2: MIGRATE + - Deploy: app reads from NEW, writes to BOTH + - Verify data consistency + +Phase 3: CONTRACT + - Deploy: app only uses NEW + - Drop old column/table in separate migration +``` + +### Timeline Example + +``` +Day 1: Migration adds new_status column (nullable) +Day 1: Deploy app v2 — writes to both status and new_status +Day 2: Run backfill migration for existing rows +Day 3: Deploy app v3 — reads from new_status only +Day 7: Migration drops old status column +``` + +## Anti-Patterns + +| Anti-Pattern | Why It Fails | Better Approach | +| ------------------------------------ | ------------------------------------ | ------------------------------------------- | +| Manual SQL in production | No audit trail, unrepeatable | Always use migration files | +| Editing deployed migrations | Causes drift between environments | Create new migration instead | +| NOT NULL without default | Locks table, rewrites all rows | Add nullable, backfill, then add constraint | +| Inline index on large table | Blocks writes during build | CREATE INDEX 
CONCURRENTLY | +| Schema + data in one migration | Hard to rollback, long transactions | Separate migrations | +| Dropping column before removing code | Application errors on missing column | Remove code first, drop column next deploy | diff --git a/.claude/skills/deployment-patterns/SKILL.md b/.claude/skills/deployment-patterns/SKILL.md new file mode 100644 index 00000000..1a73c9c5 --- /dev/null +++ b/.claude/skills/deployment-patterns/SKILL.md @@ -0,0 +1,426 @@ +--- +name: deployment-patterns +description: Deployment workflows, CI/CD pipeline patterns, Docker containerization, health checks, rollback strategies, and production readiness checklists for web applications. +origin: ECC +--- + +# Deployment Patterns + +Production deployment workflows and CI/CD best practices. + +## When to Activate + +- Setting up CI/CD pipelines +- Dockerizing an application +- Planning deployment strategy (blue-green, canary, rolling) +- Implementing health checks and readiness probes +- Preparing for a production release +- Configuring environment-specific settings + +## Deployment Strategies + +### Rolling Deployment (Default) + +Replace instances gradually — old and new versions run simultaneously during rollout. + +``` +Instance 1: v1 → v2 (update first) +Instance 2: v1 (still running v1) +Instance 3: v1 (still running v1) + +Instance 1: v2 +Instance 2: v1 → v2 (update second) +Instance 3: v1 + +Instance 1: v2 +Instance 2: v2 +Instance 3: v1 → v2 (update last) +``` + +**Pros:** Zero downtime, gradual rollout **Cons:** Two versions run simultaneously — requires backward-compatible changes **Use when:** Standard deployments, backward-compatible changes + +### Blue-Green Deployment + +Run two identical environments. Switch traffic atomically. 
+ +``` +Blue (v1) ← traffic +Green (v2) idle, running new version + +# After verification: +Blue (v1) idle (becomes standby) +Green (v2) ← traffic +``` + +**Pros:** Instant rollback (switch back to blue), clean cutover **Cons:** Requires 2x infrastructure during deployment **Use when:** Critical services, zero-tolerance for issues + +### Canary Deployment + +Route a small percentage of traffic to the new version first. + +``` +v1: 95% of traffic +v2: 5% of traffic (canary) + +# If metrics look good: +v1: 50% of traffic +v2: 50% of traffic + +# Final: +v2: 100% of traffic +``` + +**Pros:** Catches issues with real traffic before full rollout **Cons:** Requires traffic splitting infrastructure, monitoring **Use when:** High-traffic services, risky changes, feature flags + +## Docker + +### Multi-Stage Dockerfile (Node.js) + +```dockerfile +# Stage 1: Install dependencies +FROM node:22-alpine AS deps +WORKDIR /app +COPY package.json package-lock.json ./ +RUN npm ci --production=false + +# Stage 2: Build +FROM node:22-alpine AS builder +WORKDIR /app +COPY --from=deps /app/node_modules ./node_modules +COPY . . +RUN npm run build +RUN npm prune --production + +# Stage 3: Production image +FROM node:22-alpine AS runner +WORKDIR /app + +RUN addgroup -g 1001 -S appgroup && adduser -S appuser -u 1001 +USER appuser + +COPY --from=builder --chown=appuser:appgroup /app/node_modules ./node_modules +COPY --from=builder --chown=appuser:appgroup /app/dist ./dist +COPY --from=builder --chown=appuser:appgroup /app/package.json ./ + +ENV NODE_ENV=production +EXPOSE 3000 + +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD wget --no-verbose --tries=1 --spider http://localhost:3000/health || exit 1 + +CMD ["node", "dist/server.js"] +``` + +### Multi-Stage Dockerfile (Go) + +```dockerfile +FROM golang:1.22-alpine AS builder +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download +COPY . . 
+RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /server ./cmd/server + +FROM alpine:3.19 AS runner +RUN apk --no-cache add ca-certificates +RUN adduser -D -u 1001 appuser +USER appuser + +COPY --from=builder /server /server + +EXPOSE 8080 +HEALTHCHECK --interval=30s --timeout=3s CMD wget -qO- http://localhost:8080/health || exit 1 +CMD ["/server"] +``` + +### Multi-Stage Dockerfile (Python/Django) + +```dockerfile +FROM python:3.12-slim AS builder +WORKDIR /app +RUN pip install --no-cache-dir uv +COPY requirements.txt . +RUN uv pip install --system --no-cache -r requirements.txt + +FROM python:3.12-slim AS runner +WORKDIR /app + +RUN useradd -r -u 1001 appuser +USER appuser + +COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin +COPY . . + +ENV PYTHONUNBUFFERED=1 +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=3s CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health/')" || exit 1 +CMD ["gunicorn", "config.wsgi:application", "--bind", "0.0.0.0:8000", "--workers", "4"] +``` + +### Docker Best Practices + +``` +# GOOD practices +- Use specific version tags (node:22-alpine, not node:latest) +- Multi-stage builds to minimize image size +- Run as non-root user +- Copy dependency files first (layer caching) +- Use .dockerignore to exclude node_modules, .git, tests +- Add HEALTHCHECK instruction +- Set resource limits in docker-compose or k8s + +# BAD practices +- Running as root +- Using :latest tags +- Copying entire repo in one COPY layer +- Installing dev dependencies in production image +- Storing secrets in image (use env vars or secrets manager) +``` + +## CI/CD Pipeline + +### GitHub Actions (Standard Pipeline) + +```yaml +name: CI/CD + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + 
with: + node-version: 22 + cache: npm + - run: npm ci + - run: npm run lint + - run: npm run typecheck + - run: npm test -- --coverage + - uses: actions/upload-artifact@v4 + if: always() + with: + name: coverage + path: coverage/ + + build: + needs: test + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/build-push-action@v5 + with: + push: true + tags: ghcr.io/${{ github.repository }}:${{ github.sha }} + cache-from: type=gha + cache-to: type=gha,mode=max + + deploy: + needs: build + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' + environment: production + steps: + - name: Deploy to production + run: | + # Platform-specific deployment command + # Railway: railway up + # Vercel: vercel --prod + # K8s: kubectl set image deployment/app app=ghcr.io/${{ github.repository }}:${{ github.sha }} + echo "Deploying ${{ github.sha }}" +``` + +### Pipeline Stages + +``` +PR opened: + lint → typecheck → unit tests → integration tests → preview deploy + +Merged to main: + lint → typecheck → unit tests → integration tests → build image → deploy staging → smoke tests → deploy production +``` + +## Health Checks + +### Health Check Endpoint + +```typescript +// Simple health check +app.get("/health", (req, res) => { + res.status(200).json({ status: "ok" }); +}); + +// Detailed health check (for internal monitoring) +app.get("/health/detailed", async (req, res) => { + const checks = { + database: await checkDatabase(), + redis: await checkRedis(), + externalApi: await checkExternalApi(), + }; + + const allHealthy = Object.values(checks).every(c => c.status === "ok"); + + res.status(allHealthy ? 200 : 503).json({ + status: allHealthy ? 
"ok" : "degraded", + timestamp: new Date().toISOString(), + version: process.env.APP_VERSION || "unknown", + uptime: process.uptime(), + checks, + }); +}); + +async function checkDatabase(): Promise { + try { + await db.query("SELECT 1"); + return { status: "ok", latency_ms: 2 }; + } catch (err) { + return { status: "error", message: "Database unreachable" }; + } +} +``` + +### Kubernetes Probes + +```yaml +livenessProbe: + httpGet: + path: /health + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 30 + failureThreshold: 3 + +readinessProbe: + httpGet: + path: /health + port: 3000 + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 2 + +startupProbe: + httpGet: + path: /health + port: 3000 + initialDelaySeconds: 0 + periodSeconds: 5 + failureThreshold: 30 # 30 * 5s = 150s max startup time +``` + +## Environment Configuration + +### Twelve-Factor App Pattern + +```bash +# All config via environment variables — never in code +DATABASE_URL=postgres://user:pass@host:5432/db +REDIS_URL=redis://host:6379/0 +API_KEY=${API_KEY} # injected by secrets manager +LOG_LEVEL=info +PORT=3000 + +# Environment-specific behavior +NODE_ENV=production # or staging, development +APP_ENV=production # explicit app environment +``` + +### Configuration Validation + +```typescript +import { z } from "zod"; + +const envSchema = z.object({ + NODE_ENV: z.enum(["development", "staging", "production"]), + PORT: z.coerce.number().default(3000), + DATABASE_URL: z.string().url(), + REDIS_URL: z.string().url(), + JWT_SECRET: z.string().min(32), + LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"), +}); + +// Validate at startup — fail fast if config is wrong +export const env = envSchema.parse(process.env); +``` + +## Rollback Strategy + +### Instant Rollback + +```bash +# Docker/Kubernetes: point to previous image +kubectl rollout undo deployment/app + +# Vercel: promote previous deployment +vercel rollback + +# Railway: redeploy previous commit +railway up 
--commit <previous-sha> + +# Database: rollback migration (if reversible) +npx prisma migrate resolve --rolled-back <migration-name> +``` + +### Rollback Checklist + +- [ ] Previous image/artifact is available and tagged +- [ ] Database migrations are backward-compatible (no destructive changes) +- [ ] Feature flags can disable new features without deploy +- [ ] Monitoring alerts configured for error rate spikes +- [ ] Rollback tested in staging before production release + +## Production Readiness Checklist + +Before any production deployment: + +### Application + +- [ ] All tests pass (unit, integration, E2E) +- [ ] No hardcoded secrets in code or config files +- [ ] Error handling covers all edge cases +- [ ] Logging is structured (JSON) and does not contain PII +- [ ] Health check endpoint returns meaningful status + +### Infrastructure + +- [ ] Docker image builds reproducibly (pinned versions) +- [ ] Environment variables documented and validated at startup +- [ ] Resource limits set (CPU, memory) +- [ ] Horizontal scaling configured (min/max instances) +- [ ] SSL/TLS enabled on all endpoints + +### Monitoring + +- [ ] Application metrics exported (request rate, latency, errors) +- [ ] Alerts configured for error rate > threshold +- [ ] Log aggregation set up (structured logs, searchable) +- [ ] Uptime monitoring on health endpoint + +### Security + +- [ ] Dependencies scanned for CVEs +- [ ] CORS configured for allowed origins only +- [ ] Rate limiting enabled on public endpoints +- [ ] Authentication and authorization verified +- [ ] Security headers set (CSP, HSTS, X-Frame-Options) + +### Operations + +- [ ] Rollback plan documented and tested +- [ ] Database migration tested against production-sized data +- [ ] Runbook for common failure scenarios +- [ ] On-call rotation and escalation path defined diff --git a/.claude/skills/docker-patterns/SKILL.md b/.claude/skills/docker-patterns/SKILL.md new file mode 100644 index 00000000..1160d37a --- /dev/null +++ 
b/.claude/skills/docker-patterns/SKILL.md @@ -0,0 +1,365 @@ +--- +name: docker-patterns +description: Docker and Docker Compose patterns for local development, container security, networking, volume strategies, and multi-service orchestration. +origin: ECC +--- + +# Docker Patterns + +Docker and Docker Compose best practices for containerized development. + +## When to Activate + +- Setting up Docker Compose for local development +- Designing multi-container architectures +- Troubleshooting container networking or volume issues +- Reviewing Dockerfiles for security and size +- Migrating from local dev to containerized workflow + +## Docker Compose for Local Development + +### Standard Web App Stack + +```yaml +# docker-compose.yml +services: + app: + build: + context: . + target: dev # Use dev stage of multi-stage Dockerfile + ports: + - 3000:3000 + volumes: + - .:/app # Bind mount for hot reload + - /app/node_modules # Anonymous volume -- preserves container deps + environment: + - DATABASE_URL=postgres://postgres:postgres@db:5432/app_dev + - REDIS_URL=redis://redis:6379/0 + - NODE_ENV=development + depends_on: + db: + condition: service_healthy + redis: + condition: service_started + command: npm run dev + + db: + image: postgres:16-alpine + ports: + - 5432:5432 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: app_dev + volumes: + - pgdata:/var/lib/postgresql/data + - ./scripts/init-db.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: [CMD-SHELL, pg_isready -U postgres] + interval: 5s + timeout: 3s + retries: 5 + + redis: + image: redis:7-alpine + ports: + - 6379:6379 + volumes: + - redisdata:/data + + mailpit: # Local email testing + image: axllent/mailpit + ports: + - 8025:8025 # Web UI + - 1025:1025 # SMTP + +volumes: + pgdata: + redisdata: +``` + +### Development vs Production Dockerfile + +```dockerfile +# Stage: dependencies +FROM node:22-alpine AS deps +WORKDIR /app +COPY package.json package-lock.json ./ 
+RUN npm ci + +# Stage: dev (hot reload, debug tools) +FROM node:22-alpine AS dev +WORKDIR /app +COPY --from=deps /app/node_modules ./node_modules +COPY . . +EXPOSE 3000 +CMD ["npm", "run", "dev"] + +# Stage: build +FROM node:22-alpine AS build +WORKDIR /app +COPY --from=deps /app/node_modules ./node_modules +COPY . . +RUN npm run build && npm prune --production + +# Stage: production (minimal image) +FROM node:22-alpine AS production +WORKDIR /app +RUN addgroup -g 1001 -S appgroup && adduser -S appuser -u 1001 +USER appuser +COPY --from=build --chown=appuser:appgroup /app/dist ./dist +COPY --from=build --chown=appuser:appgroup /app/node_modules ./node_modules +COPY --from=build --chown=appuser:appgroup /app/package.json ./ +ENV NODE_ENV=production +EXPOSE 3000 +HEALTHCHECK --interval=30s --timeout=3s CMD wget -qO- http://localhost:3000/health || exit 1 +CMD ["node", "dist/server.js"] +``` + +### Override Files + +```yaml +# docker-compose.override.yml (auto-loaded, dev-only settings) +services: + app: + environment: + - DEBUG=app:* + - LOG_LEVEL=debug + ports: + - "9229:9229" # Node.js debugger + +# docker-compose.prod.yml (explicit for production) +services: + app: + build: + target: production + restart: always + deploy: + resources: + limits: + cpus: "1.0" + memory: 512M +``` + +```bash +# Development (auto-loads override) +docker compose up + +# Production +docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d +``` + +## Networking + +### Service Discovery + +Services in the same Compose network resolve by service name: + +``` +# From "app" container: +postgres://postgres:postgres@db:5432/app_dev # "db" resolves to the db container +redis://redis:6379/0 # "redis" resolves to the redis container +``` + +### Custom Networks + +```yaml +services: + frontend: + networks: + - frontend-net + + api: + networks: + - frontend-net + - backend-net + + db: + networks: + - backend-net # Only reachable from api, not frontend + +networks: + frontend-net: + 
backend-net: +``` + +### Exposing Only What's Needed + +```yaml +services: + db: + ports: + - 127.0.0.1:5432:5432 # Only accessible from host, not network + # Omit ports entirely in production -- accessible only within Docker network +``` + +## Volume Strategies + +```yaml +volumes: + # Named volume: persists across container restarts, managed by Docker + pgdata: + + # Bind mount: maps host directory into container (for development) + # - ./src:/app/src + + # Anonymous volume: preserves container-generated content from bind mount override + # - /app/node_modules +``` + +### Common Patterns + +```yaml +services: + app: + volumes: + - .:/app # Source code (bind mount for hot reload) + - /app/node_modules # Protect container's node_modules from host + - /app/.next # Protect build cache + + db: + volumes: + - pgdata:/var/lib/postgresql/data # Persistent data + - ./scripts/init.sql:/docker-entrypoint-initdb.d/init.sql # Init scripts +``` + +## Container Security + +### Dockerfile Hardening + +```dockerfile +# 1. Use specific tags (never :latest) +FROM node:22.12-alpine3.20 + +# 2. Run as non-root +RUN addgroup -g 1001 -S app && adduser -S app -u 1001 +USER app + +# 3. Drop capabilities (in compose) +# 4. Read-only root filesystem where possible +# 5. 
No secrets in image layers +``` + +### Compose Security + +```yaml +services: + app: + security_opt: + - no-new-privileges:true + read_only: true + tmpfs: + - /tmp + - /app/.cache + cap_drop: + - ALL + cap_add: + - NET_BIND_SERVICE # Only if binding to ports < 1024 +``` + +### Secret Management + +```yaml +# GOOD: Use environment variables (injected at runtime) +services: + app: + env_file: + - .env # Never commit .env to git + environment: + - API_KEY # Inherits from host environment + +# GOOD: Docker secrets (Swarm mode) +secrets: + db_password: + file: ./secrets/db_password.txt + +services: + db: + secrets: + - db_password + +# BAD: Hardcoded in image +# ENV API_KEY=sk-proj-xxxxx # NEVER DO THIS +``` + +## .dockerignore + +``` +node_modules +.git +.env +.env.* +dist +coverage +*.log +.next +.cache +docker-compose*.yml +Dockerfile* +README.md +tests/ +``` + +## Debugging + +### Common Commands + +```bash +# View logs +docker compose logs -f app # Follow app logs +docker compose logs --tail=50 db # Last 50 lines from db + +# Execute commands in running container +docker compose exec app sh # Shell into app +docker compose exec db psql -U postgres # Connect to postgres + +# Inspect +docker compose ps # Running services +docker compose top # Processes in each container +docker stats # Resource usage + +# Rebuild +docker compose up --build # Rebuild images +docker compose build --no-cache app # Force full rebuild + +# Clean up +docker compose down # Stop and remove containers +docker compose down -v # Also remove volumes (DESTRUCTIVE) +docker system prune # Remove unused images/containers +``` + +### Debugging Network Issues + +```bash +# Check DNS resolution inside container +docker compose exec app nslookup db + +# Check connectivity +docker compose exec app wget -qO- http://api:3000/health + +# Inspect network +docker network ls +docker network inspect <project>_default +``` + +## Anti-Patterns + +``` +# BAD: Using docker compose in production without orchestration +# Use 
Kubernetes, ECS, or Docker Swarm for production multi-container workloads + +# BAD: Storing data in containers without volumes +# Containers are ephemeral -- all data lost on restart without volumes + +# BAD: Running as root +# Always create and use a non-root user + +# BAD: Using :latest tag +# Pin to specific versions for reproducible builds + +# BAD: One giant container with all services +# Separate concerns: one process per container + +# BAD: Putting secrets in docker-compose.yml +# Use .env files (gitignored) or Docker secrets +``` diff --git a/.claude/skills/rust-patterns/SKILL.md b/.claude/skills/rust-patterns/SKILL.md new file mode 100644 index 00000000..6ca79c12 --- /dev/null +++ b/.claude/skills/rust-patterns/SKILL.md @@ -0,0 +1,499 @@ +--- +name: rust-patterns +description: Idiomatic Rust patterns, ownership, error handling, traits, concurrency, and best practices for building safe, performant applications. +origin: ECC +--- + +# Rust Development Patterns + +Idiomatic Rust patterns and best practices for building safe, performant, and maintainable applications. + +## When to Use + +- Writing new Rust code +- Reviewing Rust code +- Refactoring existing Rust code +- Designing crate structure and module layout + +## How It Works + +This skill enforces idiomatic Rust conventions across six key areas: ownership and borrowing to prevent data races at compile time, `Result`/`?` error propagation with `thiserror` for libraries and `anyhow` for applications, enums and exhaustive pattern matching to make illegal states unrepresentable, traits and generics for zero-cost abstraction, safe concurrency via `Arc<Mutex<T>>`, channels, and async/await, and minimal `pub` surfaces organized by domain. + +## Core Principles + +### 1. Ownership and Borrowing + +Rust's ownership system prevents data races and memory bugs at compile time. 
+ +```rust +// Good: Pass references when you don't need ownership +fn process(data: &[u8]) -> usize { + data.len() +} + +// Good: Take ownership only when you need to store or consume +fn store(data: Vec<u8>) -> Record { + Record { payload: data } +} + +// Bad: Cloning unnecessarily to avoid borrow checker +fn process_bad(data: &Vec<u8>) -> usize { + let cloned = data.clone(); // Wasteful — just borrow + cloned.len() +} +``` + +### Use `Cow` for Flexible Ownership + +```rust +use std::borrow::Cow; + +fn normalize(input: &str) -> Cow<'_, str> { + if input.contains(' ') { + Cow::Owned(input.replace(' ', "_")) + } else { + Cow::Borrowed(input) // Zero-cost when no mutation needed + } +} +``` + +## Error Handling + +### Use `Result` and `?` — Never `unwrap()` in Production + +```rust +// Good: Propagate errors with context +use anyhow::{Context, Result}; + +fn load_config(path: &str) -> Result<Config> { + let content = std::fs::read_to_string(path) + .with_context(|| format!("failed to read config from {path}"))?; + let config: Config = toml::from_str(&content) + .with_context(|| format!("failed to parse config from {path}"))?; + Ok(config) +} + +// Bad: Panics on error +fn load_config_bad(path: &str) -> Config { + let content = std::fs::read_to_string(path).unwrap(); // Panics! 
+ toml::from_str(&content).unwrap() +} +``` + +### Library Errors with `thiserror`, Application Errors with `anyhow` + +```rust +// Library code: structured, typed errors +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum StorageError { + #[error("record not found: {id}")] + NotFound { id: String }, + #[error("connection failed")] + Connection(#[from] std::io::Error), + #[error("invalid data: {0}")] + InvalidData(String), +} + +// Application code: flexible error handling +use anyhow::{bail, Result}; + +fn run() -> Result<()> { + let config = load_config("app.toml")?; + if config.workers == 0 { + bail!("worker count must be > 0"); + } + Ok(()) +} +``` + +### `Option` Combinators Over Nested Matching + +```rust +// Good: Combinator chain +fn find_user_email(users: &[User], id: u64) -> Option<String> { + users.iter() + .find(|u| u.id == id) + .map(|u| u.email.clone()) +} + +// Bad: Deeply nested matching +fn find_user_email_bad(users: &[User], id: u64) -> Option<String> { + match users.iter().find(|u| u.id == id) { + Some(user) => match &user.email { + email => Some(email.clone()), + }, + None => None, + } +} +``` + +## Enums and Pattern Matching + +### Model States as Enums + +```rust +// Good: Impossible states are unrepresentable +enum ConnectionState { + Disconnected, + Connecting { attempt: u32 }, + Connected { session_id: String }, + Failed { reason: String, retries: u32 }, +} + +fn handle(state: &ConnectionState) { + match state { + ConnectionState::Disconnected => connect(), + ConnectionState::Connecting { attempt } if *attempt > 3 => abort(), + ConnectionState::Connecting { .. } => wait(), + ConnectionState::Connected { session_id } => use_session(session_id), + ConnectionState::Failed { retries, .. } if *retries < 5 => retry(), + ConnectionState::Failed { reason, .. 
} => log_failure(reason), + } +} +``` + +### Exhaustive Matching — No Catch-All for Business Logic + +```rust +// Good: Handle every variant explicitly +match command { + Command::Start => start_service(), + Command::Stop => stop_service(), + Command::Restart => restart_service(), + // Adding a new variant forces handling here +} + +// Bad: Wildcard hides new variants +match command { + Command::Start => start_service(), + _ => {} // Silently ignores Stop, Restart, and future variants +} +``` + +## Traits and Generics + +### Accept Generics, Return Concrete Types + +```rust +// Good: Generic input, concrete output +fn read_all(reader: &mut impl Read) -> std::io::Result> { + let mut buf = Vec::new(); + reader.read_to_end(&mut buf)?; + Ok(buf) +} + +// Good: Trait bounds for multiple constraints +fn process(item: T) -> String { + format!("processed: {item}") +} +``` + +### Trait Objects for Dynamic Dispatch + +```rust +// Use when you need heterogeneous collections or plugin systems +trait Handler: Send + Sync { + fn handle(&self, request: &Request) -> Response; +} + +struct Router { + handlers: Vec>, +} + +// Use generics when you need performance (monomorphization) +fn fast_process(handler: &H, request: &Request) -> Response { + handler.handle(request) +} +``` + +### Newtype Pattern for Type Safety + +```rust +// Good: Distinct types prevent mixing up arguments +struct UserId(u64); +struct OrderId(u64); + +fn get_order(user: UserId, order: OrderId) -> Result { + // Can't accidentally swap user and order IDs + todo!() +} + +// Bad: Easy to swap arguments +fn get_order_bad(user_id: u64, order_id: u64) -> Result { + todo!() +} +``` + +## Structs and Data Modeling + +### Builder Pattern for Complex Construction + +```rust +struct ServerConfig { + host: String, + port: u16, + max_connections: usize, +} + +impl ServerConfig { + fn builder(host: impl Into, port: u16) -> ServerConfigBuilder { + ServerConfigBuilder { host: host.into(), port, max_connections: 100 } + } +} + 
+struct ServerConfigBuilder { host: String, port: u16, max_connections: usize } + +impl ServerConfigBuilder { + fn max_connections(mut self, n: usize) -> Self { self.max_connections = n; self } + fn build(self) -> ServerConfig { + ServerConfig { host: self.host, port: self.port, max_connections: self.max_connections } + } +} + +// Usage: ServerConfig::builder("localhost", 8080).max_connections(200).build() +``` + +## Iterators and Closures + +### Prefer Iterator Chains Over Manual Loops + +```rust +// Good: Declarative, lazy, composable +let active_emails: Vec<String> = users.iter() + .filter(|u| u.is_active) + .map(|u| u.email.clone()) + .collect(); + +// Bad: Imperative accumulation +let mut active_emails = Vec::new(); +for user in &users { + if user.is_active { + active_emails.push(user.email.clone()); + } +} +``` + +### Use `collect()` with Type Annotation + +```rust +// Collect into different types +let names: Vec<_> = items.iter().map(|i| &i.name).collect(); +let lookup: HashMap<_, _> = items.iter().map(|i| (i.id, i)).collect(); +let combined: String = parts.iter().copied().collect(); + +// Collect Results — short-circuits on first error +let parsed: Result<Vec<i32>, _> = strings.iter().map(|s| s.parse()).collect(); +``` + +## Concurrency + +### `Arc<Mutex<T>>` for Shared Mutable State + +```rust +use std::sync::{Arc, Mutex}; + +let counter = Arc::new(Mutex::new(0)); +let handles: Vec<_> = (0..10).map(|_| { + let counter = Arc::clone(&counter); + std::thread::spawn(move || { + let mut num = counter.lock().expect("mutex poisoned"); + *num += 1; + }) +}).collect(); + +for handle in handles { + handle.join().expect("worker thread panicked"); +} +``` + +### Channels for Message Passing + +```rust +use std::sync::mpsc; + +let (tx, rx) = mpsc::sync_channel(16); // Bounded channel with backpressure + +for i in 0..5 { + let tx = tx.clone(); + std::thread::spawn(move || { + tx.send(format!("message {i}")).expect("receiver disconnected"); + }); +} +drop(tx); // Close sender so rx iterator 
terminates + +for msg in rx { + println!("{msg}"); +} +``` + +### Async with Tokio + +```rust +use tokio::time::Duration; + +async fn fetch_with_timeout(url: &str) -> Result<String> { + let response = tokio::time::timeout( + Duration::from_secs(5), + reqwest::get(url), + ) + .await + .context("request timed out")? + .context("request failed")?; + + response.text().await.context("failed to read body") +} + +// Spawn concurrent tasks +async fn fetch_all(urls: Vec<String>) -> Vec<Result<String>> { + let handles: Vec<_> = urls.into_iter() + .map(|url| tokio::spawn(async move { + fetch_with_timeout(&url).await + })) + .collect(); + + let mut results = Vec::with_capacity(handles.len()); + for handle in handles { + results.push(handle.await.unwrap_or_else(|e| panic!("spawned task panicked: {e}"))); + } + results +} +``` + +## Unsafe Code + +### When Unsafe Is Acceptable + +```rust +// Acceptable: FFI boundary with documented invariants (Rust 2024+) +/// # Safety +/// `ptr` must be a valid, aligned pointer to an initialized `Widget`. 
+unsafe fn widget_from_raw<'a>(ptr: *const Widget) -> &'a Widget { + // SAFETY: caller guarantees ptr is valid and aligned + unsafe { &*ptr } +} + +// Acceptable: Performance-critical path with proof of correctness +// SAFETY: index is always < len due to the loop bound +unsafe { slice.get_unchecked(index) } +``` + +### When Unsafe Is NOT Acceptable + +```rust +// Bad: Using unsafe to bypass borrow checker +// Bad: Using unsafe for convenience +// Bad: Using unsafe without a Safety comment +// Bad: Transmuting between unrelated types +``` + +## Module System and Crate Structure + +### Organize by Domain, Not by Type + +```text +my_app/ +├── src/ +│ ├── main.rs +│ ├── lib.rs +│ ├── auth/ # Domain module +│ │ ├── mod.rs +│ │ ├── token.rs +│ │ └── middleware.rs +│ ├── orders/ # Domain module +│ │ ├── mod.rs +│ │ ├── model.rs +│ │ └── service.rs +│ └── db/ # Infrastructure +│ ├── mod.rs +│ └── pool.rs +├── tests/ # Integration tests +├── benches/ # Benchmarks +└── Cargo.toml +``` + +### Visibility — Expose Minimally + +```rust +// Good: pub(crate) for internal sharing +pub(crate) fn validate_input(input: &str) -> bool { + !input.is_empty() +} + +// Good: Re-export public API from lib.rs +pub mod auth; +pub use auth::AuthMiddleware; + +// Bad: Making everything pub +pub fn internal_helper() {} // Should be pub(crate) or private +``` + +## Tooling Integration + +### Essential Commands + +```bash +# Build and check +cargo build +cargo check # Fast type checking without codegen +cargo clippy # Lints and suggestions +cargo fmt # Format code + +# Testing +cargo test +cargo test -- --nocapture # Show println output +cargo test --lib # Unit tests only +cargo test --test integration # Integration tests only + +# Dependencies +cargo audit # Security audit +cargo tree # Dependency tree +cargo update # Update dependencies + +# Performance +cargo bench # Run benchmarks +``` + +## Quick Reference: Rust Idioms + +| Idiom | Description | +| ----------------------------------- | 
---------------------------------------------------------- | +| Borrow, don't clone | Pass `&T` instead of cloning unless ownership is needed | +| Make illegal states unrepresentable | Use enums to model valid states only | +| `?` over `unwrap()` | Propagate errors, never panic in library/production code | +| Parse, don't validate | Convert unstructured data to typed structs at the boundary | +| Newtype for type safety | Wrap primitives in newtypes to prevent argument swaps | +| Prefer iterators over loops | Declarative chains are clearer and often faster | +| `#[must_use]` on Results | Ensure callers handle return values | +| `Cow` for flexible ownership | Avoid allocations when borrowing suffices | +| Exhaustive matching | No wildcard `_` for business-critical enums | +| Minimal `pub` surface | Use `pub(crate)` for internal APIs | + +## Anti-Patterns to Avoid + +```rust +// Bad: .unwrap() in production code +let value = map.get("key").unwrap(); + +// Bad: .clone() to satisfy borrow checker without understanding why +let data = expensive_data.clone(); +process(&original, &data); + +// Bad: Using String when &str suffices +fn greet(name: String) { /* should be &str */ } + +// Bad: Box in libraries (use thiserror instead) +fn parse(input: &str) -> Result> { todo!() } + +// Bad: Ignoring must_use warnings +let _ = validate(input); // Silently discarding a Result + +// Bad: Blocking in async context +async fn bad_async() { + std::thread::sleep(Duration::from_secs(1)); // Blocks the executor! + // Use: tokio::time::sleep(Duration::from_secs(1)).await; +} +``` + +**Remember**: If it compiles, it's probably correct — but only if you avoid `unwrap()`, minimize `unsafe`, and let the type system work for you. 
diff --git a/.claude/skills/rust-testing/SKILL.md b/.claude/skills/rust-testing/SKILL.md new file mode 100644 index 00000000..00c8866b --- /dev/null +++ b/.claude/skills/rust-testing/SKILL.md @@ -0,0 +1,502 @@ +--- +name: rust-testing +description: Rust testing patterns including unit tests, integration tests, async testing, property-based testing, mocking, and coverage. Follows TDD methodology. +origin: ECC +--- + +# Rust Testing Patterns + +Comprehensive Rust testing patterns for writing reliable, maintainable tests following TDD methodology. + +## When to Use + +- Writing new Rust functions, methods, or traits +- Adding test coverage to existing code +- Creating benchmarks for performance-critical code +- Implementing property-based tests for input validation +- Following TDD workflow in Rust projects + +## How It Works + +1. **Identify target code** — Find the function, trait, or module to test +2. **Write a test** — Use `#[test]` in a `#[cfg(test)]` module, rstest for parameterized tests, or proptest for property-based tests +3. **Mock dependencies** — Use mockall to isolate the unit under test +4. **Run tests (RED)** — Verify the test fails with the expected error +5. **Implement (GREEN)** — Write minimal code to pass +6. **Refactor** — Improve while keeping tests green +7. 
**Check coverage** — Use cargo-llvm-cov, target 80%+ + +## TDD Workflow for Rust + +### The RED-GREEN-REFACTOR Cycle + +``` +RED → Write a failing test first +GREEN → Write minimal code to pass the test +REFACTOR → Improve code while keeping tests green +REPEAT → Continue with next requirement +``` + +### Step-by-Step TDD in Rust + +```rust +// RED: Write test first, use todo!() as placeholder +pub fn add(a: i32, b: i32) -> i32 { todo!() } + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_add() { assert_eq!(add(2, 3), 5); } +} +// cargo test → panics at 'not yet implemented' +``` + +```rust +// GREEN: Replace todo!() with minimal implementation +pub fn add(a: i32, b: i32) -> i32 { a + b } +// cargo test → PASS, then REFACTOR while keeping tests green +``` + +## Unit Tests + +### Module-Level Test Organization + +```rust +// src/user.rs +pub struct User { + pub name: String, + pub email: String, +} + +impl User { + pub fn new(name: impl Into<String>, email: impl Into<String>) -> Result<Self, String> { + let email = email.into(); + if !email.contains('@') { + return Err(format!("invalid email: {email}")); + } + Ok(Self { name: name.into(), email }) + } + + pub fn display_name(&self) -> &str { + &self.name + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn creates_user_with_valid_email() { + let user = User::new("Alice", "alice@example.com").unwrap(); + assert_eq!(user.display_name(), "Alice"); + assert_eq!(user.email, "alice@example.com"); + } + + #[test] + fn rejects_invalid_email() { + let result = User::new("Bob", "not-an-email"); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("invalid email")); + } +} +``` + +### Assertion Macros + +```rust +assert_eq!(2 + 2, 4); // Equality +assert_ne!(2 + 2, 5); // Inequality +assert!(vec![1, 2, 3].contains(&2)); // Boolean +assert_eq!(value, 42, "expected 42 but got {value}"); // Custom message +assert!((0.1_f64 + 0.2 - 0.3).abs() < f64::EPSILON); // Float comparison +``` + +## Error and Panic Testing + 
+### Testing `Result` Returns + +```rust +#[test] +fn parse_returns_error_for_invalid_input() { + let result = parse_config("}{invalid"); + assert!(result.is_err()); + + // Assert specific error variant + let err = result.unwrap_err(); + assert!(matches!(err, ConfigError::ParseError(_))); +} + +#[test] +fn parse_succeeds_for_valid_input() -> Result<(), Box> { + let config = parse_config(r#"{"port": 8080}"#)?; + assert_eq!(config.port, 8080); + Ok(()) // Test fails if any ? returns Err +} +``` + +### Testing Panics + +```rust +#[test] +#[should_panic] +fn panics_on_empty_input() { + process(&[]); +} + +#[test] +#[should_panic(expected = "index out of bounds")] +fn panics_with_specific_message() { + let v: Vec = vec![]; + let _ = v[0]; +} +``` + +## Integration Tests + +### File Structure + +```text +my_crate/ +├── src/ +│ └── lib.rs +├── tests/ # Integration tests +│ ├── api_test.rs # Each file is a separate test binary +│ ├── db_test.rs +│ └── common/ # Shared test utilities +│ └── mod.rs +``` + +### Writing Integration Tests + +```rust +// tests/api_test.rs +use my_crate::{App, Config}; + +#[test] +fn full_request_lifecycle() { + let config = Config::test_default(); + let app = App::new(config); + + let response = app.handle_request("/health"); + assert_eq!(response.status, 200); + assert_eq!(response.body, "OK"); +} +``` + +## Async Tests + +### With Tokio + +```rust +#[tokio::test] +async fn fetches_data_successfully() { + let client = TestClient::new().await; + let result = client.get("/data").await; + assert!(result.is_ok()); + assert_eq!(result.unwrap().items.len(), 3); +} + +#[tokio::test] +async fn handles_timeout() { + use std::time::Duration; + let result = tokio::time::timeout( + Duration::from_millis(100), + slow_operation(), + ).await; + + assert!(result.is_err(), "should have timed out"); +} +``` + +## Test Organization Patterns + +### Parameterized Tests with `rstest` + +```rust +use rstest::{rstest, fixture}; + +#[rstest] +#[case("hello", 5)] 
+#[case("", 0)] +#[case("rust", 4)] +fn test_string_length(#[case] input: &str, #[case] expected: usize) { + assert_eq!(input.len(), expected); +} + +// Fixtures +#[fixture] +fn test_db() -> TestDb { + TestDb::new_in_memory() +} + +#[rstest] +fn test_insert(test_db: TestDb) { + test_db.insert("key", "value"); + assert_eq!(test_db.get("key"), Some("value".into())); +} +``` + +### Test Helpers + +```rust +#[cfg(test)] +mod tests { + use super::*; + + /// Creates a test user with sensible defaults. + fn make_user(name: &str) -> User { + User::new(name, &format!("{name}@test.com")).unwrap() + } + + #[test] + fn user_display() { + let user = make_user("alice"); + assert_eq!(user.display_name(), "alice"); + } +} +``` + +## Property-Based Testing with `proptest` + +### Basic Property Tests + +```rust +use proptest::prelude::*; + +proptest! { + #[test] + fn encode_decode_roundtrip(input in ".*") { + let encoded = encode(&input); + let decoded = decode(&encoded).unwrap(); + assert_eq!(input, decoded); + } + + #[test] + fn sort_preserves_length(mut vec in prop::collection::vec(any::(), 0..100)) { + let original_len = vec.len(); + vec.sort(); + assert_eq!(vec.len(), original_len); + } + + #[test] + fn sort_produces_ordered_output(mut vec in prop::collection::vec(any::(), 0..100)) { + vec.sort(); + for window in vec.windows(2) { + assert!(window[0] <= window[1]); + } + } +} +``` + +### Custom Strategies + +```rust +use proptest::prelude::*; + +fn valid_email() -> impl Strategy { + ("[a-z]{1,10}", "[a-z]{1,5}") + .prop_map(|(user, domain)| format!("{user}@{domain}.com")) +} + +proptest! 
{ + #[test] + fn accepts_valid_emails(email in valid_email()) { + assert!(User::new("Test", &email).is_ok()); + } +} +``` + +## Mocking with `mockall` + +### Trait-Based Mocking + +```rust +use mockall::{automock, predicate::eq}; + +#[automock] +trait UserRepository { + fn find_by_id(&self, id: u64) -> Option; + fn save(&self, user: &User) -> Result<(), StorageError>; +} + +#[test] +fn service_returns_user_when_found() { + let mut mock = MockUserRepository::new(); + mock.expect_find_by_id() + .with(eq(42)) + .times(1) + .returning(|_| Some(User { id: 42, name: "Alice".into() })); + + let service = UserService::new(Box::new(mock)); + let user = service.get_user(42).unwrap(); + assert_eq!(user.name, "Alice"); +} + +#[test] +fn service_returns_none_when_not_found() { + let mut mock = MockUserRepository::new(); + mock.expect_find_by_id() + .returning(|_| None); + + let service = UserService::new(Box::new(mock)); + assert!(service.get_user(99).is_none()); +} +``` + +## Doc Tests + +### Executable Documentation + +````rust +/// Adds two numbers together. +/// +/// # Examples +/// +/// ``` +/// use my_crate::add; +/// +/// assert_eq!(add(2, 3), 5); +/// assert_eq!(add(-1, 1), 0); +/// ``` +pub fn add(a: i32, b: i32) -> i32 { + a + b +} + +/// Parses a config string. +/// +/// # Errors +/// +/// Returns `Err` if the input is not valid TOML. 
+/// +/// ```no_run +/// use my_crate::parse_config; +/// +/// let config = parse_config(r#"port = 8080"#).unwrap(); +/// assert_eq!(config.port, 8080); +/// ``` +/// +/// ```no_run +/// use my_crate::parse_config; +/// +/// assert!(parse_config("}{invalid").is_err()); +/// ``` +pub fn parse_config(input: &str) -> Result { + todo!() +} +```` + +## Benchmarking with Criterion + +```toml +# Cargo.toml +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "benchmark" +harness = false +``` + +```rust +// benches/benchmark.rs +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +fn fibonacci(n: u64) -> u64 { + match n { + 0 | 1 => n, + _ => fibonacci(n - 1) + fibonacci(n - 2), + } +} + +fn bench_fibonacci(c: &mut Criterion) { + c.bench_function("fib 20", |b| b.iter(|| fibonacci(black_box(20)))); +} + +criterion_group!(benches, bench_fibonacci); +criterion_main!(benches); +``` + +## Test Coverage + +### Running Coverage + +```bash +# Install: cargo install cargo-llvm-cov (or use taiki-e/install-action in CI) +cargo llvm-cov # Summary +cargo llvm-cov --html # HTML report +cargo llvm-cov --lcov > lcov.info # LCOV format for CI +cargo llvm-cov --fail-under-lines 80 # Fail if below threshold +``` + +### Coverage Targets + +| Code Type | Target | +| ------------------------ | ------- | +| Critical business logic | 100% | +| Public API | 90%+ | +| General code | 80%+ | +| Generated / FFI bindings | Exclude | + +## Testing Commands + +```bash +cargo test # Run all tests +cargo test -- --nocapture # Show println output +cargo test test_name # Run tests matching pattern +cargo test --lib # Unit tests only +cargo test --test api_test # Integration tests only +cargo test --doc # Doc tests only +cargo test --no-fail-fast # Don't stop on first failure +cargo test -- --ignored # Run ignored tests +``` + +## Best Practices + +**DO:** + +- Write tests FIRST (TDD) +- Use `#[cfg(test)]` modules for unit tests +- Test 
behavior, not implementation +- Use descriptive test names that explain the scenario +- Prefer `assert_eq!` over `assert!` for better error messages +- Use `?` in tests that return `Result` for cleaner error output +- Keep tests independent — no shared mutable state + +**DON'T:** + +- Use `#[should_panic]` when you can test `Result::is_err()` instead +- Mock everything — prefer integration tests when feasible +- Ignore flaky tests — fix or quarantine them +- Use `sleep()` in tests — use channels, barriers, or `tokio::time::pause()` +- Skip error path testing + +## CI Integration + +```yaml +# GitHub Actions +test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy, rustfmt + + - name: Check formatting + run: cargo fmt --check + + - name: Clippy + run: cargo clippy -- -D warnings + + - name: Run tests + run: cargo test + + - uses: taiki-e/install-action@cargo-llvm-cov + + - name: Coverage + run: cargo llvm-cov --fail-under-lines 80 +``` + +**Remember**: Tests are documentation. They show how your code is meant to be used. Write them clearly and keep them up to date. diff --git a/.claude/skills/security-bounty-hunter/SKILL.md b/.claude/skills/security-bounty-hunter/SKILL.md new file mode 100644 index 00000000..aed7c870 --- /dev/null +++ b/.claude/skills/security-bounty-hunter/SKILL.md @@ -0,0 +1,99 @@ +--- +name: security-bounty-hunter +description: Hunt for exploitable, bounty-worthy security issues in repositories. Focuses on remotely reachable vulnerabilities that qualify for real reports instead of noisy local-only findings. +origin: ECC direct-port adaptation +version: 1.0.0 +--- + +# Security Bounty Hunter + +Use this when the goal is practical vulnerability discovery for responsible disclosure or bounty submission, not a broad best-practices review. 
+ +## When to Use + +- Scanning a repository for exploitable vulnerabilities +- Preparing a Huntr, HackerOne, or similar bounty submission +- Triage where the question is "does this actually pay?" rather than "is this theoretically unsafe?" + +## How It Works + +Bias toward remotely reachable, user-controlled attack paths and throw away patterns that platforms routinely reject as informative or out of scope. + +## In-Scope Patterns + +These are the kinds of issues that consistently matter: + +| Pattern | CWE | Typical impact | +| --------------------------------------------- | ------- | ------------------------------------------------ | +| SSRF through user-controlled URLs | CWE-918 | internal network access, cloud metadata theft | +| Auth bypass in middleware or API guards | CWE-287 | unauthorized account or data access | +| Remote deserialization or upload-to-RCE paths | CWE-502 | code execution | +| SQL injection in reachable endpoints | CWE-89 | data exfiltration, auth bypass, data destruction | +| Command injection in request handlers | CWE-78 | code execution | +| Path traversal in file-serving paths | CWE-22 | arbitrary file read or write | +| Auto-triggered XSS | CWE-79 | session theft, admin compromise | + +## Skip These + +These are usually low-signal or out of bounty scope unless the program says otherwise: + +- Local-only `pickle.loads`, `torch.load`, or equivalent with no remote path +- `eval()` or `exec()` in CLI-only tooling +- `shell=True` on fully hardcoded commands +- Missing security headers by themselves +- Generic rate-limiting complaints without exploit impact +- Self-XSS requiring the victim to paste code manually +- CI/CD injection that is not part of the target program scope +- Demo, example, or test-only code + +## Workflow + +1. Check scope first: program rules, SECURITY.md, disclosure channel, and exclusions. +2. Find real entrypoints: HTTP handlers, uploads, background jobs, webhooks, parsers, and integration endpoints. +3. 
Run static tooling where it helps, but treat it as triage input only. +4. Read the real code path end to end. +5. Prove user control reaches a meaningful sink. +6. Confirm exploitability and impact with the smallest safe PoC possible. +7. Check for duplicates before drafting a report. + +## Example Triage Loop + +```bash +semgrep --config=auto --severity=ERROR --severity=WARNING --json +``` + +Then manually filter: + +- drop tests, demos, fixtures, vendored code +- drop local-only or non-reachable paths +- keep only findings with a clear network or user-controlled route + +## Report Structure + +```markdown +## Description +[What the vulnerability is and why it matters] + +## Vulnerable Code +[File path, line range, and a small snippet] + +## Proof of Concept +[Minimal working request or script] + +## Impact +[What the attacker can achieve] + +## Affected Version +[Version, commit, or deployment target tested] +``` + +## Quality Gate + +Before submitting: + +- The code path is reachable from a real user or network boundary +- The input is genuinely user-controlled +- The sink is meaningful and exploitable +- The PoC works +- The issue is not already covered by an advisory, CVE, or open ticket +- The target is actually in scope for the bounty program diff --git a/.claude/skills/security-scan/SKILL.md b/.claude/skills/security-scan/SKILL.md new file mode 100644 index 00000000..9c4bfda5 --- /dev/null +++ b/.claude/skills/security-scan/SKILL.md @@ -0,0 +1,172 @@ +--- +name: security-scan +description: Scan your Claude Code configuration (.claude/ directory) for security vulnerabilities, misconfigurations, and injection risks using AgentShield. Checks CLAUDE.md, settings.json, MCP servers, hooks, and agent definitions. +origin: ECC +--- + +# Security Scan Skill + +Audit your Claude Code configuration for security issues using [AgentShield](https://github.com/affaan-m/agentshield). 
+ +## When to Activate + +- Setting up a new Claude Code project +- After modifying `.claude/settings.json`, `CLAUDE.md`, or MCP configs +- Before committing configuration changes +- When onboarding to a new repository with existing Claude Code configs +- Periodic security hygiene checks + +## What It Scans + +| File | Checks | +| --------------- | -------------------------------------------------------------------------------- | +| `CLAUDE.md` | Hardcoded secrets, auto-run instructions, prompt injection patterns | +| `settings.json` | Overly permissive allow lists, missing deny lists, dangerous bypass flags | +| `mcp.json` | Risky MCP servers, hardcoded env secrets, npx supply chain risks | +| `hooks/` | Command injection via interpolation, data exfiltration, silent error suppression | +| `agents/*.md` | Unrestricted tool access, prompt injection surface, missing model specs | + +## Prerequisites + +AgentShield must be installed. Check and install if needed: + +```bash +# Check if installed +npx ecc-agentshield --version + +# Install globally (recommended) +npm install -g ecc-agentshield + +# Or run directly via npx (no install needed) +npx ecc-agentshield scan . 
+``` + +## Usage + +### Basic Scan + +Run against the current project's `.claude/` directory: + +```bash +# Scan current project +npx ecc-agentshield scan + +# Scan a specific path +npx ecc-agentshield scan --path /path/to/.claude + +# Scan with minimum severity filter +npx ecc-agentshield scan --min-severity medium +``` + +### Output Formats + +```bash +# Terminal output (default) — colored report with grade +npx ecc-agentshield scan + +# JSON — for CI/CD integration +npx ecc-agentshield scan --format json + +# Markdown — for documentation +npx ecc-agentshield scan --format markdown + +# HTML — self-contained dark-theme report +npx ecc-agentshield scan --format html > security-report.html +``` + +### Auto-Fix + +Apply safe fixes automatically (only fixes marked as auto-fixable): + +```bash +npx ecc-agentshield scan --fix +``` + +This will: + +- Replace hardcoded secrets with environment variable references +- Tighten wildcard permissions to scoped alternatives +- Never modify manual-only suggestions + +### Opus 4.6 Deep Analysis + +Run the adversarial three-agent pipeline for deeper analysis: + +```bash +# Requires ANTHROPIC_API_KEY +export ANTHROPIC_API_KEY=your-key +npx ecc-agentshield scan --opus --stream +``` + +This runs: + +1. **Attacker (Red Team)** — finds attack vectors +2. **Defender (Blue Team)** — recommends hardening +3. **Auditor (Final Verdict)** — synthesizes both perspectives + +### Initialize Secure Config + +Scaffold a new secure `.claude/` configuration from scratch: + +```bash +npx ecc-agentshield init +``` + +Creates: + +- `settings.json` with scoped permissions and deny list +- `CLAUDE.md` with security best practices +- `mcp.json` placeholder + +### GitHub Action + +Add to your CI pipeline: + +```yaml + - uses: affaan-m/agentshield@v1 + with: + path: . 
+ min-severity: medium + fail-on-findings: true +``` + +## Severity Levels + +| Grade | Score | Meaning | +| ----- | ------ | ------------------------ | +| A | 90-100 | Secure configuration | +| B | 75-89 | Minor issues | +| C | 60-74 | Needs attention | +| D | 40-59 | Significant risks | +| F | 0-39 | Critical vulnerabilities | + +## Interpreting Results + +### Critical Findings (fix immediately) + +- Hardcoded API keys or tokens in config files +- `Bash(*)` in the allow list (unrestricted shell access) +- Command injection in hooks via `${file}` interpolation +- Shell-running MCP servers + +### High Findings (fix before production) + +- Auto-run instructions in CLAUDE.md (prompt injection vector) +- Missing deny lists in permissions +- Agents with unnecessary Bash access + +### Medium Findings (recommended) + +- Silent error suppression in hooks (`2>/dev/null`, `|| true`) +- Missing PreToolUse security hooks +- `npx -y` auto-install in MCP server configs + +### Info Findings (awareness) + +- Missing descriptions on MCP servers +- Prohibitive instructions correctly flagged as good practice + +## Links + +- **GitHub**: [github.com/affaan-m/agentshield](https://github.com/affaan-m/agentshield) +- **npm**: [npmjs.com/package/ecc-agentshield](https://www.npmjs.com/package/ecc-agentshield) diff --git a/.claude/skills/skill-stocktake/SKILL.md b/.claude/skills/skill-stocktake/SKILL.md new file mode 100644 index 00000000..2bde9158 --- /dev/null +++ b/.claude/skills/skill-stocktake/SKILL.md @@ -0,0 +1,189 @@ +--- +description: Use when auditing Claude skills and commands for quality. Supports Quick Scan (changed skills only) and Full Stocktake modes with sequential subagent batch evaluation. +origin: ECC +--- + +# skill-stocktake + +Slash command (`/skill-stocktake`) that audits all Claude skills and commands using a quality checklist + AI holistic judgment. Supports two modes: Quick Scan for recently changed skills, and Full Stocktake for a complete review. 
+ +## Scope + +The command targets the following paths **relative to the directory where it is invoked**: + +| Path | Description | +| ----------------------- | ---------------------------------------------- | +| `~/.claude/skills/` | Global skills (all projects) | +| `{cwd}/.claude/skills/` | Project-level skills (if the directory exists) | + +**At the start of Phase 1, the command explicitly lists which paths were found and scanned.** + +### Targeting a specific project + +To include project-level skills, run from that project's root directory: + +```bash +cd ~/path/to/my-project +/skill-stocktake +``` + +If the project has no `.claude/skills/` directory, only global skills and commands are evaluated. + +## Modes + +| Mode | Trigger | Duration | +| -------------- | ------------------------------------------------- | --------- | +| Quick Scan | `results.json` exists (default) | 5–10 min | +| Full Stocktake | `results.json` absent, or `/skill-stocktake full` | 20–30 min | + +**Results cache:** `~/.claude/skills/skill-stocktake/results.json` + +## Quick Scan Flow + +Re-evaluate only skills that have changed since the last run (5–10 min). + +1. Read `~/.claude/skills/skill-stocktake/results.json` +2. Run: `bash ~/.claude/skills/skill-stocktake/scripts/quick-diff.sh \ ~/.claude/skills/skill-stocktake/results.json` (Project dir is auto-detected from `$PWD/.claude/skills`; pass it explicitly only if needed) +3. If output is `[]`: report "No changes since last run." and stop +4. Re-evaluate only those changed files using the same Phase 2 criteria +5. Carry forward unchanged skills from previous results +6. Output only the diff +7. Run: `bash ~/.claude/skills/skill-stocktake/scripts/save-results.sh \ ~/.claude/skills/skill-stocktake/results.json <<< "$EVAL_RESULTS"` + +## Full Stocktake Flow + +### Phase 1 — Inventory + +Run: `bash ~/.claude/skills/skill-stocktake/scripts/scan.sh` + +The script enumerates skill files, extracts frontmatter, and collects UTC mtimes. 
Project dir is auto-detected from `$PWD/.claude/skills`; pass it explicitly only if needed. Present the scan summary and inventory table from the script output: + +``` +Scanning: + ✓ ~/.claude/skills/ (17 files) + ✗ {cwd}/.claude/skills/ (not found — global skills only) +``` + +| Skill | 7d use | 30d use | Description | +| ----- | ------ | ------- | ----------- | + +### Phase 2 — Quality Evaluation + +Launch an Agent tool subagent (**general-purpose agent**) with the full inventory and checklist: + +```text +Agent( + subagent_type="general-purpose", + prompt=" +Evaluate the following skill inventory against the checklist. + +[INVENTORY] + +[CHECKLIST] + +Return JSON for each skill: +{ \"verdict\": \"Keep\"|\"Improve\"|\"Update\"|\"Retire\"|\"Merge into [X]\", \"reason\": \"...\" } +" +) +``` + +The subagent reads each skill, applies the checklist, and returns per-skill JSON: + +`{ "verdict": "Keep"|"Improve"|"Update"|"Retire"|"Merge into [X]", "reason": "..." }` + +**Chunk guidance:** Process ~20 skills per subagent invocation to keep context manageable. Save intermediate results to `results.json` (`status: "in_progress"`) after each chunk. + +After all skills are evaluated: set `status: "completed"`, proceed to Phase 3. + +**Resume detection:** If `status: "in_progress"` is found on startup, resume from the first unevaluated skill. 
+ +Each skill is evaluated against this checklist: + +``` +- [ ] Content overlap with other skills checked +- [ ] Overlap with MEMORY.md / CLAUDE.md checked +- [ ] Freshness of technical references verified (use WebSearch if tool names / CLI flags / APIs are present) +- [ ] Usage frequency considered +``` + +Verdict criteria: + +| Verdict | Meaning | +| -------------- | ------------------------------------------------------------- | +| Keep | Useful and current | +| Improve | Worth keeping, but specific improvements needed | +| Update | Referenced technology is outdated (verify with WebSearch) | +| Retire | Low quality, stale, or cost-asymmetric | +| Merge into [X] | Substantial overlap with another skill; name the merge target | + +Evaluation is **holistic AI judgment** — not a numeric rubric. Guiding dimensions: + +- **Actionability**: code examples, commands, or steps that let you act immediately +- **Scope fit**: name, trigger, and content are aligned; not too broad or narrow +- **Uniqueness**: value not replaceable by MEMORY.md / CLAUDE.md / another skill +- **Currency**: technical references work in the current environment + +**Reason quality requirements** — the `reason` field must be self-contained and decision-enabling: + +- Do NOT write "unchanged" alone — always restate the core evidence +- For **Retire**: state (1) what specific defect was found, (2) what covers the same need instead + - Bad: `"Superseded"` + - Good: `"disable-model-invocation: true already set; superseded by continuous-learning-v2 which covers all the same patterns plus confidence scoring. No unique content remains."` +- For **Merge**: name the target and describe what content to integrate + - Bad: `"Overlaps with X"` + - Good: `"42-line thin content; Step 4 of chatlog-to-article already covers the same workflow. 
Integrate the 'article angle' tip as a note in that skill."` +- For **Improve**: describe the specific change needed (what section, what action, target size if relevant) + - Bad: `"Too long"` + - Good: `"276 lines; Section 'Framework Comparison' (L80–140) duplicates ai-era-architecture-principles; delete it to reach ~150 lines."` +- For **Keep** (mtime-only change in Quick Scan): restate the original verdict rationale, do not write "unchanged" + - Bad: `"Unchanged"` + - Good: `"mtime updated but content unchanged. Unique Python reference explicitly imported by rules/python/; no overlap found."` + +### Phase 3 — Summary Table + +| Skill | 7d use | Verdict | Reason | +| ----- | ------ | ------- | ------ | + +### Phase 4 — Consolidation + +1. **Retire / Merge**: present detailed justification per file before confirming with user: + - What specific problem was found (overlap, staleness, broken references, etc.) + - What alternative covers the same functionality (for Retire: which existing skill/rule; for Merge: the target file and what content to integrate) + - Impact of removal (any dependent skills, MEMORY.md references, or workflows affected) +2. **Improve**: present specific improvement suggestions with rationale: + - What to change and why (e.g., "trim 430→200 lines because sections X/Y duplicate python-patterns") + - User decides whether to act +3. **Update**: present updated content with sources checked +4. Check MEMORY.md line count; propose compression if >100 lines + +## Results File Schema + +`~/.claude/skills/skill-stocktake/results.json`: + +**`evaluated_at`**: Must be set to the actual UTC time of evaluation completion. Obtain via Bash: `date -u +%Y-%m-%dT%H:%M:%SZ`. Never use a date-only approximation like `T00:00:00Z`. 
+ +```json +{ + "evaluated_at": "2026-02-21T10:00:00Z", + "mode": "full", + "batch_progress": { + "total": 80, + "evaluated": 80, + "status": "completed" + }, + "skills": { + "skill-name": { + "path": "~/.claude/skills/skill-name/SKILL.md", + "verdict": "Keep", + "reason": "Concrete, actionable, unique value for X workflow", + "mtime": "2026-01-15T08:30:00Z" + } + } +} +``` + +## Notes + +- Evaluation is blind: the same checklist applies to all skills regardless of origin (ECC, self-authored, auto-extracted) +- Archive / delete operations always require explicit user confirmation +- No verdict branching by skill origin diff --git a/.claude/skills/skill-stocktake/scripts/quick-diff.sh b/.claude/skills/skill-stocktake/scripts/quick-diff.sh new file mode 100755 index 00000000..c145100a --- /dev/null +++ b/.claude/skills/skill-stocktake/scripts/quick-diff.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# quick-diff.sh — compare skill file mtimes against results.json evaluated_at +# Usage: quick-diff.sh RESULTS_JSON [CWD_SKILLS_DIR] +# Output: JSON array of changed/new files to stdout (empty [] if no changes) +# +# When CWD_SKILLS_DIR is omitted, defaults to $PWD/.claude/skills so the +# script always picks up project-level skills without relying on the caller. +# +# Environment: +# SKILL_STOCKTAKE_GLOBAL_DIR Override ~/.claude/skills (for testing only; +# do not set in production — intended for bats tests) +# SKILL_STOCKTAKE_PROJECT_DIR Override project dir detection (for testing only) + +set -euo pipefail + +RESULTS_JSON="${1:-}" +CWD_SKILLS_DIR="${SKILL_STOCKTAKE_PROJECT_DIR:-${2:-$PWD/.claude/skills}}" +GLOBAL_DIR="${SKILL_STOCKTAKE_GLOBAL_DIR:-$HOME/.claude/skills}" + +if [[ -z "$RESULTS_JSON" || ! -f "$RESULTS_JSON" ]]; then + echo "Error: RESULTS_JSON not found: ${RESULTS_JSON:-}" >&2 + exit 1 +fi + +# Validate CWD_SKILLS_DIR looks like a .claude/skills path (defense-in-depth). +# Only warn when the path exists — a nonexistent path poses no traversal risk. 
+if [[ -n "$CWD_SKILLS_DIR" && -d "$CWD_SKILLS_DIR" && "$CWD_SKILLS_DIR" != */.claude/skills* ]]; then + echo "Warning: CWD_SKILLS_DIR does not look like a .claude/skills path: $CWD_SKILLS_DIR" >&2 +fi + +evaluated_at=$(jq -r '.evaluated_at' "$RESULTS_JSON") + +# Fail fast on a missing or malformed evaluated_at rather than producing +# unpredictable results from ISO 8601 string comparison against "null". +if [[ ! "$evaluated_at" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$ ]]; then + echo "Error: invalid or missing evaluated_at in $RESULTS_JSON: $evaluated_at" >&2 + exit 1 +fi + +# Pre-extract known paths from results.json once (O(1) lookup per file instead of O(n*m)) +known_paths=$(jq -r '.skills[].path' "$RESULTS_JSON" 2>/dev/null) + +tmpdir=$(mktemp -d) +# Use a function to avoid embedding $tmpdir in a quoted string (prevents injection +# if TMPDIR were crafted to contain shell metacharacters). +_cleanup() { rm -rf "$tmpdir"; } +trap _cleanup EXIT + +# Shared counter across process_dir calls — intentionally NOT local +i=0 + +process_dir() { + local dir="$1" + while IFS= read -r file; do + local mtime dp is_new + mtime=$(date -u -r "$file" +%Y-%m-%dT%H:%M:%SZ) + dp="${file/#$HOME/~}" + + # Check if this file is known to results.json (exact whole-line match to + # avoid substring false-positives, e.g. "python-patterns" matching "python-patterns-v2"). 
+ if echo "$known_paths" | grep -qxF "$dp"; then + is_new="false" + # Known file: only emit if mtime changed (ISO 8601 string comparison is safe) + [[ "$mtime" > "$evaluated_at" ]] || continue + else + is_new="true" + # New file: always emit regardless of mtime + fi + + jq -n \ + --arg path "$dp" \ + --arg mtime "$mtime" \ + --argjson is_new "$is_new" \ + '{path:$path,mtime:$mtime,is_new:$is_new}' \ + > "$tmpdir/$i.json" + i=$((i+1)) + done < <(find "$dir" -name "*.md" -type f 2>/dev/null | sort) +} + +[[ -d "$GLOBAL_DIR" ]] && process_dir "$GLOBAL_DIR" +[[ -n "$CWD_SKILLS_DIR" && -d "$CWD_SKILLS_DIR" ]] && process_dir "$CWD_SKILLS_DIR" + +if [[ $i -eq 0 ]]; then + echo "[]" +else + jq -s '.' "$tmpdir"/*.json +fi diff --git a/.claude/skills/skill-stocktake/scripts/save-results.sh b/.claude/skills/skill-stocktake/scripts/save-results.sh new file mode 100755 index 00000000..32952007 --- /dev/null +++ b/.claude/skills/skill-stocktake/scripts/save-results.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# save-results.sh — merge evaluated skills into results.json with correct UTC timestamp +# Usage: save-results.sh RESULTS_JSON <<< "$EVAL_JSON" +# +# stdin format: +# { "skills": {...}, "mode"?: "full"|"quick", "batch_progress"?: {...} } +# +# Always sets evaluated_at to current UTC time via `date -u`. +# Merges stdin .skills into existing results.json (new entries override old). +# Optionally updates .mode and .batch_progress if present in stdin. + +set -euo pipefail + +RESULTS_JSON="${1:-}" + +if [[ -z "$RESULTS_JSON" ]]; then + echo "Error: RESULTS_JSON argument required" >&2 + echo "Usage: save-results.sh RESULTS_JSON <<< \"\$EVAL_JSON\"" >&2 + exit 1 +fi + +EVALUATED_AT=$(date -u +%Y-%m-%dT%H:%M:%SZ) + +# Read eval results from stdin and validate JSON before touching the results file +input_json=$(cat) +if ! echo "$input_json" | jq empty 2>/dev/null; then + echo "Error: stdin is not valid JSON" >&2 + exit 1 +fi + +if [[ ! 
-f "$RESULTS_JSON" ]]; then + # Bootstrap: create new results.json from stdin JSON + current UTC timestamp + echo "$input_json" | jq --arg ea "$EVALUATED_AT" \ + '. + { evaluated_at: $ea }' > "$RESULTS_JSON" + exit 0 +fi + +# Merge: new .skills override existing ones; old skills not in input_json are kept. +# Optionally update .mode and .batch_progress if provided. +# +# Use mktemp for a collision-safe temp file (concurrent runs on the same RESULTS_JSON +# would race on a predictable ".tmp" suffix; random suffix prevents silent overwrites). +tmp=$(mktemp "${RESULTS_JSON}.XXXXXX") +trap 'rm -f "$tmp"' EXIT + +jq -s \ + --arg ea "$EVALUATED_AT" \ + '.[0] as $existing | .[1] as $new | + $existing | + .evaluated_at = $ea | + .skills = ($existing.skills + ($new.skills // {})) | + if ($new | has("mode")) then .mode = $new.mode else . end | + if ($new | has("batch_progress")) then .batch_progress = $new.batch_progress else . end' \ + "$RESULTS_JSON" <(echo "$input_json") > "$tmp" + +mv "$tmp" "$RESULTS_JSON" diff --git a/.claude/skills/skill-stocktake/scripts/scan.sh b/.claude/skills/skill-stocktake/scripts/scan.sh new file mode 100755 index 00000000..5f1d12db --- /dev/null +++ b/.claude/skills/skill-stocktake/scripts/scan.sh @@ -0,0 +1,170 @@ +#!/usr/bin/env bash +# scan.sh — enumerate skill files, extract frontmatter and UTC mtime +# Usage: scan.sh [CWD_SKILLS_DIR] +# Output: JSON to stdout +# +# When CWD_SKILLS_DIR is omitted, defaults to $PWD/.claude/skills so the +# script always picks up project-level skills without relying on the caller. 
+# +# Environment: +# SKILL_STOCKTAKE_GLOBAL_DIR Override ~/.claude/skills (for testing only; +# do not set in production — intended for bats tests) +# SKILL_STOCKTAKE_PROJECT_DIR Override project dir detection (for testing only) + +set -euo pipefail + +GLOBAL_DIR="${SKILL_STOCKTAKE_GLOBAL_DIR:-$HOME/.claude/skills}" +CWD_SKILLS_DIR="${SKILL_STOCKTAKE_PROJECT_DIR:-${1:-$PWD/.claude/skills}}" +# Path to JSONL file containing tool-use observations (optional; used for usage frequency counts). +# Override via SKILL_STOCKTAKE_OBSERVATIONS env var if your setup uses a different path. +OBSERVATIONS="${SKILL_STOCKTAKE_OBSERVATIONS:-$HOME/.claude/observations.jsonl}" + +# Validate CWD_SKILLS_DIR looks like a .claude/skills path (defense-in-depth). +# Only warn when the path exists — a nonexistent path poses no traversal risk. +if [[ -n "$CWD_SKILLS_DIR" && -d "$CWD_SKILLS_DIR" && "$CWD_SKILLS_DIR" != */.claude/skills* ]]; then + echo "Warning: CWD_SKILLS_DIR does not look like a .claude/skills path: $CWD_SKILLS_DIR" >&2 +fi + +# Extract a frontmatter field (handles both quoted and unquoted single-line values). +# Does NOT support multi-line YAML blocks (| or >) or nested YAML keys. +extract_field() { + local file="$1" field="$2" + awk -v f="$field" ' + BEGIN { fm=0 } + /^---$/ { fm++; next } + fm==1 { + n = length(f) + 2 + if (substr($0, 1, n) == f ": ") { + val = substr($0, n+1) + gsub(/^"/, "", val) + gsub(/"$/, "", val) + print val + exit + } + } + fm>=2 { exit } + ' "$file" +} + +# Get UTC timestamp N days ago (supports both macOS and GNU date) +date_ago() { + local n="$1" + date -u -v-"${n}d" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || + date -u -d "${n} days ago" +%Y-%m-%dT%H:%M:%SZ +} + +# Count observations matching a file path since a cutoff timestamp +count_obs() { + local file="$1" cutoff="$2" + if [[ ! 
-f "$OBSERVATIONS" ]]; then + echo 0 + return + fi + jq -r --arg p "$file" --arg c "$cutoff" \ + 'select(.tool=="Read" and .path==$p and .timestamp>=$c) | 1' \ + "$OBSERVATIONS" 2>/dev/null | wc -l | tr -d ' ' +} + +# Scan a directory and produce a JSON array of skill objects +scan_dir_to_json() { + local dir="$1" + local c7 c30 + c7=$(date_ago 7) + c30=$(date_ago 30) + + local tmpdir + tmpdir=$(mktemp -d) + # Use a function to avoid embedding $tmpdir in a quoted string (prevents injection + # if TMPDIR were crafted to contain shell metacharacters). + local _scan_tmpdir="$tmpdir" + _scan_cleanup() { rm -rf "$_scan_tmpdir"; } + trap _scan_cleanup RETURN + + # Pre-aggregate observation counts in two passes (one per window) instead of + # calling jq per-file — reduces from O(n*m) to O(n+m) jq invocations. + local obs_7d_counts obs_30d_counts + obs_7d_counts="" + obs_30d_counts="" + if [[ -f "$OBSERVATIONS" ]]; then + obs_7d_counts=$(jq -r --arg c "$c7" \ + 'select(.tool=="Read" and .timestamp>=$c) | .path' \ + "$OBSERVATIONS" 2>/dev/null | sort | uniq -c) + obs_30d_counts=$(jq -r --arg c "$c30" \ + 'select(.tool=="Read" and .timestamp>=$c) | .path' \ + "$OBSERVATIONS" 2>/dev/null | sort | uniq -c) + fi + + local i=0 + while IFS= read -r file; do + local name desc mtime u7 u30 dp + name=$(extract_field "$file" "name") + desc=$(extract_field "$file" "description") + mtime=$(date -u -r "$file" +%Y-%m-%dT%H:%M:%SZ) + # Use awk exact field match to avoid substring false-positives from grep -F. + # uniq -c output format: " N /path/to/file" — path is always field 2. 
+ u7=$(echo "$obs_7d_counts" | awk -v f="$file" '$2 == f {print $1}' | head -1) + u7="${u7:-0}" + u30=$(echo "$obs_30d_counts" | awk -v f="$file" '$2 == f {print $1}' | head -1) + u30="${u30:-0}" + dp="${file/#$HOME/~}" + + jq -n \ + --arg path "$dp" \ + --arg name "$name" \ + --arg description "$desc" \ + --arg mtime "$mtime" \ + --argjson use_7d "$u7" \ + --argjson use_30d "$u30" \ + '{path:$path,name:$name,description:$description,use_7d:$use_7d,use_30d:$use_30d,mtime:$mtime}' \ + > "$tmpdir/$i.json" + i=$((i+1)) + done < <(find "$dir" -name "*.md" -type f 2>/dev/null | sort) + + if [[ $i -eq 0 ]]; then + echo "[]" + else + jq -s '.' "$tmpdir"/*.json + fi +} + +# --- Main --- + +global_found="false" +global_count=0 +global_skills="[]" + +if [[ -d "$GLOBAL_DIR" ]]; then + global_found="true" + global_skills=$(scan_dir_to_json "$GLOBAL_DIR") + global_count=$(echo "$global_skills" | jq 'length') +fi + +project_found="false" +project_path="" +project_count=0 +project_skills="[]" + +if [[ -n "$CWD_SKILLS_DIR" && -d "$CWD_SKILLS_DIR" ]]; then + project_found="true" + project_path="$CWD_SKILLS_DIR" + project_skills=$(scan_dir_to_json "$CWD_SKILLS_DIR") + project_count=$(echo "$project_skills" | jq 'length') +fi + +# Merge global + project skills into one array +all_skills=$(jq -s 'add' <(echo "$global_skills") <(echo "$project_skills")) + +jq -n \ + --arg global_found "$global_found" \ + --argjson global_count "$global_count" \ + --arg project_found "$project_found" \ + --arg project_path "$project_path" \ + --argjson project_count "$project_count" \ + --argjson skills "$all_skills" \ + '{ + scan_summary: { + global: { found: ($global_found == "true"), count: $global_count }, + project: { found: ($project_found == "true"), path: $project_path, count: $project_count } + }, + skills: $skills + }' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23b77cf7..211896c3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,30 +6,39 @@ 
repos: rev: v6.0.0 hooks: - id: check-added-large-files - args: ["--maxkb=10240"] + args: [ "--maxkb=10240" ] - id: check-case-conflict - id: check-merge-conflict - id: check-illegal-windows-names - id: check-json + types: [ json ] - id: check-toml + types: [ toml ] - id: check-yaml - args: [--allow-multiple-documents] + args: [ --allow-multiple-documents ] + types: [ yaml ] - id: check-xml + types: [ xml ] - id: mixed-line-ending - args: [--fix=auto] + types: [ text ] + args: [ --fix=auto ] - id: check-vcs-permalinks - repo: https://github.com/FeryET/pre-commit-rust rev: v1.2.1 hooks: - id: fmt + types: [ rust ] - id: clippy + types: [ rust ] - id: cargo-check + types: [ rust ] - id: build + types: [ rust ] # 🧪 GitHub Actions validation - repo: https://github.com/rhysd/actionlint - rev: v1.7.7 + rev: v1.7.12 hooks: - id: actionlint files: ".github/workflows/" @@ -40,20 +49,15 @@ repos: rev: v0.11.0 hooks: - id: shellcheck - exclude: ^.github/workflows/release.yml$ - - # 📝 Commit message linting (commit-msg only, not blocking push) - - repo: https://github.com/commitizen-tools/commitizen - rev: v4.9.1 - hooks: - - id: commitizen - stages: [commit-msg] + types: [ shell ] + exclude: ^.github/workflows/release.yml$|^.claude # Temporarily disabled due to issues with cargo-machete while we are stubbing out the project - # - repo: https://github.com/bnjbvr/cargo-machete - # rev: v0.9.1 - # hooks: - # - id: cargo-machete + - repo: https://github.com/bnjbvr/cargo-machete + rev: v0.9.2 + hooks: + - id: cargo-machete + types: [ cargo, cargo-lock ] # 📝 Markdown formatting and linting (temporarily disabled in CI due to path issues) - repo: https://github.com/executablebooks/mdformat @@ -70,7 +74,7 @@ repos: - mdformat-gfm-alerts - mdformat-toc - mdformat-front-matters - types: [markdown] + types: [ markdown ] # 🔒 Security audit for Rust dependencies (moved to CI) - repo: local @@ -79,9 +83,11 @@ repos: name: cargo-audit entry: cargo audit language: system + types: [ cargo, 
cargo-lock ] pass_filenames: false always_run: true - repo: https://github.com/DevinR528/cargo-sort - rev: v2.0.2 + rev: v2.1.3 hooks: - id: cargo-sort + types: [ cargo, cargo-lock ] diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 00000000..c0c4dd18 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,15 @@ +{ + "recommendations": [ + "rust-lang.rust-analyzer", + "dustypomerleau.rust-syntax", + "redhat.vscode-yaml", + "mikestead.dotenv", + "editorconfig.editorconfig", + "tamasfe.even-better-toml", + "skellock.just", + "llvm-vs-code-extensions.lldb-dap", + "yzhang.markdown-all-in-one", + "hverlin.mise-vscode", + "vsls-contrib.codetour" + ] +} diff --git a/AGENTS.md b/AGENTS.md index a94967c0..3fc9132d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,16 +12,16 @@ DaemonEye is a **silent observability and hunt layer** for sensitive, restricted - **daemoneye-agent** — detection orchestrator, alert delivery, event bus - **daemoneye-cli** — operator query interface and rule management - **daemoneye-lib** — shared library (config, models, storage, detection, crypto) -- **collector-core** — SDK for building new collectors in any language (see [ADR-0003]) +- **collector-core** — SDK for building new collectors in any language -The Business and Enterprise tiers (separate codebases) add Security Center (SC), Proxy nodes (PX), GUI, federation, kernel telemetry, and compliance modules. The agent-side components in this repo are designed to participate in the full architecture — protobuf IPC contracts, capability negotiation, and store-and-forward patterns are built in from the start so the Community tier is not a stripped-down afterthought but the real foundation. +DaemonEye also ships in higher commercial tiers (sold separately, not in this repo). 
The Community tier is designed to participate in that larger architecture — protobuf IPC contracts, capability negotiation, and store-and-forward patterns are built in from the start so this repo is the real foundation, not a stripped-down afterthought. See evilbitlabs.io for commercial details. ### Core use-case: ShadowHunt The cornerstone scenario DaemonEye is built around: 1. **Passive baseline** — procmond collects lightweight process metadata, parent/child relationships, and connection tuples into the local store (redb) -2. **Heuristic trigger** — a detection rule (e.g., `Apache → bash spawn`) fires in the agent or Security Center +2. **Heuristic trigger** — a detection rule (e.g., `Apache → bash spawn`) fires in the agent 3. **Silent trace** — a TraceCommand targets the root PID and descendants for focused tracing without host-visible artifacts 4. **Focused capture** — fork/exec, cmdline snapshots, file metadata, socket events — all cryptographically signed 5. **Cross-host stitching** — if a traced process connects to a remote host running DaemonEye, the trace fans out via shared trace_id @@ -40,7 +40,7 @@ Every architectural decision in this repo — privilege separation, protobuf IPC | **Air-gap friendly** | Fully functional offline; signed bundles for rule and update distribution | | **Privacy defaults** | Command args masked by default; RBAC for trace initiation | -**Source of Truth**: Technical requirements in [.kiro/steering/](./kiro/steering/) and [.kiro/specs/](./kiro/specs/). Design origin: [ShadowHunt Concept] in Confluence (ES space). +**Source of Truth**: Technical requirements for the Community tier live in `spec/` and `.kiro/specs/daemoneye-core-monitoring/`. Higher-tier designs are maintained privately and are not part of this repo. --- @@ -92,11 +92,12 @@ Commit style: [.github/commit-instructions.md](.github/commit-instructions.md) 6. **Testing Required**: All code changes must include appropriate tests. 7. 
**Linter Restrictions**: Never remove clippy restrictions or `deny` attributes. 8. **File Size Limit**: Keep source files under 500-600 lines when possible. +9. **AI Disclosure**: Always disclose AI usage in PR descriptions, following the [AI Usage Policy](AI_POLICY.md). Be transparent, but brief — no need to list every prompt, just the tools used (e.g., "Used Claude Code (`Claude Opus 4.7 (1M Context)`) for initial draft of detection engine refactor. All code reviewed and tested."). ### Rule Precedence -1. Project Rules (.cursor/rules/, AGENTS.md, GEMINI.md) -2. Steering Documents (.kiro/steering/) +1. Project Rules (AGENTS.md) +2. Steering Documents (spec/, .kiro/steering/) 3. Technical Specifications (.kiro/specs/) 4. Embedded defaults @@ -104,7 +105,7 @@ Commit style: [.github/commit-instructions.md](.github/commit-instructions.md) ## Architecture Overview -DaemonEye implements **three-component security architecture** with strict privilege separation. This repo contains the agent-side components (solid lines below); Security Center and Proxy nodes are separate codebases in paid tiers (dashed lines). +DaemonEye implements a **three-component security architecture** with strict privilege separation. This repo contains the host-side components. Commercial tiers extend this foundation with fleet management and centralized aggregation; those components live in separate private codebases and are not described here.
### Components (this repo) @@ -116,7 +117,7 @@ DaemonEye implements **three-component security architecture** with strict privi | **daemoneye-lib** | N/A | N/A | N/A | Shared library | | **collector-core** | N/A | N/A | N/A | Collector SDK | -### Full deployment architecture +### Deployment architecture ```mermaid flowchart LR @@ -127,19 +128,10 @@ flowchart LR C[daemoneye-cli] -->|reads| DB1 & DB2 end A -->|outbound alerts| EXT[(Alert Sinks)] - A -.->|mTLS events up| PX[Proxy Node] - PX -.->|mTLS batched| SC[Security Center] - SC -.->|TraceCommand down| PX - PX -.->|TraceCommand down| A + A -.->|optional upstream| UP[External tiers] ``` -### Deployment Tiers - -| Tier | Components | Scope | -| ------------------------- | -------------------------------------------- | ---------------------------------------------------------------------------- | -| **Community** (this repo) | procmond + daemoneye-agent + daemoneye-cli | Standalone host monitoring, local detection, alert delivery | -| **Business** | + Security Center + Proxy nodes + GUI | Fleet management, proxy tree (AG→PX→SC), curated rule packs, SIEM connectors | -| **Enterprise** | + Kernel telemetry + Federation + Compliance | eBPF/ETW/ESF collectors, multi-site federation, STIX/TAXII, SSO/LDAP | +The dashed line to "External tiers" indicates that `daemoneye-agent`'s outbound IPC contract is designed to support upstream aggregation in commercial deployments. The upstream components are not part of this repo. 
### Security Boundaries @@ -236,19 +228,20 @@ flowchart LR - Least privilege: Components run with minimal permissions - Automatic privilege drop after initialization -- SQL injection prevention: AST validation at rule load time [Implemented]; SQL-based rule execution \[Planned — engine currently uses pattern matching, see `detection/mod.rs`\] +- SQL injection prevention: AST validation at rule load time [Implemented]; SQL-based rule execution \[Planned — engine currently uses pattern matching, see `detection/mod.rs`\]. Pipeline is two-phase: sqlparser lowers the custom dialect (spec §4.10) at rule-compile time into (a) protobuf collection tasks and (b) derived standard SQL; the runtime executor only sees the derived SQL, never the original dialect. - Credentials: Environment variables or OS keychain, never hardcoded - No inbound network: Outbound-only for alerts - Audit trail: BLAKE3 hash-chained audit ledger [Implemented]; Merkle tree inclusion proofs \[In Progress — `generate_inclusion_proof()` returns empty vec, see `crypto.rs`\] -### Enterprise Features +### Planned Hardening (Community) -- mTLS with certificate chain validation [Planned] - SLSA Level 3 provenance, Cosign signatures [Planned] - Merkle tree with inclusion proofs \[In Progress — chain hashing implemented; inclusion proof generation stubbed in `crypto.rs`\] - Sandboxed detection engine (read-only DB) [Planned] - Query whitelist (SELECT only with approved functions) [Implemented for rule validation; not yet enforced at execution time] +> Fleet-level transport security (mTLS between host agents and upstream aggregators) is handled in the commercial tiers, not in this repo. 
+ ### Integer Overflow Protection ```toml @@ -496,7 +489,7 @@ pub struct Cli { - GitHub Actions: Linux, macOS, Windows matrix - Rust: stable, beta, MSRV (1.91+) - Checks: fmt, clippy strict, tests, benchmarks -- Security: Dependency scanning, SLSA (Enterprise) +- Security: Dependency scanning, SLSA provenance ### Security Scanners @@ -558,6 +551,16 @@ Files: `daemoneye-lib/src/storage.rs`, `procmond/src/collector.rs`, `benches/` - [ ] ACID guarantees maintained - [ ] Document characteristics +### Open-Core Hygiene (any PR touching docs, specs, or AGENTS.md) + +DaemonEye is the Community tier of an open-core product. Content for commercial tiers lives in separate private codebases and Confluence — not here. + +- [ ] No paid-tier feature enumerations (Business/Enterprise specifics, Security Center internals, fleet-management details, STIX/TAXII, federation) +- [ ] No internal-only URLs (private Confluence hyperlinks, Jira ticket IDs, internal Slack references, pricing specifics) +- [ ] No references to deleted files (sweep for `pricing.md`, `feature-tiers.md`, `business-tier.md`, `enterprise-tier.md`, `product_strategy.md`, `spec/product.md`, `spec/procmond/`) +- [ ] Boundary-acknowledgement footnotes preferred over erasure ("commercial tiers extend this foundation, sold separately, not in this repo") +- [ ] **Never delete a repo doc without first verifying the Confluence copy fully matches** — see [docs/solutions/workflow-issues/open-core-hygiene-confluence-migration-2026-04-18.md](./docs/solutions/workflow-issues/open-core-hygiene-confluence-migration-2026-04-18.md) for the eight-phase workflow + --- ## Code Generation Guidelines @@ -579,15 +582,14 @@ When generating code: ## Source-of-Truth Map -| Section | Source | -| ------------------- | ---------------------------------------------------------------------------------------------------------------- | -| Architecture | [.kiro/steering/structure.md](./.kiro/steering/structure.md) | -| Technology | 
[.kiro/steering/tech.md](./.kiro/steering/tech.md) | -| Product | [.kiro/steering/product.md](./.kiro/steering/product.md) | -| Core Requirements | [.kiro/specs/DaemonEye-core-monitoring/requirements.md](./.kiro/specs/DaemonEye-core-monitoring/requirements.md) | -| Business Features | [.kiro/specs/business-tier-features/requirements.md](./.kiro/specs/business-tier-features/requirements.md) | -| Enterprise Features | [.kiro/specs/enterprise-tier-features/requirements.md](./.kiro/specs/enterprise-tier-features/requirements.md) | -| Development | [.kiro/steering/development.md](./.kiro/steering/development.md) | +| Section | Source | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------ | +| Architecture | [.kiro/steering/structure.md](./.kiro/steering/structure.md) | +| Technology | [.kiro/steering/tech.md](./.kiro/steering/tech.md) | +| Product | [.kiro/steering/product.md](./.kiro/steering/product.md) | +| Core Requirements | [.kiro/specs/daemoneye-core-monitoring/requirements.md](./.kiro/specs/daemoneye-core-monitoring/requirements.md) | +| Development | [.kiro/steering/development.md](./.kiro/steering/development.md) | +| SQL-to-IPC Pipeline | [spec/daemon_eye_spec_sql_to_ipc_detection_architecture.md](./spec/daemon_eye_spec_sql_to_ipc_detection_architecture.md) | ### Cross-References @@ -599,19 +601,16 @@ When generating code: ## Glossary -| Term | Definition | -| ------ | ------------------------------------------------------ | -| AST | Abstract Syntax Tree (SQL validation) | -| BLAKE3 | Cryptographic hash for audit trails | -| CEF | Common Event Format | -| eBPF | Extended Berkeley Packet Filter | -| ETW | Event Tracing for Windows | -| IPC | Inter-Process Communication | -| mTLS | Mutual Transport Layer Security | -| redb | Pure Rust embedded database | -| SLSA | Supply-chain Levels for Software Artifacts | -| STIX | Structured Threat Information eXpression | -| TAXII 
| Trusted Automated eXchange of Intelligence Information | +| Term | Definition | +| ------ | ------------------------------------------ | +| AST | Abstract Syntax Tree (SQL validation) | +| BLAKE3 | Cryptographic hash for audit trails | +| CEF | Common Event Format | +| eBPF | Extended Berkeley Packet Filter | +| ETW | Event Tracing for Windows | +| IPC | Inter-Process Communication | +| redb | Pure Rust embedded database | +| SLSA | Supply-chain Levels for Software Artifacts | --- @@ -620,6 +619,3 @@ When generating code: ## Agent Rules @.tessl/RULES.md follow the [instructions](.tessl/RULES.md) - -[adr-0003]: https://evilbitlabs.atlassian.net/wiki/spaces/ES/pages/5767187/ADR+0003+Polyglot+Collector+SDK+Strategy -[shadowhunt concept]: https://evilbitlabs.atlassian.net/wiki/spaces/ES/pages/1802386/ShadowHunt+Concept diff --git a/docs/solutions/workflow-issues/open-core-hygiene-confluence-migration-2026-04-18.md b/docs/solutions/workflow-issues/open-core-hygiene-confluence-migration-2026-04-18.md new file mode 100644 index 00000000..05df6dc5 --- /dev/null +++ b/docs/solutions/workflow-issues/open-core-hygiene-confluence-migration-2026-04-18.md @@ -0,0 +1,301 @@ +--- +title: Open-Core Hygiene Pass — Migrating Paid-Tier Content to Confluence Before Public Repo Scrub +date: 2026-04-18 +category: workflow-issues +module: repository-hygiene +problem_type: workflow_issue +component: documentation +severity: high +applies_when: + - Public OSS repo contains specs, docs, or planning artifacts for paid/commercial tiers that should not live in the open-source codebase + - An open-core product has split into Community/Business/Enterprise tiers after the repo was already public + - Internal planning artifacts (Jira IDs, private Confluence URLs, pricing, roadmap) have leaked into tracked files + - A private knowledge base (Confluence, Notion, etc.) 
exists or can be created to hold the authoritative version of removed content + - Removed content needs to be preserved verbatim for future reference, not merely deleted +tags: [open-core, documentation, confluence, migration, public-repo, hygiene, atlassian, pandoc] +related_components: + - documentation + - development_workflow + - tooling +--- + +# Open-Core Hygiene Pass — Migrating Paid-Tier Content to Confluence Before Public Repo Scrub + +## Context + +Open-core projects accumulate paid-tier pollution over time: feature specifications for commercial tiers, internal Confluence/Jira hyperlinks, pricing pages, product-strategy documents, and planning artifacts that were drafted in the public repo before the project split into separate codebases. Deleting it is only half the work — much of that content is still operationally valuable and needs to survive somewhere trusted (typically Confluence or an internal wiki) before it leaves the repo. + +The naive approaches both fail: + +- **Delete first, migrate later** risks silent data loss when "migrated" pages turn out to be empty stubs, renamed, or missing sections. +- **Erase all mention of paid tiers** is dishonest (the commercial product is advertised publicly anyway) and confuses readers who wonder why the repo architecture has suspicious dotted-line boxes. + +The workflow below emerged from scrubbing DaemonEye's public repo of Business/Enterprise tier content while preserving it in Confluence. It's structured as eight phases that can be run end-to-end or piecewise. + +## Guidance + +### Phase 1 — Inventory the pollution + +Use grep for known paid-tier vocabulary, internal hyperlinks, ticket IDs, and strategy keywords. Produce a severity-ranked report before touching anything. 
+ +```bash +# Literal paid-tier terms (adjust vocabulary to your project) +grep -r -n "Business Tier\|Enterprise Tier\|Security Center\|Proxy Node\|mTLS\|STIX\|TAXII\|federation" docs/ spec/ AGENTS.md + +# Internal wiki hyperlinks +grep -r -n "evilbitlabs.atlassian.net" . + +# Jira/ticket IDs +grep -r -n -E "END-[0-9]+|ENDI-[0-9]+" --include="*.md" . + +# Pricing / roadmap / strategy keywords +grep -r -n -i "pricing\|roadmap\|strateg" docs/src/ spec/ +``` + +Classify findings as HIGH (feature specs, pricing, roadmap), MEDIUM (agent/config file leaks), LOW (passing mentions, capability-boundary footnotes). + +### Phase 2 — Write down the policy before editing + +Decide and commit to explicit boundaries: + +- Authoritative home for architecture/design going forward (e.g., Confluence). +- What the public repo contains (e.g., "Community tier only — code, build, user-facing docs"). +- Boundary-acknowledgement language is OK. "Commercial tiers extend this foundation, sold separately, not in this repo" is preferable to erasure. +- Which frozen artifacts stay (e.g., Community-relevant specs) vs. get removed (phantom paths to non-existent directories). +- **Hard rule: never delete without verification.** No exceptions. + +### Phase 3 — Verify before deleting + +For every candidate file, search → fetch → size-compare against the target destination.
+ +```bash +TOKEN=$(cat ~/auth_token) + +# Search Confluence by title keyword +curl -s -u "$TOKEN" -H "Accept: application/json" \ + --data-urlencode "cql=space = ES AND title ~ \"pricing\"" \ + -G "https://your-instance.atlassian.net/wiki/rest/api/content/search?limit=5" | \ + jq -r '.results[] | "\(.title) (id=\(.id))"' + +# Fetch and size-compare +curl -s -u "$TOKEN" -H "Accept: application/json" \ + "https://your-instance.atlassian.net/wiki/api/v2/pages/$id?body-format=storage" | \ + jq -r '.body.storage.value' > /tmp/confluence_check/pricing.html +wc -c /tmp/confluence_check/pricing.html docs/src/pricing.md +``` + +Flag anything with a size ratio < 0.9 or heading mismatches for manual inspection. Expect at least one "migrated" page to be an empty stub — this phase exists to catch those. + +### Phase 4 — Fill migration gaps + +Two mechanisms depending on batch size: + +- **Single pages or small batches:** Use an MCP/API tool with `contentFormat: markdown` — faster to invoke, slower per call because response bodies include the full generated storage format. +- **Batch uploads (3+ pages):** Shell out to pandoc + curl against the Confluence v2 API. This is ~3× faster for larger batches because you bypass the token cost of the tool response body. + +```bash +#!/bin/bash +# upload_confluence.sh <file> <title> <parent_id> +set -e +FILE="$1"; TITLE="$2"; PARENT="$3" +CLOUD="<your-cloud-id>" +SPACE="<your-space-id>" +TOKEN=$(cat ~/auth_token) + +# Append migration footer so the Confluence page self-documents provenance +TMP=$(mktemp) +cat "$FILE" > "$TMP" +cat >> "$TMP" <<EOF + +--- + +**Source note:** Migrated from the public repo (\`$FILE\`) on $(date +%Y-%m-%d). The repo copy has been removed.
+EOF + +HTML=$(pandoc --from=gfm --to=html "$TMP") + +JSON=$(jq -n --arg space "$SPACE" --arg parent "$PARENT" --arg title "$TITLE" --arg body "$HTML" '{ + spaceId: $space, + parentId: $parent, + status: "current", + title: $title, + body: { representation: "storage", value: $body } +}') + +RESULT=$(curl -s -w "\nHTTP_%{http_code}" -u "$TOKEN" \ + -H "Accept: application/json" -H "Content-Type: application/json" \ + -X POST "https://your-instance.atlassian.net/wiki/api/v2/pages" \ + --data "$JSON") + +rm -f "$TMP" + +HTTP=$(echo "$RESULT" | tail -n1) +BODY=$(echo "$RESULT" | sed '$d') + +if [[ "$HTTP" == "HTTP_200" ]]; then + echo "OK: $TITLE -> id=$(echo "$BODY" | jq -r '.id')" +else + echo "FAIL: $TITLE ($HTTP)"; echo "$BODY" | head -c 300; exit 1 +fi +``` + +Required Atlassian scoped-token (ATAT) scopes for this flow: `read:page:confluence`, `read:space:confluence`, `write:page:confluence`, `search:confluence`, `read:hierarchical-content:confluence`. Missing scopes return 404, not 403 — don't waste time debugging routing. + +### Phase 5 — Verify fidelity with text-only comparison, not markdown match + +After upload, strip HTML and compare text-content lengths: + +```bash +repo_text=$(cat "$file" | wc -c) +confluence_text=$(curl -s -u "$TOKEN" -H "Accept: application/json" \ + "https://your-instance.atlassian.net/wiki/api/v2/pages/$id?body-format=storage" | \ + jq -r '.body.storage.value' | sed 's/<[^>]*>//g' | wc -c) +ratio=$(echo "scale=2; $confluence_text / $repo_text" | bc) +``` + +Ratios of 0.91–1.00 confirm good migration. Expected losses: `**bold**` renders as plain text, link URL text compacts, code-fence language tags drop. Do **not** grep for literal markdown headings — YAML comment lines starting with `#` inside code blocks are false positives, and `#### **Heading**` becomes `<h4>Heading</h4>`, losing the literal `**` to match on. 
+ +### Phase 6 — Delete and clean up references + +```bash +git rm docs/src/pricing.md docs/src/architecture/feature-tiers.md \ + docs/src/technical/business-tier.md docs/src/technical/enterprise-tier.md \ + spec/product_strategy.md spec/product.md +git rm -r spec/procmond + +# Update any TOC/index files manually (mdbook SUMMARY.md, etc.) + +# Final sweep for dangling references +grep -r -n "pricing\.md\|feature-tiers\.md\|business-tier\.md\|enterprise-tier\.md" docs/src/ AGENTS.md +``` + +Fix every dangling link. Also search `introduction.md`, `project-overview.md`, and similar "meta" docs for tier-enumerating license sections — replace with boundary statements. + +### Phase 7 — Residual scrub + +File-level deletion misses inline pollution: configuration examples that reference paid-tier features, "Already Planned" footnotes that point to now-deleted docs, Mermaid diagrams that aggregate to commercial components. Run a second grep pass after deletion and fix with surgical edits. Perl one-liners are useful for repeated footnote patterns: + +```bash +perl -i -pe ' + s/ \*\*Already Planned\*\*: [^.]+ (are |is )?specified in product\.md( and tech\.md)?( for Business\/Enterprise tiers| for Enterprise tier)?\.?//g; + s/ for Business\/Enterprise tiers//g; + s/ specified in product\.md//g; + s/Business Tier/commercial tiers/g; + s/Enterprise Tier/commercial tiers/g; +' docs/src/technical/security_design_overview.md +``` + +### Phase 8 — Commit in distinct, reviewable chunks + +One commit per change class. A single mega-commit mixing deletion and content rewrites is unreviewable. 
+ +```text +docs(spec): mark §9.1-§9.4 as superseded by ADR-0006 +docs: scrub paid-tier leaks and internal references from AGENTS.md +docs: remove paid-tier + internal planning docs from public repo +docs: scrub residual paid-tier content from mdbook user-facing docs +``` + +Commit bodies should enumerate what moved where ("Pricing content migrated to Confluence page id=1802381") so `git log` itself is the migration audit trail. + +## Why This Matters + +- **Verification-before-deletion prevents silent data loss.** In this session, one "migrated" Confluence page existed at the expected location but was 0 chars — an empty stub from a prior half-finished migration. Trusting its existence would have permanently destroyed the pricing history. The size-compare check caught it before `git rm` ran. +- **Commit structure keeps review tractable.** Scrubs touch dozens of files. Mixing AGENTS.md redaction with bulk deletions and inline rewrites in one commit produces a diff nobody can review. Four focused commits let a reviewer reason about each change class independently. +- **Boundary footnotes keep the repo honest.** Pretending the commercial tiers don't exist while their architecture diagrams clearly show integration points is disorienting for operators. Acknowledging "commercial tiers extend this foundation, sold separately, not in this repo" is honest, adds no leak surface, and matches what the marketing site already says. +- **Pandoc + curl beats tool invocations at batch sizes.** For an 11-file migration, invoking an MCP/API tool per file burned output tokens linearly with input because responses included the generated storage body. Shelling to pandoc + curl to the v2 API skips that cost entirely — ~3× wall-clock speedup for this case. +- **Grep-first, structured-second catches the long tail.** Literal vocabulary grep caught ~95% of pollution. The post-deletion dangling-reference sweep and a second-pass grep for residual inline content caught the rest. 
A structured-only approach (AST parsing the docs) would have been overkill and would still have missed content inside prose. + +## When to Apply + +Run this workflow when: + +- **Splitting an open-core project** whose public repo accumulated paid-tier content before the split. +- **Pre-release cleanup** before opening a project to external contributors or before a public launch. +- **License boundary changes** — moving from single-license to dual/tiered licensing, or vice versa. +- **Post-acquisition repo hygiene** where internal planning artifacts (tickets, wikis, strategy docs) leaked into a repo that's becoming public. +- **Periodic audit** of long-lived open-core repos (e.g., quarterly) — drift is gradual, so accumulated pollution tends to go unnoticed until it's significant. + +Do **not** apply this workflow for: + +- Internal-only repos where all readers have access to paid-tier content anyway (no leak surface). +- Repos where the "paid-tier content" is actually design discussion that belongs in the public repo under an OSS-compatible license. +- Single-file scrubs (just edit and commit — the eight-phase process is overhead for small changes). + +## Examples + +### Before/after: AGENTS.md Source-of-Truth Map + +Before: + +```markdown +| Business Features | [.kiro/specs/business-tier-features/requirements.md](./.kiro/specs/business-tier-features/requirements.md) | +| Enterprise Features | [.kiro/specs/enterprise-tier-features/requirements.md](./.kiro/specs/enterprise-tier-features/requirements.md) | +``` + +After (the directories never existed — phantom links): + +```markdown +<!-- Business/Enterprise spec links removed — those tiers live in separate repos --> +<!-- See evilbitlabs.io for commercial tier information --> +``` + +### Before/after: one deletion + migration cycle + +1. Inventory: `grep -r "pricing" docs/src/` finds `docs/src/pricing.md` (HIGH severity). + +2. 
Search Confluence: + + ```bash + curl -s -u "$TOKEN" --data-urlencode "cql=space = ES AND title ~ \"pricing\"" \ + -G "https://your-instance.atlassian.net/wiki/rest/api/content/search?limit=5" | jq '.results' + ``` + + Returns `Pricing (id=1802381)`. + +3. Size-check: fetched page body is 0 chars. Repo file is 8,412 chars. Ratio 0.00 — **stop, don't delete**. + +4. Migrate with pandoc+curl (`upload_confluence.sh docs/src/pricing.md "Pricing" 1802381`). Returns `OK: Pricing -> id=1802381`. + +5. Re-verify: fetched body is now 7,981 chars of stripped text. Ratio 0.95 — acceptable (losses are markdown emphasis and link URL compaction). + +6. `git rm docs/src/pricing.md`, update `docs/src/SUMMARY.md`, grep for dangling references to `pricing.md`, fix. + +7. Commit with body: `Pricing content migrated to Confluence page id=1802381 on 2026-04-17. Repo copy removed.` + +### Boundary footnote examples + +Instead of deleting all mention of commercial features in `introduction.md`: + +```markdown +DaemonEye is distributed as open-core. This repository contains the Community +tier — a self-contained host-monitoring agent. Commercial tiers (fleet +management, GUI, federation, compliance modules) extend this foundation and +are sold separately through evilbitlabs.io; they are not in this repo. +``` + +Instead of removing dotted-line boxes from architecture diagrams: + +```mermaid +flowchart LR + subgraph "Host (this repo)" + P[procmond] --> A[daemoneye-agent] + end + A -.->|outbound, commercial tiers only| SC[Security Center] +``` + +The dotted line plus the "commercial tiers only" label is more informative than either hiding the edge or spelling out Proxy Node + mTLS + federation details. 
+ +### Residual-pollution example + +File-level deletion of `docs/src/technical/enterprise-tier.md` didn't catch the 16 inline `**Already Planned**: X is specified in product.md for Business/Enterprise tiers.` footnotes scattered through `security_design_overview.md` that now pointed to a deleted `product.md`. A second grep pass (`grep -r -n "Already Planned" docs/src/`) surfaced them, and the perl one-liner in Phase 7 stripped them in one pass. Always do a post-deletion residual sweep — file boundaries don't align with content boundaries. + +## Related + +- This was the first workflow of its kind documented for the DaemonEye repo. Existing entries in `docs/solutions/` cover Rust code-level security/performance findings (`best-practices/rust-security-batch-cleanup-patterns-2026-04-04.md`, `security-issues/binary-hashing-authorization-and-toctou-fixes.md`) and are orthogonal to this documentation-governance workflow. +- Branch `minor-cleanup` on `EvilBit-Labs/DaemonEye` contains the four commits that executed this workflow end-to-end. See `git log` for the per-commit migration manifest. +- ADR-0006 and ADR-0007 (Confluence ES space) were authored during this session and illustrate the "Confluence authoritative" side of the policy. + +## Auto memory note + +One auto-memory entry (`feedback_agents_md_directives`) informed AGENTS.md editing during this workflow: `@` directive syntax (e.g., `@.tessl/RULES.md`) is intentional and should not be rewritten to prose during scrubs. Preserve existing `@` includes; only redact content inside the files they reference.
(auto memory [claude]) diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index b8b3442a..bba6b261 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -3,18 +3,14 @@ - [Introduction](./introduction.md) - [Getting Started](./getting-started.md) - [Project Overview](./project-overview.md) -- [Pricing](./pricing.md) - [Architecture](./architecture.md) - [System Architecture](./architecture/system-architecture.md) - [Collector-Core Framework](./architecture/collector-core-framework.md) - - [Feature Tiers](./architecture/feature-tiers.md) - [Technical Documentation](./technical.md) - [Core Monitoring](./technical/core-monitoring.md) - [EventBus Architecture](./technical/eventbus-architecture.md) - [RPC and EventBus Architecture](./technical/rpc-eventbus-architecture.md) - [IPC Implementation](./technical/ipc-implementation.md) - - [Business Tier Features](./technical/business-tier.md) - - [Enterprise Tier Features](./technical/enterprise-tier.md) - [macOS Process Collector](./technical/macos-process-collector.md) - [Windows Process Collector](./technical/windows-process-collector.md) - [Query Pipeline and SQL Dialect](./technical/query-pipeline.md) diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 006ef62d..cc839091 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -368,9 +368,8 @@ message DetectionResult { ### **External Communication** -- **Alert Delivery**: Outbound-only network connections -- **SIEM Integration**: HTTPS, mTLS, webhook protocols -- **Security Center**: mTLS with certificate authentication +- **Alert Delivery**: Outbound-only network connections to configured sinks +- **SIEM Integration**: HTTPS + webhook protocols for alert forwarding ## Error Handling Strategy diff --git a/docs/src/architecture/feature-tiers.md b/docs/src/architecture/feature-tiers.md deleted file mode 100644 index d48a1d91..00000000 --- a/docs/src/architecture/feature-tiers.md +++ /dev/null @@ -1,273 +0,0 @@ -# DaemonEye 
Feature Tiers - -DaemonEye is offered in three distinct tiers, each designed to meet different organizational needs and deployment scales. All tiers maintain the core security-first architecture while adding progressively more advanced capabilities. - -## Core Tier (Open Source) - -**License**: Apache 2.0 **Target**: Individual users, small teams, proof-of-concept deployments - -### Core Components - -- **procmond**: Privileged process collector with minimal attack surface -- **daemoneye-agent**: User-space detection orchestrator with SQL-based rules -- **daemoneye-cli**: Command-line interface for queries and management -- **daemoneye-lib**: Shared library with common functionality - -### Key Features - -- ✅ **Process Monitoring**: Cross-platform process enumeration and monitoring -- ✅ **SQL Detection Engine**: Flexible rule creation using standard SQL queries -- ✅ **Multi-Channel Alerting**: stdout, syslog, webhook, email, file output -- ✅ **Audit Logging**: Certificate Transparency-style Merkle tree with inclusion proofs -- ✅ **Offline Operation**: Full functionality without internet access -- ✅ **CLI Interface**: Comprehensive command-line management tools -- ✅ **Configuration Management**: Hierarchical configuration system -- ✅ **Cross-Platform Support**: Linux, macOS, Windows - -### Performance Characteristics - -- **CPU Usage**: \<5% sustained during continuous monitoring -- **Memory Usage**: \<100MB resident under normal operation -- **Process Enumeration**: \<5 seconds for 10,000+ processes -- **Database Operations**: >1,000 records/second write rate -- **Alert Latency**: \<100ms per detection rule execution - -### Use Cases - -- Individual security researchers and analysts -- Small development teams requiring process monitoring -- Proof-of-concept security deployments -- Educational and training environments -- Airgapped or offline environments - ---- - -## Business Tier (Commercial) - -**License**: Per-site license (TBD) **Target**: Small to medium 
teams, consultancies, managed security services - -### All Core Tier Features Plus - -#### **Security Center Server** - -- **Centralized Management**: Single point of control for multiple agents -- **Agent Registration**: Secure mTLS-based agent authentication -- **Data Aggregation**: Centralized collection of alerts and process data -- **Configuration Distribution**: Centralized rule management and deployment -- **Integration Hub**: Single point for external SIEM integrations - -#### **Web GUI Frontend** - -- **Fleet Dashboard**: Real-time view of all connected agents -- **Alert Management**: Filtering, sorting, and export of alerts -- **Rule Management**: Visual rule editor and deployment interface -- **System Health**: Agent connectivity and performance metrics -- **Data Visualization**: Charts and graphs for security analytics - -#### **Enhanced Output Connectors** - -- **Splunk HEC**: Native Splunk HTTP Event Collector integration -- **Elasticsearch**: Bulk indexing with index pattern management -- **Kafka**: High-throughput message streaming -- **CEF Format**: Common Event Format for SIEM compatibility -- **STIX 2.1**: Structured Threat Information eXpression export - -#### **Curated Rule Packs** - -- **Malware TTPs**: Common malware tactics, techniques, and procedures -- **MITRE ATT&CK**: Framework-based detection rules -- **Industry Standards**: CIS, NIST, and other compliance frameworks -- **Cryptographic Signatures**: Ed25519-signed rule packs for integrity -- **Auto-Update**: Automatic rule pack distribution and updates - -#### **Container & Kubernetes Support** - -- **Docker Images**: Pre-built container images for all components -- **Kubernetes Manifests**: DaemonSet and deployment configurations -- **Helm Charts**: Package management for Kubernetes deployments -- **Service Mesh**: Istio and Linkerd integration support - -#### **Deployment Patterns** - -- **Direct Agent-to-SIEM**: Agents send directly to configured SIEM systems -- **Centralized 
Proxy**: All agents route through Security Center -- **Hybrid Mode**: Agents send to both Security Center and direct SIEM (recommended) - -### Performance Characteristics - -- **Agents per Security Center**: 1,000+ agents -- **Alert Throughput**: 10,000+ alerts per minute -- **Data Retention**: Configurable retention policies -- **Query Performance**: Sub-second queries across agent fleet - -### Use Cases - -- Security consultancies managing multiple clients -- Managed Security Service Providers (MSSPs) -- Small to medium enterprises with distributed infrastructure -- Organizations requiring centralized security management -- Teams needing enhanced SIEM integration - ---- - -## Enterprise Tier (Commercial) - -**License**: Custom pricing **Target**: Large enterprises, government agencies, critical infrastructure - -### All Business Tier Features Plus - -#### **Kernel-Level Monitoring** - -- **Linux eBPF**: Real-time syscall monitoring and process tracking -- **Windows ETW**: Event Tracing for Windows integration -- **macOS EndpointSecurity**: Native security framework integration -- **Container Awareness**: Kubernetes and Docker container monitoring -- **Network Correlation**: Process-to-network activity correlation - -#### **Federated Security Centers** - -- **Hierarchical Architecture**: Regional and Primary Security Centers -- **Distributed Queries**: Cross-center query execution and aggregation -- **Data Replication**: Automatic data synchronization between centers -- **Failover Support**: Automatic failover and load balancing -- **Geographic Distribution**: Multi-region deployment support - -#### **Advanced Threat Intelligence** - -- **STIX/TAXII Integration**: Automated threat intelligence ingestion -- **Indicator Conversion**: STIX indicators to detection rules -- **Threat Feed Management**: Multiple threat intelligence sources -- **IOC Matching**: Indicator of Compromise correlation -- **Threat Hunting**: Advanced query capabilities for threat hunting - 
-#### **Enterprise Analytics** - -- **Distributed Analytics**: Cross-fleet security analytics -- **Machine Learning**: Anomaly detection and behavioral analysis -- **Risk Scoring**: Dynamic risk assessment and prioritization -- **Compliance Reporting**: Automated compliance and audit reporting -- **Custom Dashboards**: Configurable security dashboards - -#### **Advanced Security Features** - -- **Zero Trust Architecture**: Comprehensive zero trust implementation -- **Identity Integration**: Active Directory and LDAP integration -- **Role-Based Access Control**: Granular permission management -- **Audit Trail**: Comprehensive audit logging and compliance -- **Data Encryption**: End-to-end encryption for all data flows - -#### **High Availability & Scalability** - -- **Clustering**: Multi-node Security Center clusters -- **Load Balancing**: Automatic load distribution -- **Disaster Recovery**: Backup and recovery procedures -- **Horizontal Scaling**: Scale-out architecture support -- **Performance Optimization**: Advanced caching and optimization - -### Performance Characteristics - -- **Agents per Federation**: 10,000+ agents -- **Regional Centers**: 100+ regional centers per federation -- **Query Latency**: \<100ms for distributed queries -- **Data Volume**: Petabyte-scale data processing -- **Uptime**: 99.99% availability SLA - -### Use Cases - -- Large enterprises with global infrastructure -- Government agencies and critical infrastructure -- Financial services and healthcare organizations -- Organizations requiring compliance (SOX, HIPAA, PCI-DSS) -- Multi-tenant service providers - ---- - -## Feature Comparison Matrix - -| Feature | Core | Business | Enterprise | -| -------------------------- | ---- | -------- | ---------- | -| **Process Monitoring** | ✅ | ✅ | ✅ | -| **SQL Detection Engine** | ✅ | ✅ | ✅ | -| **Multi-Channel Alerting** | ✅ | ✅ | ✅ | -| **Audit Logging** | ✅ | ✅ | ✅ | -| **Offline Operation** | ✅ | ✅ | ✅ | -| **CLI Interface** | ✅ | ✅ | ✅ | -| 
**Security Center** | ❌ | ✅ | ✅ | -| **Web GUI** | ❌ | ✅ | ✅ | -| **Enhanced Connectors** | ❌ | ✅ | ✅ | -| **Curated Rule Packs** | ❌ | ✅ | ✅ | -| **Container Support** | ❌ | ✅ | ✅ | -| **Kernel Monitoring** | ❌ | ❌ | ✅ | -| **Federation** | ❌ | ❌ | ✅ | -| **STIX/TAXII** | ❌ | ❌ | ✅ | -| **Advanced Analytics** | ❌ | ❌ | ✅ | -| **Zero Trust** | ❌ | ❌ | ✅ | -| **High Availability** | ❌ | ❌ | ✅ | - -## Licensing Architecture - -### **Dual-License Strategy** - -The DaemonEye project maintains a dual-license approach to balance open source accessibility with commercial sustainability: - -- **Core Components**: Apache 2.0 licensed (procmond, daemoneye-agent, daemoneye-cli, daemoneye-lib) -- **Business Tier Features**: Per-site license, TBD (Security Center, GUI, enhanced connectors, curated rules) -- **Enterprise Tier Features**: Custom pricing (kernel monitoring, federation, STIX/TAXII integration) - -### **Feature Gating Implementation** - -```rust,ignore -// Compile-time feature gates -#[cfg(feature = "business-tier")] -pub mod security_center; - -#[cfg(feature = "business-tier")] -pub mod enhanced_connectors; - -#[cfg(feature = "enterprise-tier")] -pub mod kernel_monitoring; - -#[cfg(feature = "enterprise-tier")] -pub mod federation; -``` - -### **Runtime License Validation** - -- **Cryptographic Signatures**: Ed25519 signatures for license validation -- **Site Restrictions**: Hostname/domain matching for license compliance -- **Feature Activation**: Runtime feature activation based on license -- **Graceful Degradation**: Fallback to lower tier when license is invalid - -### **License Distribution** - -- **Core Tier**: GitHub releases with Apache 2.0 license -- **Business Tier**: Separate distribution channel with license keys -- **Enterprise Tier**: Enterprise distribution with support and SLA -- **Hybrid Builds**: Single binary with runtime feature activation - -## Migration Path - -### **Core → Business** - -- Install Security Center server -- Configure agent 
uplink connections -- Deploy curated rule packs -- Set up enhanced connectors - -### **Business → Enterprise** - -- Enable kernel-level monitoring -- Deploy federated Security Centers -- Integrate STIX/TAXII feeds -- Configure advanced analytics - -### **Backward Compatibility** - -- All tiers maintain API compatibility -- Configuration migration tools provided -- Data export/import capabilities -- Gradual feature activation - ---- - -*Choose the tier that best fits your organization's needs, with the flexibility to upgrade as requirements grow and evolve.* diff --git a/docs/src/deployment.md b/docs/src/deployment.md index 143fc105..06739068 100644 --- a/docs/src/deployment.md +++ b/docs/src/deployment.md @@ -86,38 +86,7 @@ graph TB #### Multi-Node Deployment -For large environments with multiple monitoring targets: - -```mermaid -graph TB - subgraph "Node 1" - A1[<b>ProcMonD</b>] - B1[<b>daemoneye-agent</b>] - A1 <--> B1 - end - - subgraph "Node 2" - A2[<b>ProcMonD</b>] - B2[<b>daemoneye-agent</b>] - A2 <--> B2 - end - - subgraph "Node 3" - A3[<b>ProcMonD</b>] - B3[<b>daemoneye-agent</b>] - A3 <--> B3 - end - - subgraph "Central Management" - C[<b>Security Center</b>] - D[<b>Database</b>] - C <--> D - end - - B1 --> C - B2 --> C - B3 --> C -``` +Each host runs the same single-node topology independently. Fleet-wide aggregation across nodes is handled by commercial tiers and is outside the scope of this repo. 
#### Container Deployment diff --git a/docs/src/deployment/docker.md b/docs/src/deployment/docker.md index 6312b3ac..340b72d4 100644 --- a/docs/src/deployment/docker.md +++ b/docs/src/deployment/docker.md @@ -27,7 +27,6 @@ DaemonEye uses a multi-container architecture: - **procmond**: Privileged process monitoring daemon - **daemoneye-agent**: User-space orchestrator and alerting - **daemoneye-cli**: Command-line interface and management -- **Security Center**: Web-based management interface (Business/Enterprise tiers) ## Container Images @@ -51,10 +50,6 @@ docker pull daemoneye/daemoneye-agent:1.0.0 # CLI interface docker pull daemoneye/daemoneye-cli:latest docker pull daemoneye/daemoneye-cli:1.0.0 - -# Security Center (Business/Enterprise) -docker pull daemoneye/security-center:latest -docker pull daemoneye/security-center:1.0.0 ``` **Planned Image Tags**: diff --git a/docs/src/deployment/installation.md b/docs/src/deployment/installation.md index 34815bd4..016d8d07 100644 --- a/docs/src/deployment/installation.md +++ b/docs/src/deployment/installation.md @@ -48,12 +48,6 @@ This guide provides comprehensive installation instructions for DaemonEye across - Disk: 10GB+ free space - Network: Stable internet connection -**Enhanced Features** (Enterprise Tier): - -- Linux: Kernel 4.7+ for eBPF support -- Windows: Windows 7+ for ETW support -- macOS: 10.15+ for EndpointSecurity support - ## Installation Methods ### Method 1: Pre-built Binaries (Recommended) diff --git a/docs/src/deployment/kubernetes.md b/docs/src/deployment/kubernetes.md index 03163c50..81b73e78 100644 --- a/docs/src/deployment/kubernetes.md +++ b/docs/src/deployment/kubernetes.md @@ -25,7 +25,6 @@ DaemonEye is designed to run efficiently on Kubernetes, providing: - **procmond**: DaemonSet for process monitoring on each node - **daemoneye-agent**: Deployment for alerting and orchestration - **daemoneye-cli**: Job/CronJob for management tasks -- **Security Center**: Deployment for web-based management 
(Business/Enterprise) ## Prerequisites diff --git a/docs/src/introduction.md b/docs/src/introduction.md index a3f84bfd..d748c0e7 100644 --- a/docs/src/introduction.md +++ b/docs/src/introduction.md @@ -65,11 +65,7 @@ If you need help with DaemonEye: ## License -DaemonEye follows a dual-license strategy: - -- **Core Components**: Apache 2.0 licensed (procmond, daemoneye-agent, daemoneye-cli, daemoneye-lib) -- **Business Tier Features**: Per-site license, TBD (Security Center, GUI, enhanced connectors, curated rules) -- **Enterprise Tier Features**: Custom pricing (kernel monitoring, federation, STIX/TAXII integration) +The DaemonEye components in this repository — procmond, daemoneye-agent, daemoneye-cli, daemoneye-lib — are licensed under Apache 2.0. Commercial extensions ship separately; see evilbitlabs.io for details. --- diff --git a/docs/src/pricing.md b/docs/src/pricing.md deleted file mode 100644 index 5e2c2e3a..00000000 --- a/docs/src/pricing.md +++ /dev/null @@ -1,129 +0,0 @@ -# DaemonEye Pricing - -> Part of the DaemonEye suite of tools: _Continuous monitoring. Immediate alerts._ -> -> **"Auditd without the noise. Osquery without the bloat."** - -## DaemonEye Architecture Note - -ProcMonD is the privileged process monitoring component of the DaemonEye package. DaemonEye consists of three components: - -- **ProcMonD (Collector):** Runs with high privileges, focused solely on process monitoring, with a minimal attack surface and no direct network functionality. -- **Orchestrator:** Operates in user space with very few privileges, receives events from ProcMonD, handles all network communication and alert delivery to log sinks, and communicates with ProcMonD only via secure, memory-only IPC (e.g., Unix sockets). -- **CLI:** Local command-line interface that interacts with the orchestrator for querying data, exporting results, and tuning service configuration. 
This separation ensures robust security: ProcMonD remains isolated and hardened, while orchestration/network tasks are delegated to a low-privilege process, and all user interaction is handled via the CLI through the orchestrator. - ---- - -## Free / Homelab - -**Always Free** For hackers, homelabbers, and operators who want clean visibility without SaaS strings. - -- Full daemon (Rust core) -- SQL rule engine (DIY + community rules) -- Syslog, email, webhook alerts -- Tamper-evident logging -- Cross-platform (Linux, macOS, Windows) -- GitHub Sponsors tip jar if you dig it - -_For the lab. For your side projects. For free, forever._ - ---- - -## Business - -**Per-site license (TBD)** For small teams and consultancies who need more polish and integrations. One-time fee, no subscription. - -- Everything in Free -- Curated **rule packs** (malware TTPs, suspicious parent/child, process hollowing) -- Output connectors: Splunk HEC, Elastic, Kafka -- Container / K8s DaemonSet deployment -- Export to CEF, JSON, or STIX-lite -- Optional GUI frontend (per-seat add-on, TBD) -- Signed installers (MSI/DMG, ready to deploy) - -_Professional-grade monitoring you can actually run offline._ - ---- - -## Enterprise - -**Org License — Let's Talk** For SOCs, IR teams, and industrial/government environments where process visibility is non-negotiable. (Custom pricing, one-time license. Optional paid update packs.) - -- Everything in Business -- **eBPF integration** for kernel-level visibility -- **Central collector** for fleet monitoring -- Advanced SIEM integration (full STIX/TAXII, compliance mappings) -- Hardened builds with **SLSA provenance & Cosign signatures** -- Optional commercial license for enterprises who can't ship Apache 2.0 -- Quarterly **Enterprise Rule Packs** with threat intel updates - -_When compliance meets detection. Built for enclaves, critical infrastructure, and SOCs that need serious visibility._ - ---- - -### Notes - -- No subscriptions. No license servers. 
No hidden telemetry. -- Free tier is fully functional — paid tiers add polish and scale. -- Pricing is a starting point — EvilBit Labs is not a sales shop, we keep it simple. - ---- - -## Feature Comparison - -| Feature | Free/Homelab | Business | Enterprise | -| ----------------------- | ------------ | -------- | --------------- | -| **Core Monitoring** | ✅ | ✅ | ✅ | -| **SQL Rule Engine** | ✅ | ✅ | ✅ | -| **Basic Alerts** | ✅ | ✅ | ✅ | -| **Cross-Platform** | ✅ | ✅ | ✅ | -| **Curated Rule Packs** | ❌ | ✅ | ✅ | -| **SIEM Connectors** | ❌ | ✅ | ✅ | -| **Container Support** | ❌ | ✅ | ✅ | -| **Export Formats** | Basic | CEF/STIX | Full STIX/TAXII | -| **GUI Frontend** | ❌ | Optional | ✅ | -| **Kernel Monitoring** | ❌ | ❌ | ✅ | -| **Fleet Management** | ❌ | ❌ | ✅ | -| **Compliance Mappings** | ❌ | ❌ | ✅ | -| **Enterprise Support** | ❌ | ❌ | ✅ | - -## Getting Started - -### Free Tier - -1. Download from GitHub releases -2. Follow the [Installation Guide](./deployment/installation.md) -3. Start monitoring immediately -4. No registration required - -### Business Tier - -1. Contact EvilBit Labs for license key -2. Download Business tier build -3. Apply license key during installation -4. Access to curated rule packs and connectors - -### Enterprise Tier - -1. Contact EvilBit Labs for custom pricing -2. Discuss requirements and deployment scale -3. Receive tailored solution and support -4. Full enterprise features and support - -## Support - -- **Free Tier**: Community support via GitHub Issues -- **Business Tier**: Email support with 48-hour response -- **Enterprise Tier**: Dedicated support with SLA - -## Contact - -For Business and Enterprise licensing: - -- **Email**: [support@evilbitlabs.com](mailto:support@evilbitlabs.com) -- **GitHub**: [EvilBit-Labs/daemoneye](https://github.com/EvilBit-Labs/daemoneye) -- **Website**: [evilbitlabs.io/daemoneye](https://evilbitlabs.io/daemoneye) - ---- - -_Pricing is subject to change. 
Contact [EvilBit Labs](support@evilbitlabs.com) for the most current pricing information._ diff --git a/docs/src/project-overview.md b/docs/src/project-overview.md index ba59c3c1..49df3e7a 100644 --- a/docs/src/project-overview.md +++ b/docs/src/project-overview.md @@ -145,9 +145,8 @@ DaemonEye implements a **three-component security architecture** with strict pri #### Multi-Channel Alerting - **Local Outputs**: stdout, syslog, file output -- **Network Outputs**: webhooks, email, Kafka -- **SIEM Integration**: Splunk HEC, Elasticsearch, CEF format -- **Enterprise Integration**: STIX/TAXII feeds, federated Security Centers +- **Network Outputs**: webhooks, email +- **SIEM Integration**: CEF format for alert forwarding #### Certificate Transparency Audit Logging @@ -218,9 +217,8 @@ DaemonEye implements a **three-component security architecture** with strict pri ### **Scalability** - **Single Agent**: Monitor 10,000+ processes with minimal overhead -- **Fleet Management**: Support for 1,000+ agents per Security Center -- **Regional Centers**: Aggregate data from multiple regional deployments -- **Enterprise Federation**: Hierarchical data aggregation and query distribution + +Fleet-level aggregation and federation are commercial-tier concerns, handled outside this repo. 
## Security Principles @@ -247,18 +245,7 @@ DaemonEye implements a **three-component security architecture** with strict pri ## License Model -### **Dual-License Strategy** - -- **Core Components**: Apache 2.0 licensed (procmond, daemoneye-agent, daemoneye-cli, daemoneye-lib) -- **Business Tier Features**: Per-site license, TBD (Security Center, GUI, enhanced connectors, curated rules) -- **Enterprise Tier Features**: Custom pricing (kernel monitoring, federation, STIX/TAXII integration) - -### **Feature Gating** - -- Compile-time feature gates for tier-specific functionality -- Runtime license validation with cryptographic signatures -- Graceful degradation when license is invalid or expired -- Site restriction validation for license compliance +The DaemonEye components in this repository are licensed under Apache 2.0. Commercial extensions (with their own licensing) ship separately; see evilbitlabs.io for details. ## Getting Started diff --git a/docs/src/technical.md b/docs/src/technical.md index 5c08553f..7a7ad3e5 100644 --- a/docs/src/technical.md +++ b/docs/src/technical.md @@ -16,18 +16,6 @@ The core monitoring system provides real-time process monitoring and threat dete [Read Core Monitoring Documentation →](./technical/core-monitoring.md) -## Business Tier Features - -Business tier features extend the core monitoring with additional capabilities including the Security Center, enhanced integrations, and curated rule packs. - -[Read Business Tier Documentation →](./technical/business-tier.md) - -## Enterprise Tier Features - -Enterprise tier features provide advanced monitoring capabilities including kernel monitoring, network event monitoring, and federated security center architecture. - -[Read Enterprise Tier Documentation →](./technical/enterprise-tier.md) - ## Platform-Specific Process Collectors DaemonEye provides platform-specific process collectors that leverage native operating system capabilities for enhanced monitoring and security analysis. 
diff --git a/docs/src/technical/business-tier.md b/docs/src/technical/business-tier.md deleted file mode 100644 index 11fdd193..00000000 --- a/docs/src/technical/business-tier.md +++ /dev/null @@ -1,883 +0,0 @@ -# Business Tier Features Technical Specification - -## Overview - -The Business Tier Features extend the core DaemonEye architecture with professional-grade capabilities targeting small teams and consultancies. The design maintains the security-first, offline-capable philosophy while adding enterprise integrations, curated content, and centralized management capabilities. - -The key architectural addition is the **DaemonEye Security Center**, a new component that provides centralized aggregation, management, and visualization capabilities while preserving the autonomous operation of individual agents. - ---- - -## Table of Contents - -[TOC] - ---- - -## Security Center Architecture - -### Component Overview - -```mermaid -graph TB - subgraph "Business Tier Architecture" - SC[Security Center Server] - GUI[Web GUI Frontend] - - subgraph "Agent Node 1" - PM1[procmond] - SA1[daemoneye-agent] - CLI1[daemoneye-cli] - end - - subgraph "Agent Node 2" - PM2[procmond] - SA2[daemoneye-agent] - CLI2[daemoneye-cli] - end - - subgraph "External Integrations" - SPLUNK[Splunk HEC] - ELASTIC[Elasticsearch] - KAFKA[Kafka] - end - end - - SA1 -.->|mTLS| SC - SA2 -.->|mTLS| SC - GUI -->|HTTPS| SC - SC -->|HTTP/HTTPS| SPLUNK - SC -->|HTTP/HTTPS| ELASTIC - SC -->|TCP| KAFKA - - PM1 --> SA1 - PM2 --> SA2 -``` - -### Security Center Server - -**Technology Stack**: - -- **Framework**: Axum web framework with tokio async runtime -- **Database**: PostgreSQL with connection pooling for scalable data storage -- **Authentication**: Mutual TLS (mTLS) for agent connections, JWT for web GUI -- **Configuration**: Same hierarchical config system as core DaemonEye -- **Observability**: OpenTelemetry tracing with Prometheus metrics export - -**Core Modules**: - -```rust,ignore -pub mod 
security_center { - pub mod agent_registry; // Agent authentication and management - pub mod data_aggregator; // Central data collection and storage - pub mod database; // PostgreSQL connection pool and migrations - pub mod health; - pub mod integration_hub; // External system connectors - pub mod observability; // OpenTelemetry tracing and Prometheus metrics - pub mod rule_distributor; // Rule pack management and distribution - pub mod web_api; // REST API for GUI frontend // Health checks and system monitoring -} -``` - -### Database Layer - -**Connection Pool**: sqlx::PgPool with configurable min/max connections - -```rust,ignore -pub struct SecurityCenterDatabase { - pool: sqlx::PgPool, - metrics: DatabaseMetrics, -} - -impl SecurityCenterDatabase { - pub async fn new(config: DatabaseConfig) -> Result<Self> { - let pool = sqlx::PgPool::builder() - .max_connections(config.max_connections) - .min_connections(config.min_connections) - .acquire_timeout(Duration::from_secs(config.connection_timeout)) - .idle_timeout(Duration::from_secs(config.idle_timeout)) - .max_lifetime(Duration::from_secs(config.max_lifetime)) - .build(&config.url) - .await?; - - // Run migrations - sqlx::migrate!("./migrations").run(&pool).await?; - - Ok(Self { - pool, - metrics: DatabaseMetrics::new(), - }) - } -} -``` - -**Database Schema**: - -```sql --- Agent registration and management -CREATE TABLE agents ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - hostname VARCHAR(255) NOT NULL, - ip_address INET, - certificate_fingerprint VARCHAR(128) NOT NULL UNIQUE, - first_seen TIMESTAMPTZ NOT NULL DEFAULT NOW(), - last_seen TIMESTAMPTZ NOT NULL DEFAULT NOW(), - version VARCHAR(50) NOT NULL, - status VARCHAR(20) NOT NULL CHECK (status IN ('active', 'inactive', 'error')), - metadata JSONB, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - --- Aggregated alerts from all agents -CREATE TABLE aggregated_alerts ( - id UUID PRIMARY KEY DEFAULT 
gen_random_uuid(), - agent_id UUID NOT NULL REFERENCES agents(id) ON DELETE CASCADE, - rule_id VARCHAR(100) NOT NULL, - rule_name VARCHAR(255) NOT NULL, - severity VARCHAR(20) NOT NULL CHECK (severity IN ('low', 'medium', 'high', 'critical')), - timestamp TIMESTAMPTZ NOT NULL, - hostname VARCHAR(255) NOT NULL, - process_data JSONB NOT NULL, - metadata JSONB, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - --- Rule pack management -CREATE TABLE rule_packs ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - name VARCHAR(255) NOT NULL, - version VARCHAR(50) NOT NULL, - description TEXT, - author VARCHAR(255), - signature TEXT NOT NULL, - content BYTEA NOT NULL, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - deployed_at TIMESTAMPTZ, - UNIQUE(name, version) -); -``` - -## Agent Registration and Authentication - -### Certificate-Based Authentication - -**Agent Registration Flow**: - -1. Agent generates client certificate during first startup -2. Agent connects to Security Center with certificate -3. Security Center validates certificate and registers agent -4. 
Ongoing communication uses established mTLS session - -```rust,ignore -pub struct AgentRegistry { - db: SecurityCenterDatabase, - ca_cert: X509Certificate, - agent_certs: Arc<Mutex<HashMap<String, X509Certificate>>>, -} - -impl AgentRegistry { - pub async fn register_agent(&self, cert: &X509Certificate) -> Result<AgentInfo> { - let fingerprint = cert.fingerprint()?; - - // Validate certificate against CA - self.validate_certificate(cert)?; - - // Extract agent information from certificate - let hostname = self.extract_hostname(cert)?; - let version = self.extract_version(cert)?; - - // Register agent in database - let agent = AgentInfo { - id: Uuid::new_v4(), - hostname, - certificate_fingerprint: fingerprint, - version, - status: AgentStatus::Active, - first_seen: Utc::now(), - last_seen: Utc::now(), - }; - - self.db.insert_agent(&agent).await?; - self.agent_certs - .lock() - .await - .insert(fingerprint, cert.clone()); - - Ok(agent) - } - - pub async fn authenticate_agent(&self, fingerprint: &str) -> Result<AgentInfo> { - // Check if agent is registered and active - let agent = self.db.get_agent_by_fingerprint(fingerprint).await?; - - if agent.status != AgentStatus::Active { - return Err(AuthenticationError::AgentInactive); - } - - // Update last seen timestamp - self.db.update_agent_last_seen(agent.id, Utc::now()).await?; - - Ok(agent) - } -} -``` - -### Enhanced Agent Capabilities - -**Uplink Communication**: Secure connection to Security Center with fallback to standalone operation. - -```rust,ignore -pub struct EnhancedDaemoneyeAgent { - base_agent: DaemoneyeAgent, - security_center_client: Option<SecurityCenterClient>, - uplink_config: UplinkConfig, -} - -impl EnhancedDaemoneyeAgent { - pub async fn new(config: AgentConfig) -> Result<Self> { - let base_agent = DaemoneyeAgent::new(config.clone()).await?; - - let security_center_client = if config.uplink.enabled { - Some(SecurityCenterClient::new(&config.uplink).await?) 
- } else { - None - }; - - Ok(Self { - base_agent, - security_center_client, - uplink_config: config.uplink, - }) - } - - pub async fn start(&mut self) -> Result<()> { - // Start base agent - self.base_agent.start().await?; - - // Connect to Security Center if configured - if let Some(client) = &mut self.security_center_client { - client.connect().await?; - self.start_uplink_communication().await?; - } - - Ok(()) - } - - async fn start_uplink_communication(&self) -> Result<()> { - let client = self.security_center_client.as_ref().unwrap(); - - // Start periodic heartbeat - let heartbeat_interval = Duration::from_secs(30); - let mut interval = tokio::time::interval(heartbeat_interval); - - tokio::spawn(async move { - loop { - interval.tick().await; - if let Err(e) = client.send_heartbeat().await { - tracing::warn!("Heartbeat failed: {}", e); - } - } - }); - - // Start alert forwarding - self.start_alert_forwarding().await?; - - Ok(()) - } -} -``` - -## Curated Rule Packs - -### Rule Pack Structure - -**YAML-based Rule Packs** with cryptographic signatures: - -```yaml -# rule-pack-malware-ttps.yaml -metadata: - name: Malware TTPs - version: 1.2.0 - description: Common malware tactics, techniques, and procedures - author: DaemonEye Security Team - signature: ed25519:base64-signature - -rules: - - id: process-hollowing-detection - name: Process Hollowing Detection - description: Detects potential process hollowing attacks - sql: | - SELECT * FROM process_snapshots - WHERE executable_path != mapped_image_path - AND parent_pid IN (SELECT pid FROM process_snapshots WHERE name = 'explorer.exe') - severity: high - tags: [process-hollowing, malware, defense-evasion] -``` - -### Rule Pack Validation - -**Cryptographic Signatures**: Ed25519 signatures for rule pack integrity. 
- -```rust,ignore -pub struct RulePackValidator { - public_key: ed25519_dalek::PublicKey, -} - -impl RulePackValidator { - pub fn validate_rule_pack(&self, pack: &RulePack) -> Result<ValidationResult> { - // Verify cryptographic signature - let signature = ed25519_dalek::Signature::from_bytes(&pack.signature)?; - let message = serde_json::to_vec(&pack.metadata)?; - - self.public_key.verify_strict(&message, &signature)?; - - // Validate SQL syntax for all rules - for rule in &pack.rules { - self.validate_rule_sql(&rule.sql)?; - } - - // Check for rule ID conflicts - self.check_rule_conflicts(&pack.rules)?; - - Ok(ValidationResult::Valid) - } - - fn validate_rule_sql(&self, sql: &str) -> Result<()> { - let validator = SqlValidator::new(); - validator.validate_query(sql)?; - Ok(()) - } -} -``` - -### Rule Distribution - -**Automatic Distribution**: Agents automatically download and apply rule packs. - -```rust,ignore -pub struct RuleDistributor { - db: SecurityCenterDatabase, - rule_pack_storage: RulePackStorage, - distribution_scheduler: DistributionScheduler, -} - -impl RuleDistributor { - pub async fn deploy_rule_pack(&self, pack: RulePack) -> Result<DeploymentResult> { - // Validate rule pack - let validator = RulePackValidator::new(); - validator.validate_rule_pack(&pack)?; - - // Store rule pack - let pack_id = self.rule_pack_storage.store(&pack).await?; - - // Schedule distribution to agents - self.distribution_scheduler - .schedule_distribution(pack_id) - .await?; - - Ok(DeploymentResult::Success) - } - - pub async fn distribute_to_agent(&self, agent_id: Uuid, pack_id: Uuid) -> Result<()> { - let pack = self.rule_pack_storage.get(pack_id).await?; - let agent = self.db.get_agent(agent_id).await?; - - // Send rule pack to agent - let client = SecurityCenterClient::for_agent(&agent)?; - client.send_rule_pack(&pack).await?; - - // Update agent rule assignments - self.db.assign_rule_pack(agent_id, pack_id).await?; - - Ok(()) - } -} -``` - -## Enhanced Output 
Connectors - -### Splunk HEC Integration - -**Splunk HTTP Event Collector** integration with authentication and batching. - -```rust,ignore -pub struct SplunkHecConnector { - endpoint: Url, - token: SecretString, - index: Option<String>, - source_type: String, - client: reqwest::Client, - batch_size: usize, - batch_timeout: Duration, -} - -impl SplunkHecConnector { - pub async fn send_event(&self, event: &ProcessAlert) -> Result<(), ConnectorError> { - let hec_event = HecEvent { - time: event.timestamp.timestamp(), - host: event.hostname.clone(), - source: "daemoneye", - sourcetype: &self.source_type, - index: self.index.as_deref(), - event: serde_json::to_value(event)?, - }; - - let response = self - .client - .post(&self.endpoint) - .header( - "Authorization", - format!("Splunk {}", self.token.expose_secret()), - ) - .json(&hec_event) - .send() - .await?; - - response.error_for_status()?; - Ok(()) - } - - pub async fn send_batch(&self, events: &[ProcessAlert]) -> Result<(), ConnectorError> { - let hec_events: Vec<HecEvent> = events - .iter() - .map(|event| self.convert_to_hec_event(event)) - .collect::<Result<Vec<_>, _>>()?; - - let response = self - .client - .post(&self.endpoint) - .header( - "Authorization", - format!("Splunk {}", self.token.expose_secret()), - ) - .json(&hec_events) - .send() - .await?; - - response.error_for_status()?; - Ok(()) - } -} -``` - -### Elasticsearch Integration - -**Elasticsearch** bulk indexing with index pattern management. 
- -```rust,ignore -pub struct ElasticsearchConnector { - client: elasticsearch::Elasticsearch, - index_pattern: String, - pipeline: Option<String>, - batch_size: usize, -} - -impl ElasticsearchConnector { - pub async fn bulk_index(&self, events: &[ProcessAlert]) -> Result<(), ConnectorError> { - let mut body = Vec::new(); - - for event in events { - let index_name = self.resolve_index_name(&event.timestamp); - let action = json!({ - "index": { - "_index": index_name, - "_type": "_doc" - } - }); - body.push(action); - body.push(serde_json::to_value(event)?); - } - - let response = self.client.bulk(BulkParts::None).body(body).send().await?; - - self.handle_bulk_response(response).await - } - - fn resolve_index_name(&self, timestamp: &DateTime<Utc>) -> String { - self.index_pattern - .replace("{YYYY}", &timestamp.format("%Y").to_string()) - .replace("{MM}", &timestamp.format("%m").to_string()) - .replace("{DD}", &timestamp.format("%d").to_string()) - } -} -``` - -### Kafka Integration - -**Kafka** high-throughput message streaming with partitioning. 
- -```rust,ignore -pub struct KafkaConnector { - producer: FutureProducer, - topic: String, - partition_strategy: PartitionStrategy, -} - -impl KafkaConnector { - pub async fn send_event(&self, event: &ProcessAlert) -> Result<(), ConnectorError> { - let key = self.generate_partition_key(event); - let payload = serde_json::to_vec(event)?; - - let record = FutureRecord::to(&self.topic) - .key(&key) - .payload(&payload) - .partition(self.calculate_partition(&key)); - - self.producer.send(record, Duration::from_secs(5)).await?; - Ok(()) - } - - fn generate_partition_key(&self, event: &ProcessAlert) -> String { - match self.partition_strategy { - PartitionStrategy::ByHostname => event.hostname.clone(), - PartitionStrategy::ByRuleId => event.rule_id.clone(), - PartitionStrategy::BySeverity => event.severity.to_string(), - PartitionStrategy::RoundRobin => Uuid::new_v4().to_string(), - } - } -} -``` - -## Export Format Implementations - -### CEF (Common Event Format) - -**CEF Format** for SIEM compatibility. - -```rust,ignore -pub struct CefFormatter; - -impl CefFormatter { - pub fn format_process_alert(alert: &ProcessAlert) -> String { - format!( - "CEF:0|DaemonEye|DaemonEye|1.0|{}|{}|{}|{}", - alert.rule_id, - alert.rule_name, - Self::map_severity(&alert.severity), - Self::build_extensions(alert) - ) - } - - fn build_extensions(alert: &ProcessAlert) -> String { - format!( - "rt={} src={} suser={} sproc={} cs1Label=Command Line cs1={} cs2Label=Parent Process cs2={}", - alert.timestamp.timestamp_millis(), - alert.hostname, - alert.process.user.unwrap_or_default(), - alert.process.name, - alert.process.command_line.unwrap_or_default(), - alert.process.parent_name.unwrap_or_default() - ) - } - - fn map_severity(severity: &AlertSeverity) -> u8 { - match severity { - AlertSeverity::Low => 3, - AlertSeverity::Medium => 5, - AlertSeverity::High => 7, - AlertSeverity::Critical => 10, - } - } -} -``` - -### STIX 2.1 Objects - -**STIX 2.1** structured threat information export. 
- -```rust,ignore -pub struct StixExporter; - -impl StixExporter { - pub fn create_process_object(process: &ProcessSnapshot) -> StixProcess { - StixProcess { - type_: "process".to_string(), - spec_version: "2.1".to_string(), - id: format!("process--{}", Uuid::new_v4()), - created: process.timestamp, - modified: process.timestamp, - pid: process.pid, - name: process.name.clone(), - command_line: process.command_line.clone(), - parent_ref: process - .parent_pid - .map(|ppid| format!("process--{}", Self::get_parent_uuid(ppid))), - binary_ref: Some(format!( - "file--{}", - Self::create_file_object(&process.executable_path).id - )), - } - } - - pub fn create_indicator_object(alert: &ProcessAlert) -> StixIndicator { - StixIndicator { - type_: "indicator".to_string(), - spec_version: "2.1".to_string(), - id: format!("indicator--{}", Uuid::new_v4()), - created: alert.timestamp, - modified: alert.timestamp, - pattern: Self::build_stix_pattern(alert), - pattern_type: "stix".to_string(), - pattern_version: "2.1".to_string(), - valid_from: alert.timestamp, - labels: vec![alert.severity.to_string()], - confidence: Self::map_confidence(alert), - } - } -} -``` - -## Web GUI Frontend - -### Technology Stack - -**Frontend**: React with TypeScript for type safety **State Management**: React Query for server state management **UI Framework**: Tailwind CSS with shadcn/ui components **Charts**: Recharts for data visualization **Authentication**: JWT tokens with automatic refresh - -### Core Features - -**Fleet Dashboard**: Real-time view of all connected agents - -```typescript -interface FleetDashboard { - agents: AgentStatus[]; - totalAlerts: number; - activeRules: number; - systemHealth: HealthStatus; -} - -interface AgentStatus { - id: string; - hostname: string; - status: 'active' | 'inactive' | 'error'; - lastSeen: Date; - alertCount: number; - version: string; -} -``` - -**Alert Management**: Filtering, sorting, and export of alerts - -```typescript -interface AlertManagement { 
- alerts: Alert[]; - filters: AlertFilters; - sortBy: SortOption; - exportFormat: ExportFormat; -} - -interface AlertFilters { - severity: AlertSeverity[]; - ruleId: string[]; - hostname: string[]; - dateRange: DateRange; -} -``` - -**Rule Management**: Visual rule editor and deployment interface - -```typescript -interface RuleManagement { - rules: DetectionRule[]; - rulePacks: RulePack[]; - editor: RuleEditor; - deployment: DeploymentStatus; -} - -interface RuleEditor { - sql: string; - validation: ValidationResult; - testResults: TestResult[]; -} -``` - -## Deployment Patterns - -### Pattern 1: Direct Agent-to-SIEM - -Agents send alerts directly to configured SIEM systems without Security Center. - -```yaml -# Agent configuration for direct SIEM integration -alerting: - routing_strategy: direct - sinks: - - type: splunk_hec - enabled: true - endpoint: https://splunk.example.com:8088/services/collector - token: ${SPLUNK_HEC_TOKEN} -``` - -### Pattern 2: Centralized Proxy - -All agents route through Security Center for centralized management. - -```yaml -# Agent configuration for centralized proxy -alerting: - routing_strategy: proxy - security_center: - enabled: true - endpoint: https://security-center.example.com:8443 - certificate_path: /etc/daemoneye/agent.crt - key_path: /etc/daemoneye/agent.key -``` - -### Pattern 3: Hybrid (Recommended) - -Agents send to both Security Center and direct SIEM systems. - -```yaml -# Agent configuration for hybrid routing -alerting: - routing_strategy: hybrid - security_center: - enabled: true - endpoint: https://security-center.example.com:8443 - sinks: - - type: splunk_hec - enabled: true - endpoint: https://splunk.example.com:8088/services/collector -``` - -## Container and Kubernetes Support - -### Docker Images - -**Multi-stage builds** for optimized container images. - -```dockerfile -# Build stage -FROM rust:1.91 as builder -WORKDIR /app -COPY . . 
-RUN cargo build --release - -# Runtime stage -FROM debian:bookworm-slim -RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* -COPY --from=builder /app/target/release/daemoneye-agent /usr/local/bin/ -COPY --from=builder /app/target/release/procmond /usr/local/bin/ -COPY --from=builder /app/target/release/daemoneye-cli /usr/local/bin/ - -# Create non-root user -RUN useradd -r -s /bin/false daemoneye -USER daemoneye - -ENTRYPOINT ["daemoneye-agent"] -``` - -### Kubernetes Manifests - -**DaemonSet** for agent deployment across all nodes. - -```yaml -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: daemoneye-agent - namespace: security -spec: - selector: - matchLabels: - app: daemoneye-agent - template: - metadata: - labels: - app: daemoneye-agent - spec: - serviceAccountName: daemoneye-agent - hostPID: true - hostNetwork: true - containers: - - name: procmond - image: daemoneye/procmond:latest - securityContext: - privileged: true - capabilities: - add: [SYS_PTRACE] - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - - name: data - mountPath: /var/lib/daemoneye - - name: daemoneye-agent - image: daemoneye/daemoneye-agent:latest - securityContext: - runAsNonRoot: true - runAsUser: 1000 - volumeMounts: - - name: data - mountPath: /var/lib/daemoneye - - name: config - mountPath: /etc/daemoneye - volumes: - - name: proc - hostPath: - path: /proc - - name: data - hostPath: - path: /var/lib/daemoneye - - name: config - configMap: - name: daemoneye-config -``` - -## Performance and Scalability - -### Security Center Performance - -**Target Metrics**: - -- **Agents per Security Center**: 1,000+ agents -- **Alert Throughput**: 10,000+ alerts per minute -- **Query Latency**: \<100ms for dashboard queries -- **Data Retention**: Configurable retention policies - -### Optimization Strategies - -**Connection Pooling**: Efficient database connection management - -```rust,ignore -pub struct ConnectionPoolManager { - 
pool: sqlx::PgPool, - metrics: PoolMetrics, -} - -impl ConnectionPoolManager { - pub async fn get_connection(&self) -> Result<PooledConnection> { - let start = Instant::now(); - let conn = self.pool.acquire().await?; - - self.metrics.connection_acquired.record(start.elapsed()); - Ok(PooledConnection::new(conn)) - } -} -``` - -**Batch Processing**: Efficient alert processing and delivery - -```rust,ignore -pub struct BatchProcessor { - batch_size: usize, - batch_timeout: Duration, - processor: Arc<dyn AlertProcessor>, -} - -impl BatchProcessor { - pub async fn process_alerts(&self, alerts: Vec<Alert>) -> Result<()> { - let batches = alerts.chunks(self.batch_size); - - for batch in batches { - self.processor.process_batch(batch).await?; - } - - Ok(()) - } -} -``` - ---- - -*The Business Tier Features provide professional-grade capabilities for small to medium teams while maintaining DaemonEye's core security principles and performance characteristics.* diff --git a/docs/src/technical/core-monitoring.md b/docs/src/technical/core-monitoring.md index 73abe6a1..5e7d9cf3 100644 --- a/docs/src/technical/core-monitoring.md +++ b/docs/src/technical/core-monitoring.md @@ -63,71 +63,7 @@ impl ProcessCollector { #### Platform-Specific Enhancements -**Linux eBPF Integration (Enterprise Tier)**: - -```rust,ignore -#[cfg(target_os = "linux")] -pub struct EbpfProcessCollector { - base_collector: ProcessCollector, - ebpf_monitor: Option<EbpfMonitor>, -} - -impl EbpfProcessCollector { - pub async fn enumerate_processes(&self) -> Result<Vec<ProcessRecord>> { - // Use eBPF for real-time process events if available - if let Some(ebpf) = &self.ebpf_monitor { - return self.enumerate_with_ebpf(ebpf).await; - } - - // Fallback to sysinfo - self.base_collector.enumerate_processes().await - } -} -``` - -**Windows ETW Integration (Enterprise Tier)**: - -```rust,ignore -#[cfg(target_os = "windows")] -pub struct EtwProcessCollector { - base_collector: ProcessCollector, - etw_monitor: 
Option<EtwMonitor>, -} - -impl EtwProcessCollector { - pub async fn enumerate_processes(&self) -> Result<Vec<ProcessRecord>> { - // Use ETW for enhanced process monitoring if available - if let Some(etw) = &self.etw_monitor { - return self.enumerate_with_etw(etw).await; - } - - // Fallback to sysinfo - self.base_collector.enumerate_processes().await - } -} -``` - -**macOS EndpointSecurity Integration (Enterprise Tier)**: - -```rust,ignore -#[cfg(target_os = "macos")] -pub struct EndpointSecurityProcessCollector { - base_collector: ProcessCollector, - es_monitor: Option<EndpointSecurityMonitor>, -} - -impl EndpointSecurityProcessCollector { - pub async fn enumerate_processes(&self) -> Result<Vec<ProcessRecord>> { - // Use EndpointSecurity for real-time monitoring if available - if let Some(es) = &self.es_monitor { - return self.enumerate_with_endpoint_security(es).await; - } - - // Fallback to sysinfo - self.base_collector.enumerate_processes().await - } -} -``` +Kernel-level collection (eBPF on Linux, ETW on Windows, EndpointSecurity on macOS) is handled by commercial tiers, not by this repo. The Community tier uses user-space collection via `sysinfo` across all platforms. ### Executable Integrity Verification diff --git a/docs/src/technical/enterprise-tier.md b/docs/src/technical/enterprise-tier.md deleted file mode 100644 index 848b907a..00000000 --- a/docs/src/technical/enterprise-tier.md +++ /dev/null @@ -1,360 +0,0 @@ -# Enterprise Tier Features - -This document describes the Enterprise tier features of DaemonEye, including kernel monitoring, network event monitoring, and federated security center architecture. 
- -## Overview - -The Enterprise tier extends DaemonEye with advanced monitoring capabilities and enterprise-grade features: - -- **Kernel Monitoring Layer**: eBPF, ETW, and EndpointSecurity integration -- **Network Event Monitor**: Real-time network traffic analysis -- **Federated Security Center**: Multi-site security center architecture -- **STIX/TAXII Integration**: Threat intelligence sharing -- **Advanced Analytics**: Machine learning and behavioral analysis - -## Kernel Monitoring Layer - -### Linux eBPF Integration - -DaemonEye uses eBPF (Extended Berkeley Packet Filter) for low-level system monitoring: - -```rust,ignore -use aya::{ - Bpf, - programs::{Xdp, XdpFlags}, -}; - -pub struct EBPFMonitor { - bpf: Bpf, - program: Xdp, -} - -impl EBPFMonitor { - pub async fn new() -> Result<Self, MonitorError> { - let bpf = Bpf::load_file("monitor.o")?; - let program: &mut Xdp = bpf.program_mut("monitor").unwrap().try_into()?; - program.load()?; - program.attach("eth0", XdpFlags::default())?; - - Ok(Self { bpf, program }) - } -} -``` - -### Windows ETW Integration - -Windows Event Tracing for Windows (ETW) provides comprehensive system monitoring: - -```rust,ignore -use windows::{Win32::System::Diagnostics::Etw::*, core::PCWSTR}; - -pub struct ETWMonitor { - session_handle: TRACEHANDLE, - trace_properties: EVENT_TRACE_PROPERTIES, -} - -impl ETWMonitor { - pub fn new() -> Result<Self, MonitorError> { - let mut trace_properties = EVENT_TRACE_PROPERTIES::default(); - trace_properties.Wnode.BufferSize = std::mem::size_of::<EVENT_TRACE_PROPERTIES>() as u32; - trace_properties.Wnode.Guid = GUID::from("12345678-1234-1234-1234-123456789012"); - trace_properties.Wnode.ClientContext = 1; - trace_properties.Wnode.Flags = WNODE_FLAG_TRACED_GUID; - trace_properties.LogFileMode = EVENT_TRACE_REAL_TIME_MODE; - trace_properties.LoggerNameOffset = std::mem::size_of::<EVENT_TRACE_PROPERTIES>() as u32; - trace_properties.LogFileNameOffset = 0; - trace_properties.BufferSize = 64; - 
trace_properties.MinimumBuffers = 2; - trace_properties.MaximumBuffers = 2; - trace_properties.FlushTimer = 1; - trace_properties.EnableFlags = EVENT_TRACE_FLAG_PROCESS; - - let session_handle = StartTraceW( - &mut 0, - PCWSTR::from_raw("DaemonEye\0".as_ptr() as *const u16), - &mut trace_properties, - )?; - - Ok(Self { - session_handle, - trace_properties, - }) - } -} -``` - -### macOS EndpointSecurity Integration - -macOS EndpointSecurity framework provides real-time security event monitoring: - -```rust,ignore -use endpoint_sec::{Client, ClientBuilder, Event, EventType, Process}; - -pub struct EndpointSecurityMonitor { - client: Client, -} - -impl EndpointSecurityMonitor { - pub async fn new() -> Result<Self, MonitorError> { - let client = ClientBuilder::new() - .name("com.daemoneye.monitor") - .build() - .await?; - - Ok(Self { client }) - } - - pub async fn start_monitoring(&self) -> Result<(), MonitorError> { - let mut stream = self - .client - .subscribe(&[ - EventType::NotifyExec, - EventType::NotifyFork, - EventType::NotifyExit, - EventType::NotifySignal, - ]) - .await?; - - while let Some(event) = stream.next().await { - self.handle_event(event).await?; - } - - Ok(()) - } -} -``` - -## Network Event Monitor - -The Network Event Monitor provides real-time network traffic analysis: - -```rust,ignore -use pcap::{Capture, Device}; - -pub struct NetworkMonitor { - capture: Capture<Device>, -} - -impl NetworkMonitor { - pub fn new(interface: &str) -> Result<Self, MonitorError> { - let device = Device::lookup()? - .find(|d| d.name == interface) - .ok_or(MonitorError::DeviceNotFound)?; - - let capture = Capture::from_device(device)? 
- .promisc(true) - .buffer_size(65536) - .open()?; - - Ok(Self { capture }) - } - - pub async fn start_capture(&mut self) -> Result<(), MonitorError> { - while let Ok(packet) = self.capture.next() { - self.process_packet(packet).await?; - } - Ok(()) - } -} -``` - -## Federated Security Center Architecture - -The Federated Security Center enables multi-site security center deployment: - -```rust,ignore -pub struct FederatedSecurityCenter { - primary_center: SecurityCenter, - regional_centers: Vec<RegionalSecurityCenter>, - federation_config: FederationConfig, -} - -pub struct FederationConfig { - pub primary_endpoint: String, - pub regional_endpoints: Vec<String>, - pub sync_interval: Duration, - pub conflict_resolution: ConflictResolution, -} - -pub enum ConflictResolution { - PrimaryWins, - TimestampWins, - ManualReview, -} -``` - -## STIX/TAXII Integration - -DaemonEye integrates with STIX/TAXII for threat intelligence sharing: - -```rust,ignore -use stix::{ - objects::{Indicator, Malware, ThreatActor}, - taxii::client::TaxiiClient, -}; - -pub struct STIXTAXIIIntegration { - client: TaxiiClient, - collection_id: String, -} - -impl STIXTAXIIIntegration { - pub async fn new(endpoint: &str, collection_id: &str) -> Result<Self, IntegrationError> { - let client = TaxiiClient::new(endpoint)?; - Ok(Self { - client, - collection_id: collection_id.to_string(), - }) - } - - pub async fn fetch_indicators(&self) -> Result<Vec<Indicator>, IntegrationError> { - let objects = self - .client - .get_objects(&self.collection_id, "indicator") - .await?; - - let indicators: Vec<Indicator> = objects - .into_iter() - .filter_map(|obj| obj.try_into().ok()) - .collect(); - - Ok(indicators) - } -} -``` - -## Advanced Analytics - -Enterprise tier includes machine learning and behavioral analysis: - -```rust,ignore -pub struct BehavioralAnalyzer { - models: Vec<BehavioralModel>, - anomaly_threshold: f64, -} - -pub struct BehavioralModel { - name: String, - features: Vec<String>, - model: 
Box<dyn Model>, -} - -impl BehavioralAnalyzer { - pub fn analyze_process(&self, process: &ProcessInfo) -> Result<AnomalyScore, AnalysisError> { - let features = self.extract_features(process); - let mut scores = Vec::new(); - - for model in &self.models { - let score = model.model.predict(&features)?; - scores.push(score); - } - - let anomaly_score = self.aggregate_scores(scores); - Ok(anomaly_score) - } -} -``` - -## Deployment Considerations - -### Resource Requirements - -Enterprise tier features require additional resources: - -- **CPU**: 2+ cores for kernel monitoring -- **Memory**: 4+ GB for network monitoring and analytics -- **Storage**: 100+ GB for event storage and analytics data -- **Network**: High-bandwidth for network monitoring - -### Security Considerations - -- Kernel monitoring requires elevated privileges -- Network monitoring may capture sensitive data -- Federated architecture requires secure communication -- STIX/TAXII integration requires secure authentication - -### Performance Impact - -- Kernel monitoring: 2-5% CPU overhead -- Network monitoring: 5-10% CPU overhead -- Analytics processing: 10-20% CPU overhead -- Storage requirements: 10x increase for event data - -## Configuration - -Enterprise tier configuration extends the base configuration: - -```yaml -enterprise: - kernel_monitoring: - enable_ebpf: true - ebpf_program_path: /etc/daemoneye/ebpf/monitor.o - enable_etw: true - etw_session_name: DaemonEye - enable_endpoint_security: true - es_client_name: com.daemoneye.monitor - - network_monitoring: - enable_packet_capture: true - capture_interface: eth0 - capture_filter: tcp port 80 or tcp port 443 - max_packet_size: 1500 - buffer_size_mb: 100 - - federation: - enable_federation: true - primary_endpoint: https://primary.daemoneye.com - regional_endpoints: - - https://region1.daemoneye.com - - https://region2.daemoneye.com - sync_interval: 300 - conflict_resolution: primary_wins - - stix_taxii: - enable_integration: true - 
taxii_endpoint: https://taxii.example.com - collection_id: daemoneye-indicators - sync_interval: 3600 - - analytics: - enable_behavioral_analysis: true - anomaly_threshold: 0.8 - model_update_interval: 86400 - enable_machine_learning: true -``` - -## Troubleshooting - -### Common Issues - -**Kernel Monitoring Failures**: - -- Check kernel version compatibility -- Verify eBPF/ETW/EndpointSecurity support -- Check privilege requirements -- Review kernel logs for errors - -**Network Monitoring Issues**: - -- Verify network interface permissions -- Check packet capture filters -- Monitor buffer usage -- Review network performance impact - -**Federation Sync Issues**: - -- Check network connectivity -- Verify authentication credentials -- Review sync logs -- Check conflict resolution settings - -**Analytics Performance**: - -- Monitor CPU and memory usage -- Check model update frequency -- Review feature extraction performance -- Optimize anomaly detection thresholds - ---- - -*This document provides comprehensive information about Enterprise tier features. 
For additional help, consult the troubleshooting section or contact support.* diff --git a/docs/src/technical/security_design_overview.md b/docs/src/technical/security_design_overview.md index 1e61ede5..5556e9ac 100644 --- a/docs/src/technical/security_design_overview.md +++ b/docs/src/technical/security_design_overview.md @@ -405,56 +405,7 @@ DaemonEye supports all standard government data classification levels with appro - Audit logging of all data access - Principle of least privilege for data access -### Business Tier Data Protection Features - -**Centralized Data Management**: - -- **Security Center**: Centralized aggregation and management of data from multiple agents -- **mTLS Authentication**: Mutual TLS with certificate chain validation for secure agent connections -- **Certificate Management**: Automated certificate provisioning and rotation -- **Role-Based Access Control**: Granular permissions for different user roles - -**Enhanced Data Export**: - -- **Standard Format Support**: CEF (Common Event Format), structured JSON, and STIX-lite exports -- **SIEM Integration**: Native connectors for Splunk, Elasticsearch, and Kafka -- **Data Portability**: Comprehensive export capabilities for data migration and analysis - -**Code Signing and Integrity**: - -- **Signed Installers**: MSI installers for Windows and DMG packages for macOS with valid code signing certificates -- **Enterprise Deployment**: Proper metadata for enterprise deployment tools -- **Security Validation**: Operating system security validation without warnings - -### Enterprise Tier Data Protection Features - -**Advanced Cryptographic Security**: - -- **SLSA Level 3 Provenance**: Complete software supply chain attestation -- **Cosign Signatures**: Hardware security module-backed code signing -- **Software Bill of Materials (SBOM)**: Complete dependency and component inventory -- **Signature Verification**: Mandatory signature verification before execution - -**Federated Data Architecture**: - 
-- **Multi-Tier Security Centers**: Hierarchical data aggregation across geographic regions -- **Federated Storage**: Distributed data storage with local and global aggregation -- **Data Sovereignty**: Regional data residency compliance -- **Cross-Region Replication**: Secure data replication with integrity verification - -**Advanced Compliance and Threat Intelligence**: - -- **STIX/TAXII Integration**: Automated threat intelligence feed consumption and processing -- **Compliance Framework Mappings**: NIST, ISO 27001, CIS framework mappings -- **Quarterly Rule Packs**: Curated threat intelligence updates with automated rule deployment -- **Compliance Reporting**: Automated compliance reporting and evidence collection - -**Kernel-Level Data Protection**: - -- **Real-Time Event Processing**: Sub-millisecond processing of kernel-level events -- **Network Correlation**: Process-to-network event correlation for lateral movement detection -- **Memory Analysis**: Advanced memory analysis capabilities for sophisticated attack detection -- **Platform-Specific Monitoring**: eBPF (Linux), ETW (Windows), EndpointSecurity (macOS) integration +Commercial tiers extend data protection with centralized aggregation (mTLS, RBAC, federated architecture), advanced threat-intelligence integration, and kernel-level collection. Those components are out of scope for this repo. 
### Compliance Features @@ -482,43 +433,7 @@ DaemonEye supports all standard government data classification levels with appro - Respond: Incident response and forensics - Recover: Business continuity and restoration -**Business Tier Compliance Features**: - -**Enhanced Audit and Reporting**: - -- **Centralized Audit Logs**: Aggregated audit logs from multiple agents -- **Automated Compliance Reporting**: Scheduled compliance reports and dashboards -- **Data Retention Management**: Centralized data retention policy enforcement -- **Audit Trail Integrity**: Cryptographic verification of audit log integrity across the fleet - -**Enterprise Integration Compliance**: - -- **SIEM Integration**: Native compliance with major SIEM platforms (Splunk, Elasticsearch, QRadar) -- **Standard Format Support**: CEF, STIX-lite, and other compliance-standard formats -- **Data Lineage Tracking**: Complete data lineage from collection to reporting - -**Enterprise Tier Compliance Features**: - -**Advanced Compliance Frameworks**: - -- **NIST SP 800-53**: Complete security controls mapping and implementation -- **ISO 27001**: Information security management system compliance -- **CIS Controls**: Center for Internet Security controls implementation -- **FedRAMP**: Federal Risk and Authorization Management Program compliance - -**Threat Intelligence and Advanced Monitoring**: - -- **STIX/TAXII Integration**: Automated threat intelligence feed consumption -- **Compliance Mappings**: Real-time mapping of detection events to compliance controls -- **Advanced SIEM Integration**: Full STIX/TAXII support with compliance mappings -- **Quarterly Threat Updates**: Automated deployment of curated threat intelligence rule packs - -**Hardened Security and Supply Chain**: - -- **SLSA Level 3 Provenance**: Complete software supply chain attestation -- **Cosign Signatures**: Hardware security module-backed code signing -- **Software Bill of Materials (SBOM)**: Complete dependency and component inventory 
-- **Supply Chain Security**: End-to-end supply chain security verification +Commercial tiers add centralized audit aggregation, compliance-framework mappings (NIST, ISO 27001, CIS, FedRAMP), advanced SIEM connectors, automated threat-intelligence feeds, and hardened supply-chain signing (SLSA, Cosign, SBOM). Those capabilities are out of scope for this repo. **FISMA Compliance**: @@ -920,7 +835,7 @@ For IC environments, DaemonEye provides: - Automated audit log generation with cryptographic integrity - Built-in compliance reporting capabilities -- Standard format exports (CEF, STIX-lite) for SIEM integration +- CEF export for SIEM integration (additional formats available in commercial tiers) - Comprehensive documentation for ATO packages **Enhanced Security Posture**: @@ -1023,7 +938,7 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme - **Vendor Implementation**: Device authentication for agent connections - **Product Requirements**: Implement device certificates, mutual TLS, and device identity verification -- **Implementation Notes**: Include certificate management, device enrollment, and authentication protocols. Already planned: mTLS authentication and certificate management for Business/Enterprise tiers. Additional work needed: device authentication for agent connections and device identity verification. +- **Implementation Notes**: Include certificate management, device enrollment, and authentication protocols. Device authentication for agent connections and device identity verification remain open items for this repo. **IA-4 (Identifier Management)**: @@ -1097,7 +1012,7 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme - **Vendor Implementation**: Secure acquisition process - **Product Requirements**: Implement secure distribution, verification, and installation procedures -- **Implementation Notes**: Include code signing, package verification, and secure distribution. 
Already planned: code signing and integrity verification for Business/Enterprise tiers. Additional work needed: secure acquisition process and package verification. +- **Implementation Notes**: Include code signing, package verification, and secure distribution. Code signing and integrity verification remain open items for this repo. Additional work needed: secure acquisition process and package verification. **SA-5 (Information System Documentation)**: @@ -1145,13 +1060,13 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme - **Vendor Implementation**: Tamper resistance and detection - **Product Requirements**: Implement tamper detection, integrity verification, and protection mechanisms -- **Implementation Notes**: Include integrity checking, tamper detection, and protection mechanisms. Already planned: code signing verification and integrity checking for Business/Enterprise tiers. Additional work needed: enhanced tamper detection and protection mechanisms. +- **Implementation Notes**: Include integrity checking, tamper detection, and protection mechanisms. Code signing verification and integrity checking remain open items for this repo. Additional work needed: enhanced tamper detection and protection mechanisms. **SA-19 (Component Authenticity)**: - **Vendor Implementation**: Component authenticity verification - **Product Requirements**: Implement component verification, authenticity checking, and validation -- **Implementation Notes**: Include signature verification, authenticity validation, and integrity checking. Already planned: code signing and integrity verification for Business/Enterprise tiers. Additional work needed: component authenticity verification and validation. +- **Implementation Notes**: Include signature verification, authenticity validation, and integrity checking. Code signing and integrity verification remain open items for this repo. 
Additional work needed: component authenticity verification and validation. **SA-20 (Customized Development of Critical Components)**: @@ -1229,19 +1144,19 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-16 (Transmission of Security Attributes)**: -- **Vendor Implementation**: Transmit security attributes with all DaemonEye data and communications. **Already Planned**: Data classification support is specified in product.md. **Additional Required**: Enhanced security attribute transmission and formal security attribute documentation. +- **Vendor Implementation**: Transmit security attributes with all DaemonEye data and communications. Data classification support is an open item for this repo. **Additional Required**: Enhanced security attribute transmission and formal security attribute documentation. - **Product Requirements**: Include data classification, sensitivity labels, and security markings in all transmissions - **Implementation Notes**: Embed security attributes in protobuf messages, database records, and alert payloads **SC-17 (Public Key Infrastructure Certificates)**: -- **Vendor Implementation**: Implement PKI certificate management for DaemonEye components. **Already Planned**: mTLS authentication and certificate management are specified in product.md for Business/Enterprise tiers. **Additional Required**: Enhanced PKI certificate management and formal PKI documentation. +- **Vendor Implementation**: Implement PKI certificate management for DaemonEye components. PKI and certificate management are handled by commercial tiers and are out of scope for this repo. **Additional Required**: Enhanced PKI certificate management and formal PKI documentation. 
- **Product Requirements**: Support certificate-based authentication, mutual TLS, and certificate validation - **Implementation Notes**: Include certificate generation, validation, rotation, and revocation for agent authentication and alert delivery **SC-18 (Mobile Code)**: -- **Vendor Implementation**: Control mobile code execution in DaemonEye environment. **Already Planned**: Code signing verification is specified in product.md for Business/Enterprise tiers. **Additional Required**: Enhanced mobile code controls and formal mobile code documentation. +- **Vendor Implementation**: Control mobile code execution in DaemonEye environment. **Additional Required**: Enhanced mobile code controls and formal mobile code documentation. - **Product Requirements**: Implement controls for mobile code execution and validation - **Implementation Notes**: Include code signing verification, execution sandboxing, and mobile code validation @@ -1279,7 +1194,7 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-24 (Fail in Known State)**: -- **Vendor Implementation**: Ensure DaemonEye fails in a known, secure state. **Already Planned**: Graceful degradation and fail-safe design are specified in product.md. **Additional Required**: Enhanced fail-safe mechanisms and formal fail-safe documentation. +- **Vendor Implementation**: Ensure DaemonEye fails in a known, secure state. **Additional Required**: Enhanced fail-safe mechanisms and formal fail-safe documentation. - **Product Requirements**: Implement fail-safe mechanisms that maintain security boundaries during failures - **Implementation Notes**: Include graceful shutdown procedures, secure state preservation, and recovery from known states @@ -1321,7 +1236,7 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-32 (Information System Partitioning)**: -- **Vendor Implementation**: Implement system partitioning in DaemonEye architecture. 
**Already Planned**: Process isolation and separate databases are specified in product.md and tech.md. **Additional Required**: Enhanced system partitioning and formal partitioning documentation. +- **Vendor Implementation**: Implement system partitioning in DaemonEye architecture. **Additional Required**: Enhanced system partitioning and formal partitioning documentation. - **Product Requirements**: Ensure logical and physical separation of different security domains - **Implementation Notes**: Use process isolation, separate databases, and controlled data flow between partitions @@ -1333,7 +1248,7 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-34 (Modifiable Components)**: -- **Vendor Implementation**: Control modification of DaemonEye components. **Already Planned**: Code signing verification and integrity checking are specified in product.md for Business/Enterprise tiers. **Additional Required**: Enhanced tamper detection and formal modification control documentation. +- **Vendor Implementation**: Control modification of DaemonEye components. **Additional Required**: Enhanced tamper detection and formal modification control documentation. - **Product Requirements**: Implement tamper detection, code signing verification, and modification controls - **Implementation Notes**: Include integrity checking, signature verification, and protection against unauthorized modifications @@ -1345,13 +1260,13 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-36 (Distributed Processing and Storage)**: -- **Vendor Implementation**: Support distributed processing and storage for DaemonEye. **Already Planned**: Federated security centers and distributed data storage are specified in product.md for Enterprise tier. **Additional Required**: Enhanced distributed processing and formal distributed architecture documentation. 
+- **Vendor Implementation**: Support distributed processing and storage for DaemonEye. **Additional Required**: Enhanced distributed processing and formal distributed architecture documentation. - **Product Requirements**: Implement distributed architecture with secure communication and data consistency - **Implementation Notes**: Include federated security centers, distributed data storage, and secure inter-node communication **SC-37 (Out-of-Band Channels)**: -- **Vendor Implementation**: Support out-of-band communication channels for DaemonEye. **Already Planned**: Bundle-based configuration distribution for airgapped systems is specified in product.md. **Additional Required**: Enhanced out-of-band channels and formal out-of-band documentation. +- **Vendor Implementation**: Support out-of-band communication channels for DaemonEye. **Additional Required**: Enhanced out-of-band channels and formal out-of-band documentation. - **Product Requirements**: Implement alternative communication methods for critical operations - **Implementation Notes**: Include secure out-of-band alert delivery, administrative access, and emergency communication channels @@ -1377,7 +1292,7 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-42 (Sensor Capability and Data)**: -- **Vendor Implementation**: Implement sensor capabilities and data protection for DaemonEye. **Already Planned**: Process monitoring sensors and data collection security are specified in product.md and tech.md. **Additional Required**: Enhanced sensor capabilities and formal sensor documentation. +- **Vendor Implementation**: Implement sensor capabilities and data protection for DaemonEye. **Additional Required**: Enhanced sensor capabilities and formal sensor documentation. 
- **Product Requirements**: Provide sensor data collection, processing, and protection capabilities - **Implementation Notes**: Include process monitoring sensors, data collection security, and sensor data integrity verification @@ -1395,25 +1310,25 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-45 (System Time Synchronization)**: -- **Vendor Implementation**: Ensure time synchronization for DaemonEye components. **Already Planned**: Millisecond-precision timestamps are specified in product.md. **Additional Required**: Enhanced time synchronization and formal time synchronization documentation. +- **Vendor Implementation**: Ensure time synchronization for DaemonEye components. **Additional Required**: Enhanced time synchronization and formal time synchronization documentation. - **Product Requirements**: Implement accurate time synchronization and time-based security controls - **Implementation Notes**: Include NTP synchronization, time validation, and timestamp integrity verification **SC-46 (Cross-Service Attack Prevention)**: -- **Vendor Implementation**: Prevent cross-service attacks in DaemonEye. **Already Planned**: Service isolation and access controls are specified in product.md and tech.md. **Additional Required**: Enhanced cross-service attack prevention and formal attack prevention documentation. +- **Vendor Implementation**: Prevent cross-service attacks in DaemonEye. **Additional Required**: Enhanced cross-service attack prevention and formal attack prevention documentation. - **Product Requirements**: Implement isolation and protection between different services and components - **Implementation Notes**: Include service isolation, access controls, and monitoring for cross-service attack attempts **SC-47 (Alternate Communications Paths)**: -- **Vendor Implementation**: Provide alternate communication paths for DaemonEye. **Already Planned**: Multiple alert delivery channels are specified in product.md. 
**Additional Required**: Enhanced alternate communication paths and formal communication path documentation. +- **Vendor Implementation**: Provide alternate communication paths for DaemonEye. **Additional Required**: Enhanced alternate communication paths and formal communication path documentation. - **Product Requirements**: Implement redundant communication channels and failover mechanisms - **Implementation Notes**: Include multiple alert delivery channels, backup communication methods, and automatic failover **SC-48 (Application Partitioning)**: -- **Vendor Implementation**: Implement application partitioning for DaemonEye security. **Already Planned**: Component isolation and data separation are specified in product.md and tech.md. **Additional Required**: Enhanced application partitioning and formal partitioning documentation. +- **Vendor Implementation**: Implement application partitioning for DaemonEye security. **Additional Required**: Enhanced application partitioning and formal partitioning documentation. - **Product Requirements**: Ensure logical separation of application components and data @@ -1427,13 +1342,13 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-50 (Software-Enforced Separation)**: -- **Vendor Implementation**: Implement software-enforced separation in DaemonEye. **Already Planned**: Process isolation and access control enforcement are specified in product.md and tech.md. **Additional Required**: Enhanced software-enforced separation and formal separation documentation. +- **Vendor Implementation**: Implement software-enforced separation in DaemonEye. **Additional Required**: Enhanced software-enforced separation and formal separation documentation. 
- **Product Requirements**: Use software controls to enforce security boundaries and separation - **Implementation Notes**: Include process isolation, memory protection, and access control enforcement **SC-51 (Hardware-Based Security)**: -- **Vendor Implementation**: Leverage hardware-based security features for DaemonEye. **Already Planned**: HSM integration and TPM support are specified in product.md for Enterprise tier. **Additional Required**: Enhanced hardware-based security and formal hardware security documentation. +- **Vendor Implementation**: Leverage hardware-based security features for DaemonEye. **Additional Required**: Enhanced hardware-based security and formal hardware security documentation. - **Product Requirements**: Utilize hardware security modules and trusted platform modules where available - **Implementation Notes**: Include HSM integration, TPM support, and hardware-based key storage @@ -1457,19 +1372,19 @@ Based on analysis of DaemonEye's current design against NIST SP 800-53 requireme **SC-55 (Enforceable Access Control)**: -- **Vendor Implementation**: Implement enforceable access control for DaemonEye. **Already Planned**: Access control enforcement is specified in product.md and tech.md. **Additional Required**: Enhanced access control and formal access control documentation. +- **Vendor Implementation**: Implement enforceable access control for DaemonEye. **Additional Required**: Enhanced access control and formal access control documentation. - **Product Requirements**: Provide mandatory access controls and enforcement mechanisms - **Implementation Notes**: Include role-based access control, mandatory access controls, and access enforcement **SC-56 (Enforceable Execution Domains)**: -- **Vendor Implementation**: Implement enforceable execution domains for DaemonEye. **Already Planned**: Execution isolation and domain separation are specified in product.md and tech.md. 
**Additional Required**: Enhanced execution domains and formal execution domain documentation. +- **Vendor Implementation**: Implement enforceable execution domains for DaemonEye. **Additional Required**: Enhanced execution domains and formal execution domain documentation. - **Product Requirements**: Control execution environments and domain boundaries - **Implementation Notes**: Include execution isolation, domain separation, and execution environment controls **SC-57 (Data Location)**: -- **Vendor Implementation**: Control data location for DaemonEye. **Already Planned**: Data residency controls are specified in product.md for Enterprise tier. **Additional Required**: Enhanced data location controls and formal data location documentation. +- **Vendor Implementation**: Control data location for DaemonEye. **Additional Required**: Enhanced data location controls and formal data location documentation. - **Product Requirements**: Implement data residency controls and location restrictions - **Implementation Notes**: Include data location tracking, residency controls, and geographic restrictions diff --git a/docs/src/user-guides.md b/docs/src/user-guides.md index fb3dde1b..850e509c 100644 --- a/docs/src/user-guides.md +++ b/docs/src/user-guides.md @@ -442,19 +442,14 @@ integrations: cef: enabled: true output_file: /var/log/daemoneye/cef.log - cef_version: "1.0" - device_vendor: "DaemonEye" - device_product: "Process Monitor" + cef_version: '1.0' + device_vendor: DaemonEye + device_product: Process Monitor -# STIX Export -integrations: - export: - stix: - enabled: true - output_file: /var/log/daemoneye/stix.json - stix_version: "2.1" ``` +Additional export formats (STIX/TAXII and others) are available in commercial tiers. 
+ ### Performance Tuning **Optimize for High Load**: diff --git a/docs/src/user-guides/configuration.md b/docs/src/user-guides/configuration.md index 76081508..392c480a 100644 --- a/docs/src/user-guides/configuration.md +++ b/docs/src/user-guides/configuration.md @@ -10,8 +10,6 @@ This guide provides comprehensive information about configuring DaemonEye for di - [Alerting Configuration](#alerting-configuration) - [Database Configuration](#database-configuration) - [Platform-Specific Configuration](#platform-specific-configuration) -- [Business Tier Configuration](#business-tier-configuration) -- [Enterprise Tier Configuration](#enterprise-tier-configuration) - [Environment Variables](#environment-variables) - [Configuration Examples](#configuration-examples) - [Troubleshooting](#troubleshooting) @@ -215,36 +213,10 @@ alerting: from: daemoneye@example.com to: [security@example.com] subject: 'DaemonEye Alert: {severity} - {title}' - - # Splunk HEC sink (Business Tier) - - type: splunk_hec - enabled: false - endpoint: https://splunk.example.com:8088/services/collector - token: ${SPLUNK_HEC_TOKEN} - index: daemoneye - source_type: daemoneye:alert - sourcetype: daemoneye:alert - - # Elasticsearch sink (Business Tier) - - type: elasticsearch - enabled: false - hosts: [https://elastic.example.com:9200] - username: ${ELASTIC_USERNAME} - password: ${ELASTIC_PASSWORD} - index_pattern: daemoneye-{YYYY.MM.DD} - pipeline: daemoneye-alerts - - # Kafka sink (Business Tier) - - type: kafka - enabled: false - brokers: [kafka.example.com:9092] - topic: daemoneye.alerts - security_protocol: SASL_SSL - sasl_mechanism: PLAIN - sasl_username: ${KAFKA_USERNAME} - sasl_password: ${KAFKA_PASSWORD} ``` +Additional sink types (Splunk HEC, Elasticsearch, Kafka, and others) are available in commercial tiers. 
+ ### Alert Filtering ```yaml @@ -310,15 +282,6 @@ database: ```yaml platform: linux: - # Enable eBPF monitoring (Enterprise Tier) - enable_ebpf: false - - # eBPF program path - ebpf_program_path: /usr/lib/daemoneye/daemoneye_monitor.o - - # eBPF ring buffer size - ebpf_ring_buffer_size: 1048576 # 1MB - # Enable process namespace monitoring enable_namespace_monitoring: true @@ -326,7 +289,7 @@ platform: enable_cgroup_monitoring: true # Process collection method - collection_method: sysinfo # sysinfo, ebpf, hybrid + collection_method: sysinfo # Privilege requirements privileges: @@ -340,23 +303,13 @@ platform: privilege_drop_timeout_secs: 30 ``` +Kernel-level collection (eBPF) is available in commercial tiers. + ### Windows Configuration ```yaml platform: windows: - # Enable ETW monitoring (Enterprise Tier) - enable_etw: false - - # ETW session name - etw_session_name: DaemonEye - - # ETW buffer size in KB - etw_buffer_size_kb: 64 - - # ETW maximum buffers - etw_max_buffers: 100 - # Enable registry monitoring enable_registry_monitoring: false @@ -364,7 +317,7 @@ platform: enable_filesystem_monitoring: false # Process collection method - collection_method: sysinfo # sysinfo, etw, hybrid + collection_method: sysinfo # Privilege requirements privileges: @@ -375,20 +328,13 @@ platform: drop_privileges: true ``` +Kernel-level collection (ETW) is available in commercial tiers. 
+ ### macOS Configuration ```yaml platform: macos: - # Enable EndpointSecurity monitoring (Enterprise Tier) - enable_endpoint_security: false - - # EndpointSecurity event types - es_event_types: - - ES_EVENT_TYPE_NOTIFY_EXEC - - ES_EVENT_TYPE_NOTIFY_FORK - - ES_EVENT_TYPE_NOTIFY_EXIT - # Enable file system monitoring enable_filesystem_monitoring: false @@ -396,7 +342,7 @@ platform: enable_network_monitoring: false # Process collection method - collection_method: sysinfo # sysinfo, endpoint_security, hybrid + collection_method: sysinfo # Privilege requirements privileges: @@ -407,255 +353,7 @@ platform: drop_privileges: true ``` -## Business Tier Configuration - -### Security Center - -```yaml -business_tier: - # License configuration - license: - # License key - key: ${DAEMONEYE_LICENSE_KEY} - - # License validation endpoint (optional) - validation_endpoint: - - # Offline validation only - offline_only: true - - # Security Center configuration - security_center: - # Enable Security Center - enabled: false - - # Security Center endpoint - endpoint: https://security-center.example.com:8443 - - # Client certificate path - client_cert_path: /etc/daemoneye/agent.crt - - # Client key path - client_key_path: /etc/daemoneye/agent.key - - # CA certificate path - ca_cert_path: /etc/daemoneye/ca.crt - - # Connection timeout in seconds - connection_timeout_secs: 30 - - # Heartbeat interval in seconds - heartbeat_interval_secs: 30 - - # Retry configuration - retry: - max_attempts: 3 - base_delay_ms: 1000 - max_delay_ms: 30000 - backoff_multiplier: 2.0 -``` - -### Rule Packs - -```yaml -business_tier: - # Rule pack configuration - rule_packs: - # Enable automatic updates - auto_update: true - - # Update interval in hours - update_interval_hours: 24 - - # Rule pack sources - sources: - - name: official - url: https://rules.daemoneye.com/packs/ - signature_key: ed25519:public-key - enabled: true - - - name: custom - url: https://internal-rules.company.com/ - signature_key: 
ed25519:custom-key - enabled: true - - # Local rule pack directory - local_directory: /etc/daemoneye/rule-packs - - # Signature validation - signature_validation: - enabled: true - strict_mode: true - allowed_keys: [ed25519:official-key, ed25519:custom-key] -``` - -### Enhanced Connectors - -```yaml -business_tier: - # Enhanced output connectors - enhanced_connectors: - # Splunk HEC connector - splunk_hec: - enabled: false - endpoint: https://splunk.example.com:8088/services/collector - token: ${SPLUNK_HEC_TOKEN} - index: daemoneye - source_type: daemoneye:alert - sourcetype: daemoneye:alert - batch_size: 100 - batch_timeout_ms: 5000 - - # Elasticsearch connector - elasticsearch: - enabled: false - hosts: [https://elastic.example.com:9200] - username: ${ELASTIC_USERNAME} - password: ${ELASTIC_PASSWORD} - index_pattern: daemoneye-{YYYY.MM.DD} - pipeline: daemoneye-alerts - batch_size: 1000 - batch_timeout_ms: 10000 - - # Kafka connector - kafka: - enabled: false - brokers: [kafka.example.com:9092] - topic: daemoneye.alerts - security_protocol: SASL_SSL - sasl_mechanism: PLAIN - sasl_username: ${KAFKA_USERNAME} - sasl_password: ${KAFKA_PASSWORD} - batch_size: 100 - batch_timeout_ms: 5000 -``` - -## Enterprise Tier Configuration - -### Kernel Monitoring - -```yaml -enterprise_tier: - # Kernel monitoring configuration - kernel_monitoring: - # Enable kernel monitoring - enabled: false - - # Monitoring method - method: auto # auto, ebpf, etw, endpoint_security, disabled - - # eBPF configuration (Linux) - ebpf: - enabled: false - program_path: /usr/lib/daemoneye/daemoneye_monitor.o - ring_buffer_size: 2097152 # 2MB - max_events_per_second: 10000 - - # ETW configuration (Windows) - etw: - enabled: false - session_name: DaemonEye - buffer_size_kb: 128 - max_buffers: 200 - providers: - - name: Microsoft-Windows-Kernel-Process - guid: 22FB2CD6-0E7B-422B-A0C7-2FAD1FD0E716 - level: 5 - keywords: 0xFFFFFFFFFFFFFFFF - - # EndpointSecurity configuration (macOS) - 
endpoint_security: - enabled: false - event_types: - - ES_EVENT_TYPE_NOTIFY_EXEC - - ES_EVENT_TYPE_NOTIFY_FORK - - ES_EVENT_TYPE_NOTIFY_EXIT - - ES_EVENT_TYPE_NOTIFY_OPEN - - ES_EVENT_TYPE_NOTIFY_CLOSE -``` - -### Federation - -```yaml -enterprise_tier: - # Federation configuration - federation: - # Enable federation - enabled: false - - # Federation tier - tier: agent # agent, regional, primary - - # Regional Security Center - regional_center: - endpoint: https://regional-center.example.com:8443 - certificate_path: /etc/daemoneye/regional.crt - key_path: /etc/daemoneye/regional.key - - # Primary Security Center - primary_center: - endpoint: https://primary-center.example.com:8443 - certificate_path: /etc/daemoneye/primary.crt - key_path: /etc/daemoneye/primary.key - - # Data synchronization - sync: - # Sync interval in minutes - interval_minutes: 5 - - # Sync batch size - batch_size: 1000 - - # Enable compression - compression: true - - # Enable encryption - encryption: true -``` - -### STIX/TAXII Integration - -```yaml -enterprise_tier: - # STIX/TAXII configuration - stix_taxii: - # Enable STIX/TAXII integration - enabled: false - - # TAXII servers - servers: - - name: threat-intel-server - url: https://threat-intel.example.com/taxii2/ - username: ${TAXII_USERNAME} - password: ${TAXII_PASSWORD} - collections: [malware-indicators, attack-patterns] - - # Polling configuration - polling: - # Poll interval in minutes - interval_minutes: 60 - - # Maximum indicators per poll - max_indicators: 10000 - - # Indicator confidence threshold - min_confidence: 50 - - # Indicator conversion - conversion: - # Convert STIX indicators to detection rules - auto_convert: true - - # Rule template for converted indicators - rule_template: stix-indicator-{id} - - # Rule severity mapping - severity_mapping: - low: low - medium: medium - high: high - critical: critical -``` +Kernel-level collection (EndpointSecurity) is available in commercial tiers. 
## Environment Variables @@ -683,41 +381,6 @@ export DAEMONEYE_ENABLE_ETW=false export DAEMONEYE_ENABLE_ENDPOINT_SECURITY=false ``` -### Business Tier Variables - -```bash -# Security Center -export DAEMONEYE_SECURITY_CENTER_ENABLED=false -export DAEMONEYE_SECURITY_CENTER_ENDPOINT=https://security-center.example.com:8443 -export DAEMONEYE_CLIENT_CERT_PATH=/etc/daemoneye/agent.crt -export DAEMONEYE_CLIENT_KEY_PATH=/etc/daemoneye/agent.key - -# Enhanced connectors -export SPLUNK_HEC_TOKEN=your-splunk-token -export ELASTIC_USERNAME=your-elastic-username -export ELASTIC_PASSWORD=your-elastic-password -export KAFKA_USERNAME=your-kafka-username -export KAFKA_PASSWORD=your-kafka-password -``` - -### Enterprise Tier Variables - -```bash -# Kernel monitoring -export DAEMONEYE_KERNEL_MONITORING_ENABLED=false -export DAEMONEYE_EBPF_ENABLED=false -export DAEMONEYE_ETW_ENABLED=false -export DAEMONEYE_ENDPOINT_SECURITY_ENABLED=false - -# Federation -export DAEMONEYE_FEDERATION_ENABLED=false -export DAEMONEYE_REGIONAL_CENTER_ENDPOINT=https://regional.example.com:8443 - -# STIX/TAXII -export TAXII_USERNAME=your-taxii-username -export TAXII_PASSWORD=your-taxii-password -``` - ## Configuration Examples ### Basic Production Configuration diff --git a/spec/daemon_eye_spec_sql_to_ipc_detection_architecture.md b/spec/daemon_eye_spec_sql_to_ipc_detection_architecture.md index ae90d48d..0d4270e6 100644 --- a/spec/daemon_eye_spec_sql_to_ipc_detection_architecture.md +++ b/spec/daemon_eye_spec_sql_to_ipc_detection_architecture.md @@ -1264,6 +1264,20 @@ This approach maintains full SQLite dialect compatibility while adding DaemonEye ## 9) Storage & Execution Model +> **Superseded by ADR-0006 — Detection Query Execution (redb + DataFusion) (2026-04-17).** +> +> The "custom operator pipeline" direction in §11.2 is retained below for historical context and for its redb schema / indexing guidance (§11.6–11.7), which remains authoritative.
However, Phase 2 SQL execution is no longer a hand-rolled operator pipeline — it is Apache DataFusion layered over redb via per-collector `TableProvider` implementations. See ADR-0006 for rationale, alternatives considered (sled, GlueSQL, Turso, rusqlite/duckdb, Polars), and the compile-time contract between the dialect lowering stage and DataFusion-compatible SQL. +> +> Sections that remain authoritative after ADR-0006: +> +> - §11.5 Smart Joins (INLJ / SHJ / MRC strategies) +> - §11.6 Write-Through & Persistence Semantics +> - §11.7 redb Performance Playbook (partitioning, key encoding, indexes, writer architecture) +> +> Sections that are superseded: +> +> - §11.1–§11.4 (Why Not a Full RDBMS, Chosen Approach: Operator Pipeline, Store Abstraction, Operator Examples) — replaced by ADR-0006's "redb per-domain tables + DataFusion TableProvider" model. + ### 11.1 Why Not a Full RDBMS? - Embedded SQL engines like SQLite are heavyweight, require unsafe code, and don't align with the zero-network, operator-focused design. diff --git a/spec/procmond/index.md b/spec/procmond/index.md deleted file mode 100644 index 5fa4299c..00000000 --- a/spec/procmond/index.md +++ /dev/null @@ -1,95 +0,0 @@ -# Procmond Implementation Epic - Ticket Index - -- **Epic**: Complete Procmond Implementation -- **Related Issues**: #39, #89, #40, #103, #64 - -## Ticket Completion Order - -Execute tickets in order. Each ticket's dependencies must be complete before starting. 
- -### Phase 1: Event Bus Integration - -- [x] **Ticket 1**: [Implement Write-Ahead Log and Event Bus Connector](./tickets/Implement_Write-Ahead_Log_and_Event_Bus_Connector.md) - - ✅ WAL component (verified existing implementation meets all criteria) - - ✅ EventBusConnector with WAL integration - - ✅ Event buffering (10MB) and replay - - ✅ Dynamic backpressure (70% threshold) - -### Phase 2: RPC and Lifecycle Management - -- [x] **Ticket 2**: [Implement Actor Pattern and Startup Coordination](./tickets/Implement_Actor_Pattern_and_Startup_Coordination.md) - - - Actor pattern in ProcmondMonitorCollector - - Replace LocalEventBus with EventBusConnector - - Startup coordination ("begin monitoring" wait) - - Dynamic interval adjustment from backpressure - - *Requires: Ticket 1* - -- [x] **Ticket 3**: [Implement RPC Service and Registration Manager](<./tickets/Implement_RPC_Service_and_Registration_Manager_(procmond).md>) - - - RpcServiceHandler component - - RegistrationManager component - - Lifecycle operations (HealthCheck, UpdateConfig, GracefulShutdown) - - Heartbeat publishing (30s interval) - - *Requires: Ticket 2* - -- [x] **Ticket 4**: [Implement Agent Loading State and Heartbeat Detection](./tickets/Implement_Agent_Loading_State_and_Heartbeat_Detection.md) - - - Collector configuration format (agent.yaml) - - Loading state machine (Loading → Ready → Steady State) - - Heartbeat failure detection with escalating actions - - **Note**: This is daemoneye-agent work, not procmond - - *Requires: Tickets 2, 3* - -### Phase 3: Testing - -- [ ] **Ticket 5**: [Implement Comprehensive Test Suite](./tickets/Implement_Comprehensive_Test_Suite.md) - - Unit tests (>80% coverage) - - Integration tests (event bus, RPC, cross-platform) - - Chaos tests (connection failures, backpressure) - - Security tests (privilege escalation, injection, DoS) - - *Requires: Tickets 1, 2, 3, 4* - -### Phase 4: Hardening - -- [ ] **Ticket 6**: [Implement Security Hardening and Data 
Sanitization](./tickets/Implement_Security_Hardening_and_Data_Sanitization.md) - - Privilege detection (Linux caps, Windows tokens, macOS entitlements) - - Command-line and environment variable sanitization - - Security boundary validation - - Security test suite - - *Requires: Ticket 5* - -### Phase 5: Platform and Performance Validation - -- [ ] **Ticket 7**: [Validate FreeBSD Platform Support](./tickets/Validate_FreeBSD_Platform_Support.md) - - - Test FallbackProcessCollector on FreeBSD 13+ - - Document limitations (basic metadata only) - - Platform detection and capability reporting - - *Requires: Ticket 5* - -- [ ] **Ticket 8**: [Validate Performance and Optimize](./tickets/Validate_Performance_and_Optimize.md) - - - Benchmark process enumeration (\<100ms for 1,000 processes) - - Load test with 10,000+ processes - - Memory profiling (\<100MB sustained) - - CPU monitoring (\<5% sustained) - - Regression testing - - *Requires: Tickets 6, 7* - ---- - -## Reference Documents - -- [Epic Brief](./specs/Epic_Brief__Complete_Procmond_Implementation.md) -- [Core Flows](./specs/Core_Flows__Procmond_Process_Monitoring.md) -- [Tech Plan](./specs/Tech_Plan__Complete_Procmond_Implementation.md) - -## Success Criteria - -- [ ] Process enumeration works on Linux, macOS, Windows (full) and FreeBSD (basic) -- [ ] Event bus communication with daemoneye-agent is reliable -- [ ] Service lifecycle (start/stop/health) works via RPC -- [ ] Privilege boundaries enforced and validated -- [ ] Performance targets met (see Ticket 8) -- [ ] >80% unit test coverage, >90% critical path coverage diff --git a/spec/procmond/specs/Core_Flows__Procmond_Process_Monitoring.md b/spec/procmond/specs/Core_Flows__Procmond_Process_Monitoring.md deleted file mode 100644 index a0b37e24..00000000 --- a/spec/procmond/specs/Core_Flows__Procmond_Process_Monitoring.md +++ /dev/null @@ -1,683 +0,0 @@ -# Core Flows: Procmond Process Monitoring - -## Overview - -This document describes the core user flows for 
procmond, the process monitoring daemon in DaemonEye. These flows capture how operators interact with procmond through daemoneye-agent and how the system behaves during normal operation and failure scenarios. - -**Key Principles:** - -- Operators interact through daemoneye-agent/CLI, not directly with procmond -- procmond runs autonomously with minimal operator intervention -- Configuration is centrally managed and pushed from daemoneye-agent -- System validates connectivity before starting and adapts to runtime conditions - ---- - -## Flow 1: Initial Deployment and First-Run Setup - -**Description:** How operators set up procmond for the first time on a new system - -**Trigger:** Operator installs DaemonEye on a new system - -**Steps:** - -01. Operator installs DaemonEye package (deb, rpm, pkg, msi, or homebrew) -02. Installation creates default configuration files in system location -03. Operator reviews and adjusts configuration via `daemoneye-cli config show procmond` -04. Operator sets collection interval, metadata options, and resource limits -05. Operator validates configuration via `daemoneye-cli config validate` -06. Operator starts daemoneye-agent service (systemd, launchd, Windows Service) -07. daemoneye-agent starts embedded event bus broker -08. daemoneye-agent spawns procmond with validated configuration -09. procmond connects to event bus and registers capabilities -10. procmond performs initial process enumeration -11. Operator runs `daemoneye-cli health procmond` to verify setup -12. 
Operator sees "procmond: healthy" status confirming successful setup - -**First-Run Validation:** - -- Configuration syntax is valid -- procmond can connect to event bus -- Platform-specific collector initializes successfully -- Initial process enumeration completes -- Health check passes - -**Common First-Run Issues:** - -- **Insufficient privileges:** Operator sees permission errors; must run daemoneye-agent with appropriate privileges -- **Invalid configuration:** Operator sees validation errors; must correct configuration file -- **Event bus connection fails:** Operator sees connection timeout; must verify daemoneye-agent is running - ---- - -## Flow 2: System Startup and Initialization - -**Description:** How procmond starts up, connects to the event bus, and begins monitoring (subsequent starts after initial deployment) - -**Trigger:** daemoneye-agent starts procmond as part of system initialization - -**Steps:** - -01. daemoneye-agent starts its embedded event bus broker -02. daemoneye-agent spawns procmond process with configuration -03. procmond initializes logging and loads configuration from daemoneye-agent -04. procmond validates configuration parameters (intervals, limits, metadata options) -05. procmond attempts to connect to daemoneye-agent's event bus broker -06. **Decision Point:** If connection fails, procmond retries with exponential backoff (up to 3 attempts) -07. **Success Path:** procmond registers with broker, publishes registration message with capabilities -08. procmond initializes platform-specific collector (Linux/macOS/Windows/FreeBSD) -09. procmond performs initial health check and reports status to daemoneye-agent -10. procmond begins continuous monitoring loop -11. 
Operator sees "procmond: healthy" status in daemoneye-agent health report - -**Failure Paths:** - -- **Event bus unreachable:** procmond logs error, retries, then exits if all attempts fail; daemoneye-agent shows "procmond: disconnected" status -- **Invalid configuration:** procmond logs validation errors and exits; operator sees error in daemoneye-agent logs -- **Platform collector initialization fails:** procmond falls back to basic sysinfo collector; operator sees warning in health status - -```mermaid -sequenceDiagram - participant Operator - participant Agent as daemoneye-agent - participant Broker as Event Bus Broker - participant Procmond as procmond - - Operator->>Agent: Start DaemonEye system - Agent->>Broker: Initialize embedded broker - Broker-->>Agent: Broker ready - Agent->>Procmond: Start procmond with config - Procmond->>Procmond: Load configuration - Procmond->>Procmond: Validate parameters - Procmond->>Broker: Connect to event bus - alt Connection successful - Broker-->>Procmond: Connection established - Procmond->>Broker: Publish registration (capabilities) - Procmond->>Procmond: Initialize platform collector - Procmond->>Broker: Publish health status (healthy) - Procmond->>Procmond: Start monitoring loop - Agent->>Operator: Display "procmond: healthy" - else Connection failed - Broker-->>Procmond: Connection timeout - Procmond->>Procmond: Retry with backoff (3 attempts) - Procmond->>Procmond: Exit after retries exhausted - Agent->>Operator: Display "procmond: disconnected" - end -``` - ---- - -## Flow 3: Continuous Process Monitoring - -**Description:** The ongoing cycle of collecting process data and publishing events to the event bus - -**Trigger:** procmond's monitoring loop runs on configured interval (default: 30 seconds) - -**Steps:** - -01. procmond waits for next collection interval tick -02. procmond enumerates all running processes using platform-specific collector -03. 
procmond collects basic metadata (PID, name, executable path, command line, resource usage) -04. **Decision Point:** If enhanced metadata is enabled, collect platform-specific details (network connections, file descriptors, security contexts) -05. procmond compares current process list with previous snapshot (lifecycle tracking) -06. procmond identifies lifecycle events (process starts, stops, modifications) -07. procmond publishes process events to event bus topic `events.process.batch` -08. procmond publishes lifecycle events to topic `events.process.lifecycle` -09. **Decision Point:** If backpressure detected (event bus queue full), procmond slows down event publishing -10. procmond updates internal statistics (processes collected, events published, errors) -11. procmond stores audit trail in local database -12. Cycle repeats on next interval - -**Operator Visibility:** - -- No real-time visibility during normal operation -- Statistics available through daemoneye-agent health endpoint -- Errors logged and visible in daemoneye-agent status - -**Performance Expectations:** - -- Collection completes within interval (30 seconds default) -- Enumerate 1,000 processes in \<100ms -- Memory usage stays \<100MB -- CPU usage \<5% sustained - -```mermaid -sequenceDiagram - participant Procmond as procmond - participant Collector as Platform Collector - participant Tracker as Lifecycle Tracker - participant Broker as Event Bus - - loop Every collection interval - Procmond->>Collector: Enumerate processes - Collector-->>Procmond: Process list with metadata - Procmond->>Tracker: Compare with previous snapshot - Tracker-->>Procmond: Lifecycle events (starts/stops/changes) - Procmond->>Broker: Publish process batch (events.process.batch) - Procmond->>Broker: Publish lifecycle events (events.process.lifecycle) - alt Backpressure detected - Broker-->>Procmond: Queue full signal - Procmond->>Procmond: Slow down publishing rate - end - Procmond->>Procmond: Update statistics - 
Procmond->>Procmond: Store audit trail - end -``` - ---- - -## Flow 4: Suspicious Process Detection and Triggering - -**Description:** How procmond identifies suspicious processes and triggers deeper analysis by other collectors - -**Trigger:** Suspicious process detected during lifecycle tracking (PID reuse, unsigned binary, anomalous behavior) - -**Steps:** - -1. procmond detects suspicious process during lifecycle analysis -2. procmond evaluates configured detection rules (operator-defined via daemoneye-agent) -3. **Decision Point:** Rule matches determine if process is suspicious -4. procmond creates trigger request with priority (Low/Normal/High/Critical) -5. procmond publishes trigger request to event bus topic `control.collector.task.{collector_type}.{id}` -6. daemoneye-agent receives trigger request and routes to appropriate collector (e.g., binary hasher) -7. Target collector performs analysis and publishes results back to event bus -8. procmond continues monitoring without waiting for analysis completion -9. Operator can review triggered analyses through daemoneye-cli query interface - -**Operator Configuration:** - -- Operators define detection rules through daemoneye-agent configuration -- Rules specify conditions (unsigned binaries, network connections, privilege escalation) -- Rules specify which collectors to trigger (binary hasher, memory analyzer, etc.) 
- -**Example Scenarios:** - -- **Unsigned binary detected:** Trigger binary hasher for integrity verification -- **PID reuse detected:** Trigger behavioral analysis for anomaly detection -- **Privilege escalation:** Trigger memory analyzer for credential dumping detection - -```mermaid -sequenceDiagram - participant Procmond as procmond - participant Tracker as Lifecycle Tracker - participant Rules as Detection Rules - participant Broker as Event Bus - participant Agent as daemoneye-agent - participant Analyzer as Analysis Collector - - Procmond->>Tracker: Detect process changes - Tracker-->>Procmond: Suspicious event (PID reuse) - Procmond->>Rules: Evaluate detection rules - Rules-->>Procmond: Rule matched: trigger binary hasher - Procmond->>Broker: Publish trigger request (High priority) - Broker->>Agent: Route trigger request - Agent->>Analyzer: Start binary hash analysis - Analyzer->>Broker: Publish analysis results - Note over Procmond: Continues monitoring independently -``` - ---- - -## Flow 5: Configuration Update - -**Description:** How operators update procmond's configuration through daemoneye-cli - -**Trigger:** Operator modifies procmond configuration (e.g., change collection interval, enable enhanced metadata) - -**Steps:** - -01. Operator updates configuration via daemoneye-cli: `daemoneye-cli config update procmond --interval=60 --enhanced-metadata=true` -02. daemoneye-agent validates new configuration parameters -03. daemoneye-agent publishes configuration update to event bus topic `control.collector.config` -04. procmond receives configuration update message -05. procmond validates new configuration (intervals, limits, feature flags) -06. **Decision Point:** If validation fails, procmond rejects update and reports error -07. **Success Path:** procmond applies new configuration without restarting -08. procmond adjusts monitoring behavior (new interval, metadata collection level) -09. procmond publishes configuration acknowledgment to event bus -10. 
daemoneye-agent confirms configuration applied successfully -11. Operator sees "Configuration updated successfully" message - -**Configuration Changes Supported:** - -- Collection interval adjustment (5-3600 seconds) -- Enhanced metadata toggle (on/off) -- Executable hashing toggle (on/off) -- Maximum processes per cycle limit -- Detection rule updates - -**No Restart Required:** - -- Configuration changes apply to next collection cycle -- No service interruption or process restart needed - ---- - -## Flow 6: Health Monitoring and Status Reporting - -**Description:** How operators monitor procmond's health and diagnose issues - -**Trigger:** Operator checks system health via daemoneye-cli or daemoneye-agent reports degraded status - -**Steps:** - -1. Operator runs health check command through daemoneye-cli - -2. daemoneye-cli queries daemoneye-agent for component health - -3. daemoneye-agent requests health status from procmond via event bus - -4. procmond performs self-health check: - - - Verify event bus connectivity - - Check collection cycle success rate - - Validate resource usage (memory, CPU) - - Check for consecutive failures - -5. procmond publishes health status to topic `control.health.status` - -6. daemoneye-agent aggregates health data and returns to CLI - -7. 
Operator sees health report with status indicators: - - - **Healthy:** All checks passing, normal operation - - **Degraded:** Some issues but still functional (e.g., enhanced metadata unavailable) - - **Unhealthy:** Critical issues requiring intervention (e.g., event bus disconnected) - -**Health Indicators:** - -- Event bus connectivity status -- Collection success rate (last 10 cycles) -- Current resource usage (memory, CPU) -- Backpressure events count -- Last successful collection timestamp -- Platform collector status - -**Operator Actions Based on Status:** - -- **Healthy:** No action needed -- **Degraded:** Review warnings, consider configuration adjustments -- **Unhealthy:** Investigate errors, check logs, potentially restart daemoneye-agent - ---- - -## Flow 7: Error Handling and Recovery - -**Description:** How procmond handles failures and recovers gracefully - -### 7.1: Event Bus Connection Failure - -**Trigger:** procmond loses connection to daemoneye-agent's event bus broker - -**Steps:** - -1. procmond detects connection failure during event publishing -2. procmond logs error with connection details -3. procmond enters reconnection mode with exponential backoff -4. procmond buffers events locally (up to configured limit) -5. **Decision Point:** After 3 failed reconnection attempts, procmond reports critical failure -6. daemoneye-agent detects procmond disconnection via missing heartbeats -7. daemoneye-agent attempts to restart procmond -8. **Recovery:** When connection restored, procmond publishes buffered events -9. Operator sees "procmond: reconnected" status update - -### 7.2: Permission/Privilege Failure - -**Trigger:** procmond cannot access process information due to insufficient privileges - -**Steps:** - -1. procmond attempts to collect process metadata -2. Platform collector returns permission denied error -3. procmond logs specific process and permission error -4. 
**Decision Point:** If error is for single process, skip and continue; if systemic, report degraded status -5. procmond publishes partial results with error metadata -6. procmond reports degraded health status to daemoneye-agent -7. Operator sees warning in health report: "Limited process visibility due to permissions" -8. Operator can review logs to identify privilege requirements - -### 7.3: Performance Degradation and Backpressure - -**Trigger:** Collection takes too long or event bus cannot keep up with event volume - -**Steps:** - -1. procmond detects collection exceeding interval time or event bus backpressure -2. procmond activates circuit breaker after 5 consecutive backpressure events -3. procmond reduces event publishing rate (drops low-priority events) -4. procmond logs performance degradation with metrics -5. procmond publishes degraded health status -6. **Decision Point:** If degradation persists, procmond requests configuration adjustment from daemoneye-agent -7. daemoneye-agent may increase collection interval or disable enhanced metadata -8. Operator sees "procmond: degraded (performance)" in health status -9. Operator can review performance metrics and adjust configuration - -### 7.4: Platform-Specific Enumeration Failure - -**Trigger:** Platform-specific collector fails (e.g., procfs unavailable on Linux, WinAPI error on Windows) - -**Steps:** - -1. Platform-specific collector encounters error during enhanced metadata collection -2. procmond logs platform-specific error details -3. procmond falls back to basic sysinfo collector -4. procmond continues with reduced metadata (no network connections, file descriptors, etc.) -5. procmond publishes events with "degraded_metadata" flag -6. procmond reports degraded health status with reason -7. Operator sees "procmond: degraded (limited metadata)" in health report -8. 
Operator can investigate platform-specific issues (missing kernel modules, security policies) - -### 7.5: Resource Exhaustion - -**Trigger:** procmond approaches memory or CPU limits - -**Steps:** - -1. procmond monitors its own resource usage -2. procmond detects memory usage approaching limit (>90MB of 100MB budget) -3. procmond reduces buffer sizes and clears old snapshots -4. procmond disables enhanced metadata collection temporarily -5. procmond logs resource exhaustion warning -6. **Decision Point:** If resource usage continues to grow, procmond requests restart from daemoneye-agent -7. daemoneye-agent gracefully restarts procmond -8. Operator sees "procmond: restarted (resource limits)" in event log -9. Operator can review resource usage trends and adjust limits - -```mermaid -flowchart TD - Start[Collection Cycle Start] --> Enumerate[Enumerate Processes] - Enumerate --> CheckSuccess{Collection<br/>Successful?} - - CheckSuccess -->|Yes| Lifecycle[Lifecycle Analysis] - CheckSuccess -->|No| CheckError{Error Type?} - - CheckError -->|Permission| PartialResults[Publish Partial Results] - CheckError -->|Platform| Fallback[Fall Back to Basic Collection] - CheckError -->|Timeout| Retry[Retry with Backoff] - - PartialResults --> ReportDegraded[Report Degraded Status] - Fallback --> ReportDegraded - Retry --> CheckRetries{Retries<br/>Exhausted?} - - CheckRetries -->|No| Enumerate - CheckRetries -->|Yes| ReportUnhealthy[Report Unhealthy Status] - - Lifecycle --> Publish[Publish Events to Bus] - Publish --> CheckBackpressure{Backpressure<br/>Detected?} - - CheckBackpressure -->|Yes| CircuitBreaker{Circuit Breaker<br/>Threshold?} - CheckBackpressure -->|No| UpdateStats[Update Statistics] - - CircuitBreaker -->|Activated| DropEvents[Drop Low-Priority Events] - CircuitBreaker -->|Not Yet| SlowDown[Slow Publishing Rate] - - DropEvents --> UpdateStats - SlowDown --> UpdateStats - - UpdateStats --> CheckResources{Resource Usage<br/>OK?} - - CheckResources -->|Yes| 
WaitInterval[Wait for Next Interval] - CheckResources -->|No| ReduceLoad[Reduce Memory/CPU Load] - - ReduceLoad --> WaitInterval - ReportDegraded --> WaitInterval - ReportUnhealthy --> RequestRestart[Request Restart from Agent] - - WaitInterval --> Start -``` - ---- - -## Flow 8: Graceful Shutdown - -**Description:** How procmond cleanly stops and releases resources - -**Trigger:** daemoneye-agent sends shutdown signal to procmond (system shutdown, maintenance, restart) - -**Steps:** - -01. daemoneye-agent publishes shutdown command to topic `control.collector.lifecycle` -02. procmond receives shutdown signal -03. procmond stops accepting new collection cycles -04. procmond completes current collection cycle if in progress (with 30-second timeout) -05. procmond publishes any buffered events to event bus -06. procmond flushes audit trail to local database -07. procmond publishes deregistration message to event bus -08. procmond closes event bus connection -09. procmond releases platform-specific resources (file handles, memory) -10. procmond exits with success code -11. daemoneye-agent confirms procmond stopped cleanly -12. Operator sees "procmond: stopped" status - -**Timeout Handling:** - -- If shutdown takes >30 seconds, daemoneye-agent forcefully terminates procmond -- Operator sees "procmond: force stopped" warning in logs - ---- - -## Flow 9: Operator Troubleshooting - -**Description:** How operators diagnose and resolve procmond issues - -**Trigger:** Operator notices degraded or unhealthy procmond status in daemoneye-agent health report - -**Steps:** - -1. Operator runs diagnostic command via daemoneye-cli: `daemoneye-cli health procmond --detailed` - -2. daemoneye-cli queries daemoneye-agent for procmond diagnostics - -3. daemoneye-agent requests detailed health report from procmond via event bus - -4. 
procmond gathers diagnostic information: - - - Recent error messages and stack traces - - Collection cycle statistics (success rate, latency) - - Resource usage trends (memory, CPU over time) - - Event bus connectivity status - - Platform collector status and capabilities - -5. procmond publishes diagnostic report to topic `control.health.diagnostics` - -6. daemoneye-agent formats and returns diagnostic data to CLI - -7. Operator reviews diagnostic output showing: - - - **Status:** Current health state with reason - - **Statistics:** Collection cycles, events published, errors - - **Resources:** Memory usage, CPU usage, buffer sizes - - **Connectivity:** Event bus connection status, last successful publish - - **Recent Errors:** Last 10 errors with timestamps and context - -8. **Decision Point:** Based on diagnostics, operator takes action: - - - **Permission errors:** Adjust procmond privileges or security policies - - **Performance issues:** Increase collection interval or disable enhanced metadata - - **Connectivity issues:** Check daemoneye-agent status and event bus health - - **Resource exhaustion:** Increase resource limits or reduce collection scope - -09. Operator applies configuration changes through daemoneye-agent -10. 
Operator monitors health status to confirm issue resolved - -**Common Troubleshooting Scenarios:** - -| Issue | Diagnostic Indicator | Operator Action | -| ----------------- | ----------------------------------- | ----------------------------------------------- | -| High error rate | Collection success rate \<80% | Review error logs, check permissions | -| Backpressure | Backpressure events >100/hour | Increase interval, reduce metadata | -| Memory growth | Memory usage trending upward | Reduce max processes, disable enhanced metadata | -| Missing events | Events published but not received | Check daemoneye-agent event bus health | -| Platform failures | Platform collector status: degraded | Check OS-specific requirements (procfs, WinAPI) | - ---- - -## Flow 10: Detection Rule Configuration - -**Description:** How operators configure what procmond considers "suspicious" and what analysis to trigger - -**Trigger:** Operator wants to add or modify detection rules for suspicious process behavior - -**Steps:** - -01. Operator defines detection rule via daemoneye-cli: `daemoneye-cli rules add --name="unsigned-binary" --condition="code_signed=false" --trigger="binary_hasher" --priority=high` -02. Rule specifies conditions (e.g., "unsigned binary", "network connection to suspicious IP", "privilege escalation") -03. Rule specifies which collector to trigger (binary hasher, memory analyzer, network analyzer) -04. Rule specifies priority level (Low/Normal/High/Critical) -05. Operator saves rule configuration -06. daemoneye-agent validates rule syntax and feasibility -07. daemoneye-agent publishes rule update to procmond via topic `control.collector.config` -08. procmond receives and validates rule update -09. procmond applies new rules to lifecycle tracker -10. procmond acknowledges rule update to daemoneye-agent -11. Operator sees "Detection rules updated successfully" confirmation -12. 
procmond begins applying new rules in next collection cycle - -**Rule Examples:** - -- "Trigger binary hasher for any unsigned executable" -- "Trigger network analyzer for processes with >10 network connections" -- "Trigger memory analyzer for processes with privilege escalation" -- "Trigger behavioral analyzer for PID reuse events" - ---- - -## Flow 11: Cross-Platform Behavior - -**Description:** How procmond adapts to different operating systems while maintaining consistent operator experience - -**Trigger:** procmond starts on different platforms (Linux, macOS, Windows, FreeBSD) - -**Platform-Specific Behaviors:** - -### Linux - -1. procmond detects Linux platform during initialization -2. procmond initializes Linux-specific collector with procfs access -3. procmond collects enhanced metadata: network connections, file descriptors, cgroups, namespaces -4. procmond respects SELinux/AppArmor restrictions -5. Operator sees full metadata in process events - -### macOS - -1. procmond detects macOS platform during initialization -2. procmond initializes macOS-specific collector with BSD sysctl -3. procmond collects enhanced metadata: code signing info, sandbox profiles -4. procmond respects System Integrity Protection (SIP) boundaries -5. Operator sees macOS-specific metadata (code signing, sandboxing) - -### Windows - -1. procmond detects Windows platform during initialization -2. procmond initializes Windows-specific collector with WinAPI -3. procmond collects enhanced metadata: session IDs, handle counts, integrity levels, WOW64 status -4. procmond respects UAC and Windows security boundaries -5. Operator sees Windows-specific metadata (sessions, integrity levels) - -### FreeBSD - -1. procmond detects FreeBSD platform during initialization -2. procmond initializes basic sysinfo collector (enhanced collector not available) -3. procmond collects basic metadata only (PID, name, paths, resource usage) -4. 
procmond reports degraded status: "Enhanced metadata not available on FreeBSD" -5. Operator sees warning about limited metadata but monitoring continues - -**Operator Experience:** - -- Consistent health status and error reporting across all platforms -- Platform-specific metadata differences documented in health report -- Same configuration interface regardless of platform -- Graceful degradation on secondary platforms (FreeBSD) - ---- - -## Flow 12: Performance Optimization Cycle - -**Description:** How operators tune procmond performance based on system load and requirements - -**Trigger:** Operator notices performance issues or wants to optimize resource usage - -**Steps:** - -1. Operator reviews performance metrics via daemoneye-cli: `daemoneye-cli metrics procmond` -2. daemoneye-cli displays performance dashboard: - -- Collection latency (average, p95, p99) -- Memory usage trend -- CPU usage trend -- Event publishing rate -- Backpressure frequency - -3. Operator identifies performance bottleneck: - -- **High latency:** Collection taking too long -- **High memory:** Too many processes or snapshots -- **Backpressure:** Event bus can't keep up - -4. Operator adjusts configuration through daemoneye-agent: - -- Increase collection interval (reduce frequency) -- Disable enhanced metadata (reduce per-process overhead) -- Reduce max processes per cycle (limit scope) -- Disable executable hashing (reduce CPU usage) - -5. daemoneye-agent pushes configuration update to procmond -6. procmond applies changes and adjusts behavior -7. Operator monitors metrics to confirm improvement -8. **Decision Point:** If performance improves, keep changes; if not, try different adjustments -9. 
Operator iterates until performance meets targets - -**Performance Targets:** - -- Collection latency \<100ms for 1,000 processes -- Memory usage \<100MB sustained -- CPU usage \<5% sustained -- Zero backpressure events under normal load - ---- - -## Summary of Key Flows - -| Flow | Operator Involvement | Frequency | Criticality | -| --------------------- | ------------------------------------ | ------------------------ | ----------- | -| Initial Deployment | Direct (via daemoneye-cli) | Once per system | Critical | -| System Startup | Indirect (via daemoneye-agent) | Once per deployment | Critical | -| Continuous Monitoring | None (autonomous) | Every 30s (configurable) | Critical | -| Suspicious Detection | None (automatic) | As events occur | High | -| Configuration Update | Direct (via daemoneye-agent) | Occasional | Medium | -| Health Monitoring | Direct (via daemoneye-cli) | On-demand | High | -| Error Recovery | Automatic (with operator escalation) | As failures occur | Critical | -| Graceful Shutdown | Indirect (via daemoneye-agent) | Rare | Medium | -| Troubleshooting | Direct (via daemoneye-cli) | When issues arise | High | -| Rule Configuration | Direct (via daemoneye-agent) | Occasional | Medium | -| Cross-Platform | None (automatic adaptation) | Platform-dependent | High | -| Performance Tuning | Direct (via daemoneye-agent) | Periodic | Medium | - ---- - -## Operator Touchpoints - -**Primary Interface:** daemoneye-cli (for configuration, health checks, diagnostics, and metrics) - -**Secondary Interface:** daemoneye-agent (for lifecycle management - start/stop/restart) - -**No Direct Interface:** Operators do not interact with procmond directly; all interactions are mediated through daemoneye-agent - -**Visibility Channels:** - -- Health status reports (healthy/degraded/unhealthy) -- Error logs (structured logging via daemoneye-agent) -- Performance metrics (via daemoneye-cli) -- Diagnostic reports (detailed troubleshooting data) - ---- - -## 
References - -- Epic Brief: spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- Event Bus Architecture: file:docs/embedded-broker-architecture.md -- Topic Hierarchy: file:daemoneye-eventbus/docs/topic-hierarchy.md -- Process Collector Implementation: file:procmond/src/process_collector.rs -- Lifecycle Tracking: file:procmond/src/lifecycle.rs -- Monitor Collector: file:procmond/src/monitor_collector.rs diff --git a/spec/procmond/specs/Epic_Brief__Complete_Procmond_Implementation.md b/spec/procmond/specs/Epic_Brief__Complete_Procmond_Implementation.md deleted file mode 100644 index 6731dad4..00000000 --- a/spec/procmond/specs/Epic_Brief__Complete_Procmond_Implementation.md +++ /dev/null @@ -1,281 +0,0 @@ -# Epic Brief: Complete Procmond Implementation - -## Summary - -DaemonEye requires a production-ready process monitoring daemon (procmond) that serves as the foundation for security monitoring across Linux, macOS, Windows, and FreeBSD platforms. While core process enumeration functionality exists for primary platforms, the implementation needs architectural refinement, FreeBSD support, security hardening, performance validation, and integration with the daemoneye-agent service orchestrator. This Epic covers completing the procmond implementation as a Monitor Collector that continuously observes system processes, detects suspicious activity, and triggers analysis collectors through an event-driven architecture. The work includes finishing platform-specific features, implementing IPC communication with daemoneye-agent, hardening security boundaries, validating performance against targets, and achieving comprehensive test coverage to deliver a reliable, secure, and performant process monitoring foundation for the DaemonEye security platform. 
- -## Context & Problem - -### Who's Affected - -**Primary Users:** - -- **Security Operations Teams**: Need reliable, real-time process monitoring to detect threats and investigate incidents across heterogeneous infrastructure -- **System Administrators**: Require low-overhead monitoring that works consistently across Linux, macOS, Windows, and FreeBSD environments -- **Compliance Officers**: Need tamper-evident audit trails and comprehensive process metadata for regulatory requirements -- **DevOps Engineers**: Need monitoring that operates in air-gapped, containerized, and high-security environments - -**Secondary Users:** - -- **Incident Response Teams**: Depend on accurate process genealogy and metadata for forensic analysis -- **Threat Hunters**: Need rich process context (network connections, file descriptors, security contexts) for proactive threat hunting - -### Current Pain Points - -**Incomplete Platform Coverage** - -- FreeBSD support is missing or incomplete, limiting deployment options for security-conscious organizations that standardize on BSD systems -- Platform-specific metadata collection varies in depth, creating inconsistent detection capabilities across environments -- Secondary platform support (Solaris, AIX) is undefined, blocking enterprise adoption in heterogeneous data centers - -**Architectural Gaps** - -- Process collectors exist but lack full integration with the daemoneye-agent service orchestrator, preventing complete lifecycle management and health monitoring -- Event bus communication between procmond and daemoneye-agent needs refinement for the event-driven architecture -- Monitor Collector behavior (continuous operation, event generation, triggering other collectors) requires completion -- Privilege separation between privileged process enumeration and unprivileged detection logic needs validation - -**Security Concerns** - -- Privilege management is incomplete - unclear when elevated privileges are required and how to drop them 
safely -- Data sanitization (command-line arguments, environment variables) is not consistently applied, risking exposure of secrets in logs -- Security boundaries between procmond (privileged) and daemoneye-agent (unprivileged) are not enforced -- No validation that the implementation meets least-privilege principles - -**Performance Uncertainty** - -- Performance benchmarks haven't been validated against targets (e.g., enumerate 1,000 processes in \<100ms) -- No load testing with 10,000+ processes to validate scalability claims -- Memory usage and CPU overhead under sustained operation are unverified -- No regression testing to prevent performance degradation - -**Testing Gaps** - -- Test coverage is below target thresholds (\<80% unit, \<90% critical paths) -- Cross-platform integration tests are incomplete or missing -- Security testing (privilege escalation, injection attacks, DoS) is insufficient -- No chaos testing for resilience validation - -### Where in the Product - -This Epic affects the **core monitoring foundation** of DaemonEye: - -**Component Hierarchy:** - -``` -DaemonEye Platform -├── daemoneye-agent (service orchestrator) -│ └── procmond (process monitor) ← THIS EPIC -│ ├── Process Enumeration Engine -│ ├── Platform-Specific Collectors (Linux, macOS, Windows, FreeBSD) -│ ├── Event Detection & Triggering -│ └── Event Bus Integration Layer -├── daemoneye-cli (management interface) -└── Detection Engine (consumes procmond data) -``` - -**Integration Points:** - -- **Upstream**: Operating system APIs (procfs, WinAPI, BSD sysctl) -- **Downstream**: daemoneye-agent (lifecycle management, event bus), detection engine (SQL queries), alert system -- **Lateral**: Other collectors (binary hasher, memory analyzer) triggered by procmond events via event bus - -### Root Cause Analysis - -The current state reflects **incremental development without complete architectural integration**: - -1. 
**Phase 1 (Complete)**: Core process enumeration implemented using sysinfo crate with platform-specific enhancements -2. **Phase 2 (Incomplete)**: Integration with daemoneye-agent architecture and IPC communication -3. **Phase 3 (Not Started)**: Security hardening, performance validation, comprehensive testing -4. **Phase 4 (Not Started)**: Production readiness (observability, documentation, deployment) - -The gap exists because: - -- Initial focus was on proving cross-platform enumeration feasibility -- Architectural decisions for service orchestration (daemoneye-agent) and event bus integration evolved during development -- Security and performance requirements were deferred to avoid premature optimization -- Testing infrastructure development lagged behind feature implementation - -### Business Impact - -**Without completing this Epic:** - -- ❌ DaemonEye cannot be deployed in production environments (no service lifecycle management) -- ❌ FreeBSD users cannot adopt DaemonEye (platform gap) -- ❌ Security-conscious organizations cannot trust the implementation (unvalidated security boundaries) -- ❌ Performance claims are unverified (risk of production issues) -- ❌ Detection capabilities are inconsistent across platforms (metadata gaps) - -**With this Epic complete:** - -- ✅ Production-ready process monitoring across all target platforms -- ✅ Reliable service lifecycle management through daemoneye-agent -- ✅ Validated security boundaries and privilege separation -- ✅ Proven performance characteristics under load -- ✅ Comprehensive test coverage for confidence in reliability -- ✅ Foundation for event-driven security monitoring architecture - -## Scope - -### In Scope - -**Platform Completion** - -- Complete FreeBSD support with basic process enumeration (best-effort, documented limitations) -- Fill metadata gaps across primary platforms (Linux, macOS, Windows) - -**CLI Features (Basic)** - -- Health check commands for procmond status -- Diagnostic commands for 
troubleshooting -- Performance metrics display -- Configuration update commands -- Validate cross-platform consistency in data models and behavior - -**Architectural Integration** - -- Complete event bus integration between procmond and daemoneye-agent: - - Implement missing RPC patterns for lifecycle management (start/stop/restart) - - Add comprehensive error handling and reconnection logic for event bus failures - - Implement missing topic subscriptions and publishing patterns - - Performance optimization and load testing of event bus communication -- Integrate procmond with daemoneye-agent service lifecycle management -- Implement Monitor Collector behavior (continuous operation, event generation, triggering) -- Define and enforce privilege boundaries between components - -**Security Hardening** - -- Implement privilege detection and management (capabilities, tokens) -- Add data sanitization for command-line arguments and environment variables -- Validate security boundaries and least-privilege principles -- Create security test suite (privilege escalation, injection, DoS) - -**Performance Validation** - -- Benchmark process enumeration against targets (1,000 processes in \<100ms) -- Load test with 10,000+ processes -- Validate memory usage (\<100MB) and CPU overhead (\<5%) -- Implement performance regression testing - -**Testing & Quality** - -- Achieve >80% unit test coverage, >90% critical path coverage -- Create cross-platform integration test suite -- Implement chaos testing for resilience validation -- Add property-based testing for edge cases - -**Documentation** - -- Architecture documentation (component interactions, privilege model) -- Deployment guides (installation, configuration, troubleshooting) -- API documentation (ProcessCollector trait, data models) -- Security documentation (threat model, security controls) - -### Out of Scope - -**Deferred to Future Epics** - -- Advanced behavioral analysis and machine learning-based anomaly detection -- 
Real-time process monitoring with sub-second event detection -- Kernel-level monitoring (eBPF on Linux, ETW on Windows) -- Integration with external threat intelligence feeds -- Commercial features (Security Center, federated architecture) -- Support for secondary platforms beyond FreeBSD (Solaris, AIX) - -**Explicitly Not Included** - -- Detection rule authoring and management (separate Epic) -- Alert delivery and notification systems (separate Epic) -- Advanced CLI features (query interface, rule management, advanced diagnostics) (separate Epic) -- Database schema and storage layer (separate Epic) -- Enhanced FreeBSD metadata collection (deferred to future work) - -### Success Criteria - -**Functional Completeness** - -- ✅ Process enumeration works on Linux, macOS, Windows (full support) and FreeBSD (basic support) -- ✅ Platform-specific metadata collection is consistent on primary platforms (Linux, macOS, Windows) -- ✅ Event bus communication with daemoneye-agent is reliable and performant -- ✅ Service lifecycle (start, stop, restart, health checks) works correctly via RPC patterns -- ✅ Monitor Collector behavior (event generation, triggering) is functional -- ✅ Basic CLI commands for health checks and diagnostics are implemented - -**Security Validation** - -- ✅ Privilege boundaries are enforced and validated -- ✅ Data sanitization prevents secret exposure -- ✅ Security test suite passes with no critical vulnerabilities -- ✅ Least-privilege principles are documented and verified - -**Performance Targets** - -- ✅ Enumerate 1,000 processes in \<100ms (average, primary platforms) -- ✅ Support 10,000+ processes without degradation -- ✅ Memory usage \<100MB during normal operation -- ✅ CPU overhead \<5% during continuous monitoring - -**Quality Metrics** - -- ✅ >80% unit test coverage across all modules -- ✅ >90% critical path coverage: - - Process enumeration on all platforms - - Event bus communication (publish/subscribe/reconnection) - - Core monitoring loop and 
lifecycle detection - - All error handling and recovery paths - - Security boundaries (privilege management, data sanitization) -- ✅ Cross-platform integration tests pass on all target platforms -- ✅ Zero regressions in performance benchmarks - -**Operational Readiness** - -- ✅ Architecture documentation complete and reviewed -- ✅ Deployment guides tested on all platforms -- ✅ API documentation generated and published -- ✅ Security documentation reviewed by security team - -## Key Assumptions - -1. **Platform Support**: FreeBSD 13+ is the only secondary platform in scope; other BSDs and Unix variants are deferred -2. **Performance Targets**: Current targets (100ms for 1,000 processes) are based on typical deployment scenarios; extreme edge cases (100,000+ processes) are out of scope -3. **Security Model**: Privilege separation between procmond (elevated) and daemoneye-agent (unprivileged) is the correct architectural approach -4. **IPC Technology**: The daemoneye-eventbus is the IPC technology for procmond, as the previous IPC and RPC technologies are not used within procmond. -5. **Testing Infrastructure**: Existing CI/CD pipeline (GitHub Actions) is sufficient for cross-platform testing -6. 
**Timeline Flexibility**: Milestone dates are flexible and will be updated based on actual progress - -## Constraints - -**Technical Constraints** - -- Must maintain backward compatibility with existing ProcessRecord data model -- Must use Rust 2024 edition with MSRV 1.91+ -- Must follow workspace-level lints (unsafe_code = "forbid", warnings = "deny") -- Must integrate with existing collector-core framework and daemoneye-eventbus - -**Resource Constraints** - -- Single developer (unclesp1d3r) as primary contributor -- Limited access to FreeBSD testing infrastructure -- No dedicated security audit team (self-review required) - -**Operational Constraints** - -- Must support air-gapped deployments (no external dependencies at runtime) -- Must operate in containerized environments (Docker, Kubernetes) -- Must respect platform security boundaries (SELinux, AppArmor, SIP) - -## Related Work - -- **GitHub Issue #39**: Cross-platform process enumeration (foundation) -- **GitHub Issue #89**: Complete procmond implementation (parent issue) -- **GitHub Issue #40**: Binary hashing collector (triggered by procmond) -- **GitHub Issue #103**: daemoneye-agent service architecture (integration point) -- **GitHub Issue #64**: Core Tier Functionality Epic (broader context) - -## References - -- Architecture: file:.kiro/steering/structure.md -- Technical Stack: file:.kiro/steering/tech.md -- Development Guide: file:AGENTS.md -- Existing Implementation: file:procmond/src/ -- Data Models: file:daemoneye-lib/src/models/process.rs diff --git a/spec/procmond/specs/Tech_Plan__Complete_Procmond_Implementation.md b/spec/procmond/specs/Tech_Plan__Complete_Procmond_Implementation.md deleted file mode 100644 index 88cd7d47..00000000 --- a/spec/procmond/specs/Tech_Plan__Complete_Procmond_Implementation.md +++ /dev/null @@ -1,996 +0,0 @@ -# Tech Plan: Complete Procmond Implementation - -## Architectural Approach - -### 1. 
Core Architectural Decisions - -**Child Process Model** - -- procmond runs as a child process spawned by daemoneye-agent -- daemoneye-agent's CollectorProcessManager handles lifecycle (start/stop/restart) -- Configuration and broker socket path passed via environment variables -- Single service deployment model (operators manage daemoneye-agent only) - -**Startup Coordination (Agent Loading State)** - -- Broker starts before agent spawns collectors (eliminates race condition) -- Agent spawns all configured collectors with broker socket path via environment variable -- Collectors connect to broker and register via RPC -- Collectors report "ready" status after successful registration -- Agent waits for all collectors to report "ready" before dropping privileges -- Agent remains in "loading state" until all configured collectors are ready -- Agent reads collector configuration from config file (defines which collectors to spawn) -- Agent transitions to "steady state" and broadcasts "begin monitoring" to `control.collector.lifecycle` -- All collectors subscribe to `control.collector.lifecycle` and start collection loops on receiving command -- This ensures: (1) no race conditions, (2) agent drops privileges only when safe, (3) coordinated startup - -**Event-Driven Architecture** - -- Replace LocalEventBus with DaemoneyeEventBus for broker communication -- Use embedded broker pattern: daemoneye-agent runs DaemoneyeBroker, procmond connects as client -- Topic-based pub/sub for events: `events.process.*` hierarchy -- RPC patterns for lifecycle management: `control.collector.procmond` - -**Privilege Separation Model** - -- **daemoneye-agent**: Starts privileged, drops privileges after spawning collectors -- **procmond**: Maintains full privileges throughout runtime (restricted attack surface, no network) -- Rationale: procmond needs persistent elevated access for process enumeration; agent has network connectivity (larger attack surface) so drops privileges after 
initialization - -### 2. Integration Strategy - -**Phase 1: Event Bus Integration (Foundation)** - -- Direct refactoring: Replace LocalEventBus with DaemoneyeEventBus in ProcmondMonitorCollector -- Implement connection management with retry logic (3 attempts at startup, then exit) -- Add event buffering (10MB limit) with replay on reconnection -- Validate connectivity before starting collection (strict validation) - -**Phase 2: RPC Service Implementation (Lifecycle Management)** - -- Create RPC service handler in procmond to receive lifecycle commands -- Implement operations: Start, Stop, Restart, HealthCheck, UpdateConfig, GracefulShutdown -- Add registration/deregistration with daemoneye-agent on startup/shutdown -- Implement heartbeat publishing to `control.health.heartbeat.procmond` - -**Phase 3: Testing (TDD Approach)** - -- Unit tests for event bus integration (>80% coverage target) -- Integration tests for RPC communication -- Cross-platform tests (Linux, macOS, Windows) -- Chaos testing for resilience (connection failures, backpressure) - -**Phase 4: Security Hardening** - -- Implement privilege detection at startup (capabilities on Linux, tokens on Windows) -- Add data sanitization for command-line arguments and environment variables -- Validate security boundaries between procmond and agent -- Security test suite (privilege escalation, injection, DoS) - -**Phase 5: FreeBSD Support** - -- Validate FallbackProcessCollector on FreeBSD 13+ -- Document limitations (basic metadata only, no enhanced features) -- Add platform detection and capability reporting -- Best-effort support (doesn't block Epic completion) - -**Phase 6: Performance Validation** - -- Benchmark process enumeration (target: 1,000 processes in \<100ms) -- Load testing with 10,000+ processes -- Memory profiling (target: \<100MB sustained) -- CPU monitoring (target: \<5% sustained) -- Regression testing to prevent performance degradation - -### 3. 
Key Trade-offs and Rationale - -**Trade-off 1: Direct Refactoring vs. Parallel Implementation** - -- **Decision**: Direct refactoring (replace LocalEventBus in place) -- **Rationale**: Faster development velocity, simpler codebase, LocalEventBus is internal-only (no external dependencies) -- **Risk Mitigation**: Comprehensive testing before merging, feature branch development - -**Trade-off 2: Event Buffering with Write-Ahead Log** - -- **Decision**: Write-ahead log (WAL) with 10MB buffer and replay on reconnection -- **Rationale**: Prevents data loss during crashes or non-graceful termination, ensures event durability -- **Implementation**: Events persisted to disk before buffering, replayed on restart if procmond crashes -- **Risk Mitigation**: Bounded buffer size, WAL rotation to prevent disk exhaustion, backpressure when buffer full - -**Trade-off 3: Privilege Model** - -- **Decision**: procmond maintains full privileges, agent drops after spawning -- **Rationale**: procmond needs persistent elevated access; agent has larger attack surface (network connectivity) -- **Risk Mitigation**: procmond has no network access, minimal attack surface, runs as child process (isolated) - -**Trade-off 4: FreeBSD Support Level** - -- **Decision**: Best-effort basic enumeration, documented limitations -- **Rationale**: FreeBSD is secondary platform, full feature parity would delay primary platform completion -- **Risk Mitigation**: Clear documentation of limitations, graceful degradation - -### 4. 
Technical Constraints - -**Platform Constraints** - -- Must support Linux, macOS, Windows (primary), FreeBSD (secondary) -- Must respect platform security boundaries (SELinux, AppArmor, SIP, UAC) -- Must use platform-native APIs for process enumeration - -**Performance Constraints** - -- CPU usage \<5% sustained during continuous monitoring -- Memory usage \<100MB during normal operation -- Process enumeration \<100ms for 1,000 processes (average) -- Event publishing must handle backpressure gracefully - -**Security Constraints** - -- No unsafe code (workspace-level `unsafe_code = "forbid"`) -- All external inputs must be validated and sanitized -- Privilege boundaries must be enforced and tested -- Audit trail for all security-relevant operations - -**Compatibility Constraints** - -- Must maintain backward compatibility with ProcessRecord data model -- Must integrate with existing collector-core framework -- Must use Rust 2024 edition with MSRV 1.91+ -- Must follow workspace-level lints (`warnings = "deny"`) - -### 5. 
Deployment Architecture - -```mermaid -sequenceDiagram - participant Operator - participant Agent as daemoneye-agent - participant Broker as DaemoneyeBroker<br/>(embedded) - participant Procmond as procmond<br/>(child process) - participant OS as Operating System - - Note over Operator,OS: System Startup - - Operator->>Agent: Start daemoneye-agent (privileged) - Agent->>Broker: Initialize embedded broker - Broker-->>Agent: Broker ready (socket path) - - Agent->>Procmond: Spawn procmond (privileged)<br/>ENV: DAEMONEYE_BROKER_SOCKET - Procmond->>Broker: Connect to broker - Broker-->>Procmond: Connection established - - Procmond->>Broker: Register (RPC)<br/>Topic: control.collector.procmond - Broker->>Agent: Route registration - Agent->>Agent: Wait for all collectors ready - Agent-->>Broker: Registration accepted - Broker-->>Procmond: Registration response - - Agent->>Agent: Drop privileges (after collectors ready) - Agent->>Broker: Send "begin monitoring" command - Broker->>Procmond: Route start command - - Note over Procmond,OS: Continuous Monitoring - - loop Every collection interval - Procmond->>OS: Enumerate processes (privileged) - OS-->>Procmond: Process list with metadata - Procmond->>Procmond: Lifecycle analysis - Procmond->>Broker: Publish events<br/>Topic: events.process.* - Broker->>Agent: Deliver events - - Procmond->>Broker: Publish heartbeat<br/>Topic: control.health.heartbeat.procmond - end - - Note over Operator,OS: Lifecycle Management - - Operator->>Agent: Request health check (via CLI) - Agent->>Broker: Health check RPC<br/>Topic: control.collector.procmond - Broker->>Procmond: Route health check - Procmond-->>Broker: Health status - Broker-->>Agent: Health response - Agent-->>Operator: Display health status - - Note over Operator,OS: Graceful Shutdown - - Operator->>Agent: Stop daemoneye-agent - Agent->>Broker: Graceful shutdown RPC<br/>Topic: control.collector.procmond - Broker->>Procmond: Route shutdown - Procmond->>Procmond: Complete current 
cycle - Procmond->>Broker: Flush buffered events - Procmond->>Broker: Deregister - Procmond-->>Agent: Exit (success) - Agent->>Broker: Shutdown broker - Agent-->>Operator: Shutdown complete -``` - ---- - -## Data Model - -### 1. Existing Data Models (No Changes Required) - -**ProcessEvent (collector-core)** - -```rust -// Used for event bus communication -pub struct ProcessEvent { - pub pid: u32, - pub ppid: Option<u32>, - pub name: String, - pub executable_path: Option<String>, - pub command_line: Vec<String>, - pub start_time: Option<SystemTime>, - pub cpu_usage: Option<f64>, - pub memory_usage: Option<u64>, - pub executable_hash: Option<String>, - pub user_id: Option<String>, - pub accessible: bool, - pub file_exists: bool, - pub timestamp: SystemTime, - pub platform_metadata: Option<serde_json::Value>, -} -``` - -**ProcessRecord (daemoneye-lib)** - -```rust -// Used for database storage -pub struct ProcessRecord { - pub id: ProcessId, - pub name: String, - pub executable_path: Option<String>, - pub command_line: Option<String>, - pub parent_id: Option<ProcessId>, - pub start_time: Option<DateTime<Utc>>, - pub cpu_usage: Option<f64>, - pub memory_usage: Option<u64>, - pub status: ProcessStatus, - pub user_id: Option<String>, - pub executable_hash: Option<String>, - // ... 
additional fields -} -``` - -**ProcessSnapshot (procmond)** - -```rust -// Used for lifecycle tracking -pub struct ProcessSnapshot { - pub pid: u32, - pub ppid: Option<u32>, - pub name: String, - pub executable_path: Option<String>, - pub command_line: Vec<String>, - pub start_time: Option<SystemTime>, - pub cpu_usage: Option<f64>, - pub memory_usage: Option<u64>, - pub executable_hash: Option<String>, - pub user_id: Option<String>, - pub accessible: bool, - pub file_exists: bool, - pub snapshot_time: SystemTime, - pub platform_metadata: Option<serde_json::Value>, -} -``` - -**Conversion Functions (Already Exist)** - -- `ProcessEvent` ↔ `ProcessSnapshot`: Bidirectional conversion via `From` trait -- `ProcessRecord` ← `ProcessEvent`: One-way conversion for database storage - -### 2. New Configuration Models - -**EventBusConfig (New)** - -```rust -// Configuration for event bus connection -pub struct EventBusConfig { - pub broker_socket_path: String, // From DAEMONEYE_BROKER_SOCKET env var - pub connection_timeout: Duration, // Default: 10 seconds - pub event_buffer_size_bytes: usize, // Default: 10MB - pub heartbeat_interval: Duration, // Default: 30 seconds - pub enable_event_buffering: bool, // Default: true - pub wal_directory: PathBuf, // Write-ahead log directory - pub wal_max_size_bytes: usize, // Default: 100MB (10x buffer) - pub wal_rotation_threshold: f64, // Default: 0.8 (80% full) - pub backpressure_buffer_threshold: f64, // Default: 0.7 (70% full triggers backpressure) - pub backpressure_interval_multiplier: f64, // Default: 1.5 (increase interval by 50%) -} -``` - -**RpcServiceConfig (New)** - -```rust -// Configuration for RPC service -pub struct RpcServiceConfig { - pub collector_id: String, // Default: "procmond" - pub collector_type: String, // Default: "process-monitor" - pub registration_timeout: Duration, // Default: 10 seconds - pub health_check_timeout: Duration, // Default: 5 seconds - pub graceful_shutdown_timeout: Duration, // Default: 60 
seconds -} -``` - -**ActorMessage (New)** - -```rust -// Messages sent to ProcmondMonitorCollector actor -pub enum ActorMessage { - HealthCheck { - respond_to: oneshot::Sender<HealthCheckData>, - }, - UpdateConfig { - config: ProcmondMonitorConfig, - respond_to: oneshot::Sender<Result<()>>, - }, - GracefulShutdown { - respond_to: oneshot::Sender<Result<()>>, - }, - BeginMonitoring, // From control.collector.lifecycle broadcast - AdjustInterval { - new_interval: Duration, // From EventBusConnector backpressure - reason: BackpressureReason, // BufferFull, Reconnecting, etc. - }, -} - -pub enum BackpressureReason { - BufferFull { level_percent: f64 }, - Reconnecting, - WalRotation, -} -``` - -**WriteAheadLogEntry (New)** - -```rust -// Entry in the write-ahead log (bincode serialization) -pub struct WriteAheadLogEntry { - pub sequence: u64, // Monotonic sequence number - pub timestamp: SystemTime, // When event was written - pub event: ProcessEvent, // The actual event - pub checksum: u32, // CRC32 for corruption detection -} -``` - -**WAL File Format:** - -- Binary format using bincode serialization for efficiency -- Sequence-numbered files: `procmond-{sequence:05}.wal` (e.g., `procmond-00001.wal`) -- Each file contains multiple WriteAheadLogEntry records -- Rotation at 80% of max size (80MB of 100MB default) -- Delete WAL file after all events successfully published to broker -- Corruption handling: Skip corrupted entries (CRC32 validation), log warning, continue with next entry - -### 3. 
Event Bus Message Schemas - -**Registration Message** - -```rust -// Published to: control.collector.procmond (RPC) -pub struct RegistrationRequest { - pub collector_id: String, // "procmond" - pub collector_type: String, // "process-monitor" - pub hostname: String, // System hostname - pub version: Option<String>, // procmond version - pub pid: Option<u32>, // procmond PID - pub capabilities: Vec<String>, // ["process"] - pub attributes: HashMap<String, String>, // Platform-specific attributes - pub heartbeat_interval_ms: Option<u64>, // Requested heartbeat interval -} -``` - -**Heartbeat Message** - -```rust -// Published to: control.health.heartbeat.procmond -pub struct HeartbeatData { - pub collector_id: String, // "procmond" - pub timestamp: SystemTime, // Current time - pub sequence: u64, // Monotonic sequence number - pub status: HealthStatus, // Healthy/Degraded/Unhealthy -} -``` - -**Process Event Message** - -```rust -// Published to: events.process.batch or events.process.lifecycle -// Uses existing ProcessEvent struct (no changes needed) -``` - -### 4. 
Data Flow - -```mermaid -flowchart TD - A[OS Process APIs] -->|Raw Process Data| B[ProcessCollector] - B -->|ProcessEvent| C[LifecycleTracker] - C -->|ProcessSnapshot| C - C -->|ProcessLifecycleEvent| D[ProcmondMonitorCollector<br/>Actor] - D -->|ProcessEvent| E[EventBusConnector] - E -->|Persist| WAL[Write-Ahead Log<br/>Disk] - E -->|Buffer| F[Event Buffer<br/>10MB Memory] - F -->|Publish| G[DaemoneyeEventBus] - G -->|Topic: events.process.*| H[DaemoneyeBroker] - H -->|Deliver| I[daemoneye-agent] - I -->|ProcessRecord| J[Database] - - K[RPC Commands] -->|control.collector.procmond| H - H -->|Route| L[RpcServiceHandler] - L -->|Actor Messages| D - D -->|Oneshot Responses| L - - D -->|Heartbeat| M[RegistrationManager] - M -->|control.health.heartbeat.procmond| H - - WAL -.->|Replay on Restart| E - F -.->|Backpressure 70%| D - - style WAL fill:#ffa,stroke:#333,stroke-width:2px - style F fill:#f9f,stroke:#333,stroke-width:2px - style H fill:#bbf,stroke:#333,stroke-width:2px - style D fill:#afa,stroke:#333,stroke-width:2px -``` - ---- - -## Component Architecture - -### 1. 
New Components - -**WriteAheadLog (New)** - -- **Responsibility**: Durable event persistence for crash recovery -- **Location**: procmond/src/wal.rs -- **Key Functions**: - - Persist events to disk using bincode serialization (append-only log) - - Use sequence-numbered files: `procmond-{sequence:05}.wal` - - Rotate log files when size reaches 80% of max (80MB of 100MB default) - - Replay events from WAL on startup (crash recovery) - - Delete WAL files after all events successfully published to broker - - Handle WAL corruption (skip corrupted entries with CRC32 validation, log warning, continue) - - Track which events have been published (mark for deletion) - -**EventBusConnector (New)** - -- **Responsibility**: Manage connection to daemoneye-agent's embedded broker with durable event buffering -- **Location**: procmond/src/event_bus_connector.rs -- **Key Functions**: - - Connect to broker via socket path from `DAEMONEYE_BROKER_SOCKET` env var - - Integrate with WriteAheadLog for event persistence (write before buffering) - - Buffer events (10MB limit) when connection lost - - Replay buffered events (from WAL) on reconnection or restart - - Publish events to topic hierarchy (`events.process.*`) - - Dynamic backpressure: Monitor buffer level (70% threshold triggers backpressure) - - Send ActorMessage::AdjustInterval to MonitorCollector via shared channel reference - - Calculate new interval: current_interval * 1.5 (50% increase) - - Release backpressure when buffer drops below 50% (send AdjustInterval with original interval) - -**RpcServiceHandler (New)** - -- **Responsibility**: Handle incoming RPC requests and coordinate with MonitorCollector via actor pattern -- **Location**: procmond/src/rpc_service.rs -- **Key Functions**: - - Subscribe to `control.collector.procmond` topic (for RPC requests) - - Subscribe to `control.collector.lifecycle` topic (for "begin monitoring" broadcast) - - Handle lifecycle operations: Start, Stop, Restart, HealthCheck, UpdateConfig, 
GracefulShutdown - - Send messages to MonitorCollector actor via bounded mpsc channel (capacity: 100) - - Wait for MonitorCollector responses via oneshot channels - - Return RPC responses with appropriate status codes - - Handle channel full errors (return RPC error if actor channel full) - - Serialize concurrent RPC requests (process one at a time) - -**RegistrationManager (New)** - -- **Responsibility**: Handle collector registration and heartbeat publishing -- **Location**: procmond/src/registration.rs -- **Key Functions**: - - Register with daemoneye-agent on startup via RPC - - Report "ready" status after successful registration - - Publish periodic heartbeats to `control.health.heartbeat.procmond` (every 30 seconds) - - Include health status in heartbeat (Healthy/Degraded/Unhealthy) - - Deregister on graceful shutdown - - Track registration state and heartbeat sequence number - -**ConfigurationManager (Enhanced)** - -- **Responsibility**: Manage configuration with hot-reload support at cycle boundaries -- **Location**: procmond/src/config.rs (enhance existing) -- **Key Functions**: - - Load configuration from environment variables and config files - - Validate configuration changes via RPC - - Apply configuration updates at next collection cycle boundary (atomic) - - Send configuration change message to MonitorCollector actor - - Document which configurations are hot-reloadable vs. require restart - -### 2. 
Modified Components - -**ProcmondMonitorCollector (Modified)** - -- **Changes**: - - Replace `LocalEventBus` with `DaemoneyeEventBus` (via EventBusConnector) - - Implement actor pattern: Process messages from bounded mpsc channel (capacity: 100) - - Add configuration hot-reload at cycle boundaries (atomic application) - - Enhance health check to include event bus connectivity status - - Wait for "begin monitoring" broadcast on `control.collector.lifecycle` before starting collection loop - - Respond to dynamic interval adjustments from EventBusConnector backpressure - - Provide shared channel reference to EventBusConnector for backpressure signaling -- **Location**: file:procmond/src/monitor_collector.rs - -**main.rs (Modified)** - -- **Changes**: - - Read `DAEMONEYE_BROKER_SOCKET` environment variable - - Initialize WriteAheadLog with configured directory - - Initialize EventBusConnector with WAL integration - - Create bounded mpsc channel (capacity: 100) for actor messages - - Initialize RpcServiceHandler with channel sender and topic subscriptions - - Initialize RegistrationManager for registration and heartbeat - - Pass channel sender to EventBusConnector for backpressure signaling - - Initialize ProcmondMonitorCollector as actor with channel receiver - - Add graceful shutdown coordination with RPC -- **Location**: file:procmond/src/main.rs - -### 3. 
Component Interactions - -```mermaid -sequenceDiagram - participant Main as main.rs - participant Config as ConfigurationManager - participant EventBus as EventBusConnector - participant Reg as RegistrationManager - participant RPC as RpcServiceHandler - participant Monitor as ProcmondMonitorCollector - participant Collector as ProcessCollector - participant Lifecycle as LifecycleTracker - - Note over Main,Lifecycle: Startup Sequence - - Main->>Config: Load configuration - Config-->>Main: EventBusConfig + RpcServiceConfig - - Main->>EventBus: Connect to broker - EventBus-->>Main: Connection established - - Main->>Reg: Register with agent - Reg->>EventBus: Publish registration (RPC) - EventBus-->>Reg: Registration accepted - - Main->>Main: Wait for "begin monitoring" command - EventBus->>Main: Receive start command from agent - - Main->>RPC: Start RPC service - RPC->>EventBus: Subscribe to control.collector.procmond - - Main->>Monitor: Create collector - Monitor->>Collector: Initialize platform collector - Monitor->>Lifecycle: Initialize lifecycle tracker - - Main->>Monitor: Start monitoring - - Note over Main,Lifecycle: Runtime Operation - - loop Every collection interval - Monitor->>Collector: Collect processes - Collector-->>Monitor: ProcessEvent list - Monitor->>Lifecycle: Update and detect changes - Lifecycle-->>Monitor: ProcessLifecycleEvent list - Monitor->>EventBus: Publish events - EventBus->>EventBus: Write to WAL, then buffer - EventBus->>EventBus: Check buffer level for backpressure - alt Buffer > 70% full - EventBus->>Monitor: Increase collection interval (backpressure) - end - end - - loop Every heartbeat interval - Reg->>EventBus: Publish heartbeat - end - - Note over Main,Lifecycle: RPC Request Handling - - EventBus->>RPC: Incoming RPC request - RPC->>RPC: Parse request - - alt HealthCheck - RPC->>Monitor: Send health check message (actor) - Monitor-->>RPC: Health data via oneshot - RPC->>EventBus: Publish response - else UpdateConfig - RPC->>Config: 
Validate config changes - Config->>Monitor: Send config update message (actor) - Note over Monitor: Config applied at next cycle boundary - Monitor-->>RPC: Update result via oneshot - RPC->>EventBus: Publish response - else GracefulShutdown - RPC->>Monitor: Send shutdown message (actor) - Monitor->>Monitor: Complete current cycle - Monitor->>EventBus: Flush buffered events + WAL - Monitor-->>RPC: Shutdown ready via oneshot - RPC->>EventBus: Publish response - RPC->>Reg: Deregister - RPC->>Main: Signal shutdown - end - - Note over Main,Lifecycle: Graceful Shutdown - - Main->>Monitor: Stop monitoring - Monitor->>Collector: Cleanup - Monitor->>Lifecycle: Cleanup - Main->>EventBus: Disconnect - Main->>Main: Exit -``` - -### 4. Actor Pattern Coordination - -**ProcmondMonitorCollector as Actor:** - -- Runs in its own task with message processing loop -- Receives messages via mpsc channel from RpcServiceHandler -- Processes messages sequentially (no concurrent state mutations) -- Responds via oneshot channels for request/response patterns - -**Message Types:** - -```rust -enum ActorMessage { - HealthCheck { - respond_to: oneshot::Sender<HealthCheckData>, - }, - UpdateConfig { - config: Config, - respond_to: oneshot::Sender<Result<()>>, - }, - GracefulShutdown { - respond_to: oneshot::Sender<Result<()>>, - }, - BeginMonitoring, // From agent after loading state - AdjustInterval { - new_interval: Duration, - }, // From EventBusConnector backpressure -} -``` - -**Coordination Benefits:** - -- Eliminates race conditions (single-threaded message processing) -- Simplifies state management (no complex locking) -- Clear request/response semantics via oneshot channels -- Serializes concurrent RPC requests automatically - -**Configuration Hot-Reload at Cycle Boundary:** - -- Config update message queued in actor's message channel -- Actor processes message at start of next collection cycle -- Ensures atomic config application (no mid-cycle changes) -- Some configs may require 
restart (documented in ConfigurationManager) - -### 5. Integration Points - -**With daemoneye-agent:** - -- **BrokerManager**: Spawns procmond as child process, manages lifecycle -- **CollectorProcessManager**: Monitors procmond process health, handles restarts -- **CollectorRegistry**: Tracks procmond registration and heartbeat status -- **RPC Clients**: Sends lifecycle commands to procmond -- **Loading State Management**: - - Agent initializes broker first (before spawning collectors) - - Agent spawns all configured collectors with `DAEMONEYE_BROKER_SOCKET` env var - - Agent waits for all collectors to register and report "ready" status - - Agent drops privileges only after all collectors are ready - - Agent sends "begin monitoring" command to transition collectors to steady state -- **Heartbeat Monitoring**: Agent detects missed heartbeats (3+ consecutive) and takes escalating actions: - 1. Send health check RPC (timeout: 5 seconds) - verify responsiveness - 2. Send graceful shutdown RPC (timeout: 60 seconds) - attempt clean shutdown - 3. Kill procmond process (force termination) - last resort - 4. 
Restart procmond via CollectorProcessManager - restore service - -**With daemoneye-eventbus:** - -- **DaemoneyeBroker**: Embedded broker that procmond connects to -- **Topic Hierarchy**: `events.process.*` for events, `control.collector.procmond` for RPC -- **RPC Patterns**: Request/response for lifecycle management - -**With collector-core:** - -- **EventSource trait**: ProcmondMonitorCollector implements this interface -- **MonitorCollector trait**: Provides statistics and health check interface -- **ProcessEvent**: Standard event format for process data - -**AgentCollectorConfig (New)** - -```yaml -# Agent configuration file: /etc/daemoneye/agent.yaml -collectors: - - id: procmond - type: process-monitor - binary_path: /usr/bin/procmond - enabled: true - auto_restart: true - startup_timeout_secs: 60 - config: - collection_interval_secs: 30 - enhanced_metadata: true - compute_hashes: false -``` - -### 6. daemoneye-agent Enhancements Required - -**Collector Configuration Loading (New)** - -- Load collector configuration from `/etc/daemoneye/agent.yaml` on startup -- Parse collector list with binary paths, enabled status, and auto-restart settings -- Validate collector binary paths exist and are executable -- Spawn collectors in order defined in configuration file -- Pass collector-specific configuration via environment variables or config files - -**Loading State Management (New)** - -- Add state machine: Loading → Ready → Steady State -- Track collector readiness: Wait for all collectors to report "ready" -- Privilege dropping: Drop privileges only after all collectors ready -- Transition command: Broadcast "begin monitoring" to `control.collector.lifecycle` when entering steady state -- Timeout: If collectors don't report ready within timeout (60s default), fail startup with error - -**Heartbeat Failure Detection (Enhanced)** - -- Monitor heartbeat messages from all collectors -- Track missed heartbeat count per collector (threshold: 3 consecutive) -- Implement 
escalating recovery actions: - 1. Health check RPC with 5-second timeout - 2. Graceful shutdown RPC with 60-second timeout - 3. Force kill via CollectorProcessManager - 4. Automatic restart via CollectorProcessManager (if auto_restart enabled in config) -- Log all recovery actions for operator visibility -- Emit alerts for repeated collector failures (e.g., 3+ restarts in 10 minutes) - -**Configuration Push (Enhanced)** - -- Validate configuration changes before pushing to collectors -- Send configuration updates via RPC to `control.collector.{collector_id}` -- Track which configurations require restart vs. hot-reload -- Handle configuration update failures (rollback or retry) -- Support configuration validation without applying (validate_only mode) - -### 7. Error Handling Strategy - -**Connection Failures:** - -- Startup: Broker ready before spawn (no retry needed at startup) -- Runtime: Buffer events (10MB limit) with write-ahead log, attempt reconnection, replay on success -- If buffer full: Dynamic interval adjustment - connector increases collection interval by 50% -- WAL persistence: Events written to disk before buffering, replayed on restart after crash -- Reconnection: Exponential backoff (1s, 2s, 4s, 8s, max 30s) with indefinite retries - -**Heartbeat Failures:** - -- Agent detects missed heartbeats (threshold: 3 consecutive misses) -- Escalating recovery actions: - 1. Health check RPC (timeout: 5s) - verify procmond is responsive - 2. Graceful shutdown RPC (timeout: 60s) - attempt clean shutdown - 3. Force kill - terminate procmond process - 4. 
Restart - spawn new procmond instance -- Heartbeat independence: Heartbeat publishing runs in separate task (not blocked by collection) - -**RPC Failures:** - -- Invalid requests: Return error response with details -- Timeout: Return timeout error after configured duration -- State conflicts: Return error with current state information -- Concurrent requests: Serialize via actor pattern (process one at a time) -- Actor message failures: Return error if actor channel closed or full - -**Collection Failures:** - -- Permission denied: Log error, skip process, continue with others -- Platform API failure: Fall back to basic sysinfo collector -- Timeout: Cancel collection, report degraded health status -- Cycle boundary: Configuration changes applied only at cycle start (atomic) - -**Resource Exhaustion:** - -- Memory approaching limit: Reduce buffer size, disable enhanced metadata, rotate WAL -- CPU usage high: Increase collection interval, reduce metadata collection -- Event buffer full: Dynamic interval adjustment (increase by 50%), WAL rotation -- WAL disk space low: Rotate and compress old WAL files, alert operator - -### 8. 
Testing Strategy - -**Unit Tests (>80% coverage target):** - -- WriteAheadLog: Persistence, rotation, replay, corruption recovery, compression -- EventBusConnector: Connection, WAL integration, buffering, replay, dynamic backpressure -- RpcServiceHandler: Request parsing, actor message sending, response handling, concurrent request serialization -- RegistrationManager: Registration, "ready" reporting, heartbeat, deregistration -- ConfigurationManager: Loading, validation, cycle-boundary hot-reload, restart detection -- Actor Pattern: Message processing, oneshot responses, channel handling - -**Integration Tests:** - -- Event bus communication: Publish/subscribe, reconnection, buffering -- RPC communication: Lifecycle operations, health checks, config updates -- Cross-platform: Linux, macOS, Windows process enumeration -- Lifecycle tracking: Start/stop/modification detection - -**Chaos Tests:** - -- Connection failures: Broker restart, network interruption -- Backpressure: Slow consumer, high event volume -- Resource limits: Memory constraints, CPU throttling -- Concurrent operations: Multiple RPC requests, collection during shutdown - -**Security Tests:** - -- Privilege escalation: Attempt to gain unauthorized access -- Injection attacks: Malicious process names, command lines -- DoS attacks: Excessive RPC requests, event flooding -- Data sanitization: Verify secrets are not logged or published - ---- - -## Implementation Phases - -### Phase 1: Event Bus Integration (Week 1-2) - -**Goal**: Replace LocalEventBus with DaemoneyeEventBus with durable buffering - -**Tasks:** - -1. Create WriteAheadLog component for event persistence -2. Create EventBusConnector with WAL integration and dynamic backpressure -3. Implement event buffering (10MB limit) with WAL persistence -4. Implement WAL replay on startup (crash recovery) -5. Update ProcmondMonitorCollector to use EventBusConnector and actor pattern -6. Add environment variable reading for broker socket path -7. 
Implement startup coordination (wait for "begin monitoring" command) -8. Unit tests for WriteAheadLog and EventBusConnector -9. Integration tests for event publishing, WAL replay, and backpressure - -**Success Criteria:** - -- procmond connects to daemoneye-agent's broker on startup -- Events published to `events.process.*` topics -- WAL persists events before buffering -- WAL replay works after crash (events not lost) -- Dynamic backpressure adjusts collection interval when buffer fills -- procmond waits for agent's "begin monitoring" command before starting collection - -### Phase 2: RPC Service Implementation (Week 3-4) - -**Goal**: Enable lifecycle management via RPC with actor pattern coordination - -**Tasks:** - -**procmond Changes:** - -1. Implement actor pattern in ProcmondMonitorCollector (message processing loop) -2. Create ActorMessage enum for actor communication -3. Create RpcServiceHandler with actor message sending via mpsc channel -4. Implement lifecycle operations: Start, Stop, Restart, HealthCheck, UpdateConfig, GracefulShutdown -5. Implement configuration hot-reload at cycle boundaries -6. Create RegistrationManager for registration, "ready" reporting, and heartbeat -7. Implement "begin monitoring" command handling (wait before starting collection) -8. Unit tests for RpcServiceHandler, actor coordination, and RegistrationManager - -**daemoneye-agent Changes:** - -1. Add collector configuration file format (`/etc/daemoneye/agent.yaml`) -2. Implement configuration loading and validation on agent startup -3. Implement loading state management (Loading → Ready → Steady State) -4. Add collector readiness tracking (wait for all collectors to report "ready") -5. Implement privilege dropping after all collectors ready -6. Add "begin monitoring" broadcast to `control.collector.lifecycle` topic -7. 
Implement heartbeat failure detection with escalating actions: - -- Track missed heartbeats per collector (threshold: 3 consecutive) - - Action 1: Health check RPC (timeout: 5s) - - Action 2: Graceful shutdown RPC (timeout: 60s) - - Action 3: Force kill via CollectorProcessManager - - Action 4: Automatic restart (if auto_restart enabled) - -8. Integration tests for RPC communication and loading state coordination - -**Success Criteria:** - -- procmond registers with daemoneye-agent on startup -- procmond reports "ready" status after registration -- Agent waits for procmond "ready" before dropping privileges -- Agent sends "begin monitoring" command after all collectors ready -- procmond waits for "begin monitoring" before starting collection loop -- Heartbeats published every 30 seconds -- Agent detects missed heartbeats and takes escalating actions (health check → graceful shutdown → kill → restart) -- Health check RPC returns accurate status via actor pattern -- Graceful shutdown RPC completes within timeout -- Configuration update RPC applies changes at next cycle boundary (atomic) - -### Phase 3: Testing (TDD Approach) (Week 5-6) - -**Goal**: Achieve >80% unit coverage, >90% critical path coverage - -**Tasks:** - -1. Expand unit test coverage for all new components -2. Create integration test suite for event bus and RPC -3. Add cross-platform tests (Linux, macOS, Windows) -4. Implement chaos tests for resilience -5. Add security tests for privilege and injection -6. Performance baseline tests - -**Success Criteria:** - -- Unit test coverage >80% -- Critical path coverage >90% (enumeration, event bus, RPC, security) -- All tests pass on Linux, macOS, Windows -- Chaos tests validate resilience to failures - -### Phase 4: Security Hardening (Week 7) - -**Goal**: Implement privilege management and data sanitization - -**Tasks:** - -1. Add privilege detection at startup (capabilities, tokens) -2. Implement data sanitization for command-line args and env vars -3. 
Validate security boundaries between procmond and agent -4. Add security test suite (privilege escalation, injection, DoS) -5. Document security model and threat analysis - -**Success Criteria:** - -- Privilege detection works on all platforms -- Sensitive data sanitized before logging/publishing -- Security tests pass with no critical vulnerabilities -- Security documentation complete - -### Phase 5: FreeBSD Support (Week 8) - -**Goal**: Validate basic process enumeration on FreeBSD - -**Tasks:** - -1. Test FallbackProcessCollector on FreeBSD 13+ -2. Document limitations (basic metadata only) -3. Add platform detection and capability reporting -4. Create FreeBSD-specific tests -5. Update documentation with FreeBSD support status - -**Success Criteria:** - -- Basic process enumeration works on FreeBSD -- Limitations documented clearly -- Platform detection reports FreeBSD correctly -- Tests pass on FreeBSD 13+ - -### Phase 6: Performance Validation (Week 9) - -**Goal**: Validate performance against targets - -**Tasks:** - -1. Benchmark process enumeration (1,000 processes target: \<100ms) -2. Load testing with 10,000+ processes -3. Memory profiling (target: \<100MB sustained) -4. CPU monitoring (target: \<5% sustained) -5. Regression testing to prevent degradation -6. 
Performance optimization if targets not met - -**Success Criteria:** - -- Enumerate 1,000 processes in \<100ms (average) -- Support 10,000+ processes without degradation -- Memory usage \<100MB during normal operation -- CPU usage \<5% during continuous monitoring -- No performance regressions - ---- - -## References - -- Epic Brief: spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- Core Flows: spec:54226c8a-719a-479a-863b-9c91f43717a9/f086f464-1e81-42e8-89f5-74a8638360d1 -- Event Bus Architecture: file:docs/embedded-broker-architecture.md -- Topic Hierarchy: file:daemoneye-eventbus/docs/topic-hierarchy.md -- RPC Patterns: file:daemoneye-eventbus/docs/rpc-patterns.md -- Process Collector: file:procmond/src/process_collector.rs -- Monitor Collector: file:procmond/src/monitor_collector.rs -- Lifecycle Tracker: file:procmond/src/lifecycle.rs -- Broker Manager: file:daemoneye-agent/src/broker_manager.rs -- Collector Registry: file:daemoneye-agent/src/collector_registry.rs diff --git a/spec/procmond/tickets/Implement_Actor_Pattern_and_Startup_Coordination.md b/spec/procmond/tickets/Implement_Actor_Pattern_and_Startup_Coordination.md deleted file mode 100644 index 841d0d9f..00000000 --- a/spec/procmond/tickets/Implement_Actor_Pattern_and_Startup_Coordination.md +++ /dev/null @@ -1,210 +0,0 @@ -# Implement Actor Pattern and Startup Coordination - -## Overview - -Refactor ProcmondMonitorCollector to use actor pattern for coordinated state management and implement startup coordination with daemoneye-agent. This ticket replaces LocalEventBus with DaemoneyeEventBus (via EventBusConnector) and establishes the message-passing architecture for RPC coordination. 
- -## Scope - -**In Scope:** - -- Actor pattern implementation in ProcmondMonitorCollector -- ActorMessage enum for message-based coordination -- Bounded mpsc channel (capacity: 100) for actor messages -- Replace LocalEventBus with EventBusConnector -- Startup coordination: wait for "begin monitoring" command -- Dynamic interval adjustment from backpressure -- Configuration hot-reload at cycle boundaries -- Enhanced health check with event bus connectivity -- Update main.rs for actor initialization - -**Out of Scope:** - -- RPC service handler implementation (Ticket 3) -- Registration and heartbeat (Ticket 3) -- Agent-side loading state (Ticket 4) -- Comprehensive testing (Ticket 5) - -## Technical Details - -### Actor Pattern Architecture - -**Modified Component:** `file:procmond/src/monitor_collector.rs` - -**Key Changes:** - -- Run in dedicated task with message processing loop -- Receive messages via bounded mpsc channel (capacity: 100) -- Process messages sequentially (no concurrent state mutations) -- Respond via oneshot channels for request/response patterns -- Maintain collection state without complex locking - -**ActorMessage Enum:** - -```rust -enum ActorMessage { - HealthCheck { - respond_to: oneshot::Sender<HealthCheckData>, - }, - UpdateConfig { - config: Config, - respond_to: oneshot::Sender<Result<()>>, - }, - GracefulShutdown { - respond_to: oneshot::Sender<Result<()>>, - }, - BeginMonitoring, // From agent after loading state - AdjustInterval { - new_interval: Duration, - }, // From EventBusConnector backpressure -} -``` - -### Startup Coordination - -**Flow:** - -1. procmond starts and connects to broker -2. procmond subscribes to `control.collector.lifecycle` topic -3. procmond waits for "begin monitoring" broadcast from agent -4. Upon receiving command, procmond starts collection loop - -**Why:** Ensures agent has completed loading state (all collectors ready, privileges dropped) before procmond begins monitoring. 
- -### Configuration Hot-Reload - -**Strategy:** Apply configuration changes at cycle boundaries (atomic) - -**Implementation:** - -- Config update message queued in actor's channel -- Actor processes message at start of next collection cycle -- Ensures no mid-cycle configuration changes -- Some configs may require restart (documented) - -```mermaid -sequenceDiagram - participant Main as main.rs - participant Actor as ProcmondMonitorCollector (Actor) - participant EventBus as EventBusConnector - participant Collector as ProcessCollector - participant Lifecycle as LifecycleTracker - - Note over Main,Lifecycle: Initialization - Main->>Main: Create bounded mpsc channel (capacity: 100) - Main->>EventBus: Initialize with WAL - Main->>Actor: Create with channel receiver - Main->>Actor: Pass EventBusConnector - - Note over Main,Lifecycle: Startup Coordination - Main->>EventBus: Subscribe to control.collector.lifecycle - EventBus->>Main: Receive "begin monitoring" broadcast - Main->>Actor: Send BeginMonitoring message - Actor->>Actor: Start collection loop - - Note over Main,Lifecycle: Collection Loop - loop Every collection interval - Actor->>Collector: Collect processes - Collector-->>Actor: ProcessEvent list - Actor->>Lifecycle: Update and detect changes - Lifecycle-->>Actor: ProcessLifecycleEvent list - Actor->>EventBus: Publish events - EventBus->>EventBus: Write to WAL, buffer, publish - end - - Note over Main,Lifecycle: Backpressure - EventBus->>EventBus: Buffer reaches 70% - EventBus->>Actor: Send AdjustInterval message - Actor->>Actor: Increase collection interval (1.5x) - Note over Actor: Collection slows down - EventBus->>EventBus: Buffer drops to 50% - EventBus->>Actor: Send AdjustInterval message (restore) - Actor->>Actor: Restore original interval - - Note over Main,Lifecycle: Configuration Update - Main->>Actor: Send UpdateConfig message - Actor->>Actor: Queue config update - Note over Actor: Wait for cycle boundary - Actor->>Actor: Apply config at start of 
next cycle - Actor->>Main: Send success response via oneshot - - Note over Main,Lifecycle: Graceful Shutdown - Main->>Actor: Send GracefulShutdown message - Actor->>Actor: Complete current cycle - Actor->>EventBus: Flush buffered events + WAL - Actor->>Main: Send ready response via oneshot - Main->>Main: Exit -``` - -## Dependencies - -**Requires:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 1] - EventBusConnector and WAL must exist - -**Blocks:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 3] - RPC service needs actor pattern -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 4] - Agent needs "begin monitoring" subscription - -## Acceptance Criteria - -### Actor Pattern - -- [ ] ProcmondMonitorCollector runs in dedicated task with message loop -- [ ] Bounded mpsc channel (capacity: 100) created for actor messages -- [ ] ActorMessage enum defined with all message types -- [ ] Messages processed sequentially (no concurrent state mutations) -- [ ] Oneshot channels used for request/response patterns -- [ ] Channel full errors handled gracefully (log warning, return error) - -### Event Bus Integration - -- [ ] LocalEventBus completely replaced with EventBusConnector -- [ ] Events published via EventBusConnector to `events.process.*` topics -- [ ] EventBusConnector integrated with actor pattern -- [ ] No compilation errors or warnings - -### Startup Coordination - -- [ ] procmond subscribes to `control.collector.lifecycle` topic -- [ ] procmond waits for "begin monitoring" broadcast before starting collection -- [ ] BeginMonitoring message triggers collection loop start -- [ ] Startup sequence documented in code comments - -### Dynamic Interval Adjustment - -- [ ] Actor receives AdjustInterval messages from EventBusConnector -- [ ] Collection interval increases by 50% (1.5x) when backpressure triggered -- [ ] Collection interval restored to original when backpressure released -- [ ] Interval adjustment logged at INFO level - -### 
Configuration Hot-Reload - -- [ ] UpdateConfig message queued in actor channel -- [ ] Config applied at start of next collection cycle (atomic) -- [ ] Config validation performed before application -- [ ] Success/failure response sent via oneshot channel -- [ ] Documentation lists which configs are hot-reloadable vs. require restart - -### Health Check Enhancement - -- [ ] HealthCheck message returns event bus connectivity status -- [ ] Health data includes: collection state, buffer level, connection status -- [ ] Response sent via oneshot channel - -### main.rs Updates - -- [ ] Bounded mpsc channel created (capacity: 100) -- [ ] EventBusConnector initialized with WAL -- [ ] ProcmondMonitorCollector initialized as actor with channel receiver -- [ ] Graceful shutdown coordination implemented -- [ ] `DAEMONEYE_BROKER_SOCKET` environment variable read - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Core Flows:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f086f464-1e81-42e8-89f5-74a8638360d1 (Flow 2: System Startup) -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 1, Actor Pattern) -- **Monitor Collector:** file:procmond/src/monitor_collector.rs -- **Main Entry Point:** file:procmond/src/main.rs diff --git a/spec/procmond/tickets/Implement_Agent_Loading_State_and_Heartbeat_Detection.md b/spec/procmond/tickets/Implement_Agent_Loading_State_and_Heartbeat_Detection.md deleted file mode 100644 index b536fdbf..00000000 --- a/spec/procmond/tickets/Implement_Agent_Loading_State_and_Heartbeat_Detection.md +++ /dev/null @@ -1,222 +0,0 @@ -# Implement Agent Loading State and Heartbeat Detection - -## Overview - -Implement loading state management and heartbeat failure detection in daemoneye-agent. This ticket ensures coordinated startup (broker → collectors → privilege drop → steady state) and robust failure detection with escalating recovery actions. 
- -## Scope - -**In Scope:** - -- Collector configuration file format (`/etc/daemoneye/agent.yaml`) -- Configuration loading and validation on agent startup -- Loading state machine: Loading → Ready → Steady State -- Collector readiness tracking (wait for all collectors to report "ready") -- Privilege dropping after all collectors ready -- "Begin monitoring" broadcast to `control.collector.lifecycle` topic -- Heartbeat failure detection with escalating actions -- Integration tests for loading state and heartbeat detection - -**Out of Scope:** - -- procmond-side changes (Tickets 1-3) -- Comprehensive testing across all collectors (Ticket 5) -- Security hardening (Ticket 6) - -## Technical Details - -### Collector Configuration Format - -**Location:** `/etc/daemoneye/agent.yaml` - -**Schema:** - -```yaml -collectors: - - id: procmond - type: process-monitor - binary_path: /usr/bin/procmond - enabled: true - auto_restart: true - startup_timeout_secs: 60 - config: - collection_interval_secs: 30 - enhanced_metadata: true - compute_hashes: false -``` - -**Configuration Loading:** - -- Load on agent startup -- Validate collector binary paths exist and are executable -- Parse collector-specific configuration -- Spawn collectors in order defined in configuration file - -### Loading State Machine - -**States:** - -1. **Loading**: Agent starting, broker initializing, spawning collectors -2. **Ready**: All collectors registered and reported "ready", privileges dropped -3. 
**Steady State**: Normal operation, collectors monitoring - -**Transitions:** - -- Loading → Ready: All collectors report "ready" within timeout (60s default) -- Ready → Steady State: Agent broadcasts "begin monitoring" command -- Any → Loading: Agent restart - -**Timeout Handling:** - -- If collectors don't report "ready" within timeout, fail startup with error -- Log which collectors failed to report ready -- Exit with non-zero status code - -### Heartbeat Failure Detection - -**Strategy:** Escalating recovery actions - -**Detection:** - -- Track missed heartbeat count per collector (threshold: 3 consecutive) -- Heartbeat expected every 30 seconds (allow 90 seconds before action) - -**Escalating Actions:** - -1. **Health Check RPC** (timeout: 5 seconds) - - - Send health check RPC to collector - - If response received, reset missed heartbeat count - - If timeout, proceed to action 2 - -2. **Graceful Shutdown RPC** (timeout: 60 seconds) - - - Send graceful shutdown RPC to collector - - Wait for completion or timeout - - If successful, proceed to action 4 (restart) - - If timeout, proceed to action 3 - -3. **Force Kill** (via CollectorProcessManager) - - - Kill collector process (SIGKILL on Unix, TerminateProcess on Windows) - - Log forced termination - - Proceed to action 4 - -4. 
**Automatic Restart** (if auto_restart enabled) - - - Restart collector via CollectorProcessManager - - Reset missed heartbeat count - - Log restart event - -```mermaid -stateDiagram-v2 - [*] --> Loading: Agent starts - Loading --> Loading: Spawn collectors - Loading --> Ready: All collectors ready - Loading --> [*]: Timeout (fail startup) - Ready --> SteadyState: Broadcast "begin monitoring" - SteadyState --> SteadyState: Normal operation - - state SteadyState { - [*] --> Monitoring - Monitoring --> HealthCheck: 3 missed heartbeats - HealthCheck --> Monitoring: Response received - HealthCheck --> GracefulShutdown: Timeout (5s) - GracefulShutdown --> Restart: Success - GracefulShutdown --> ForceKill: Timeout (60s) - ForceKill --> Restart: Process killed - Restart --> Monitoring: Collector restarted - } - - SteadyState --> [*]: Agent shutdown -``` - -### Component Changes - -**Modified:** `file:daemoneye-agent/src/broker_manager.rs` - -- Add loading state management -- Track collector readiness -- Implement privilege dropping after all collectors ready -- Broadcast "begin monitoring" command - -**Modified:** `file:daemoneye-agent/src/collector_registry.rs` - -- Track heartbeat timestamps per collector -- Detect missed heartbeats (3+ consecutive) -- Implement escalating recovery actions -- Log all recovery actions - -**New:** `file:daemoneye-agent/src/config.rs` - -- Load collector configuration from YAML file -- Validate configuration -- Provide configuration to BrokerManager - -## Dependencies - -**Requires:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 2] - procmond must wait for "begin monitoring" -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 3] - procmond must publish registration and heartbeat - -**Blocks:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 5] - Integration tests need complete startup flow - -## Acceptance Criteria - -### Configuration Loading - -- [ ] Agent loads collector configuration from 
`/etc/daemoneye/agent.yaml` -- [ ] Configuration validation checks binary paths exist and are executable -- [ ] Collector-specific configuration parsed correctly -- [ ] Invalid configuration causes agent startup failure with clear error message - -### Loading State Management - -- [ ] Agent implements state machine: Loading → Ready → Steady State -- [ ] Agent spawns collectors in order defined in configuration -- [ ] Agent tracks collector readiness (waits for "ready" status from all collectors) -- [ ] Agent drops privileges only after all collectors report "ready" -- [ ] Agent broadcasts "begin monitoring" to `control.collector.lifecycle` when entering steady state -- [ ] Timeout (60s default) causes startup failure if collectors don't report ready -- [ ] Startup failure logs which collectors failed to report ready - -### Heartbeat Failure Detection - -- [ ] Agent tracks heartbeat timestamps per collector -- [ ] Agent detects 3 consecutive missed heartbeats (90 seconds without heartbeat) -- [ ] Escalating actions implemented: - - [ ] Action 1: Health check RPC with 5-second timeout - - [ ] Action 2: Graceful shutdown RPC with 60-second timeout - - [ ] Action 3: Force kill via CollectorProcessManager - - [ ] Action 4: Automatic restart (if auto_restart enabled) -- [ ] All recovery actions logged at WARN or ERROR level -- [ ] Missed heartbeat count reset on successful health check or restart - -### Integration Tests - -- [ ] Test: Agent waits for collector "ready" before dropping privileges -- [ ] Test: Agent broadcasts "begin monitoring" after all collectors ready -- [ ] Test: Agent fails startup if collector doesn't report ready within timeout -- [ ] Test: Agent detects missed heartbeats and takes escalating actions -- [ ] Test: Health check RPC resets missed heartbeat count -- [ ] Test: Graceful shutdown RPC completes successfully -- [ ] Test: Force kill terminates unresponsive collector -- [ ] Test: Automatic restart restores collector after failure - -### 
Documentation - -- [ ] Configuration file format documented -- [ ] Loading state machine documented with state diagram -- [ ] Heartbeat failure detection documented with escalating actions -- [ ] Timeout values documented and configurable - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Core Flows:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f086f464-1e81-42e8-89f5-74a8638360d1 (Flow 2: System Startup, Flow 6: Error Handling) -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 2, Agent Enhancements) -- **Broker Manager:** file:daemoneye-agent/src/broker_manager.rs -- **Collector Registry:** file:daemoneye-agent/src/collector_registry.rs diff --git a/spec/procmond/tickets/Implement_Comprehensive_Test_Suite.md b/spec/procmond/tickets/Implement_Comprehensive_Test_Suite.md deleted file mode 100644 index 7ba672e0..00000000 --- a/spec/procmond/tickets/Implement_Comprehensive_Test_Suite.md +++ /dev/null @@ -1,290 +0,0 @@ -# Implement Comprehensive Test Suite - -## Overview - -Achieve comprehensive test coverage across all procmond components and integration points. This ticket implements unit tests, integration tests, chaos tests, and security tests to meet the >80% unit coverage and >90% critical path coverage targets. 
- -## Scope - -**In Scope:** - -- Unit tests for all new components (WAL, EventBusConnector, RpcServiceHandler, RegistrationManager, ConfigurationManager) -- Integration tests for event bus and RPC communication -- Cross-platform tests (Linux, macOS, Windows) -- Chaos tests for resilience validation -- Security tests for privilege and injection attacks -- Performance baseline tests -- Test documentation and coverage reporting - -**Out of Scope:** - -- Security hardening implementation (Ticket 6) -- FreeBSD-specific tests (Ticket 7) -- Performance optimization (Ticket 8) - -## Technical Details - -### Unit Tests (>80% Coverage Target) - -**WriteAheadLog Tests:** - -- Persistence: Events written to disk correctly -- Rotation: Files rotate at 80% capacity -- Replay: Events replayed on startup -- Corruption recovery: Corrupted entries skipped with CRC32 validation -- Deletion: WAL files deleted after successful publish - -**EventBusConnector Tests:** - -- Connection: Connects to broker via socket path -- WAL integration: Events written to WAL before buffering -- Buffering: Events buffered when connection lost (10MB limit) -- Replay: Buffered events replayed on reconnection -- Dynamic backpressure: Triggered at 70%, released at 50% - -**RpcServiceHandler Tests:** - -- Request parsing: RPC requests parsed correctly -- Actor message sending: Messages sent to actor via mpsc channel -- Response handling: Responses published with correct status codes -- Concurrent request serialization: Requests processed one at a time -- Channel full errors: Handled gracefully - -**RegistrationManager Tests:** - -- Registration: Registers with agent on startup -- "Ready" reporting: Reports ready status after registration -- Heartbeat: Publishes heartbeats every 30 seconds -- Deregistration: Deregisters on graceful shutdown - -**ConfigurationManager Tests:** - -- Loading: Configuration loaded from files and env vars -- Validation: Invalid configuration rejected -- Cycle-boundary hot-reload: 
Config applied at cycle boundary -- Restart detection: Configs requiring restart identified - -**Actor Pattern Tests:** - -- Message processing: Messages processed sequentially -- Oneshot responses: Responses sent via oneshot channels -- Channel handling: Bounded channel (capacity: 100) respected - -### Integration Tests - -**Event Bus Communication:** - -- Publish/subscribe: Events published and received correctly -- Reconnection: Connection restored after broker restart -- Buffering: Events buffered and replayed on reconnection -- Topic hierarchy: Events published to correct topics - -**RPC Communication:** - -- Lifecycle operations: Start, Stop, Restart, HealthCheck, UpdateConfig, GracefulShutdown -- Health checks: Accurate health data returned -- Config updates: Configuration applied at cycle boundary -- Graceful shutdown: Completes within timeout - -**Cross-Platform:** - -- Linux: Process enumeration works correctly -- macOS: Process enumeration works correctly -- Windows: Process enumeration works correctly -- Platform-specific metadata: Enhanced metadata collected on each platform - -**Lifecycle Tracking:** - -- Start detection: New processes detected -- Stop detection: Terminated processes detected -- Modification detection: Process changes detected - -### Chaos Tests - -**Connection Failures:** - -- Broker restart: procmond reconnects and replays events -- Network interruption: Events buffered and replayed -- Socket unavailable: procmond retries connection - -**Backpressure:** - -- Slow consumer: Collection interval increases -- High event volume: Backpressure prevents buffer overflow -- Buffer full: Events written to WAL, no data loss - -**Resource Limits:** - -- Memory constraints: procmond operates within 100MB limit -- CPU throttling: procmond maintains \<5% CPU usage -- Disk space: WAL rotation prevents disk exhaustion - -**Concurrent Operations:** - -- Multiple RPC requests: Serialized correctly -- Collection during shutdown: Completes gracefully -- 
Config update during collection: Applied at cycle boundary - -### Security Tests - -**Privilege Escalation:** - -- Attempt to gain unauthorized access: Fails with error -- Privilege dropping: Agent drops privileges after collectors ready - -**Injection Attacks:** - -- Malicious process names: Sanitized before logging/publishing -- Malicious command lines: Sanitized before logging/publishing -- SQL injection: Not applicable (no SQL in procmond) - -**DoS Attacks:** - -- Excessive RPC requests: Rate-limited or rejected -- Event flooding: Backpressure prevents resource exhaustion - -**Data Sanitization:** - -- Secrets not logged: Environment variables with secrets sanitized -- Secrets not published: Command-line args with secrets sanitized - -### Test Infrastructure - -**Tools:** - -- cargo-nextest: Parallel test execution -- insta: Snapshot testing for CLI output -- criterion: Performance baseline tests -- llvm-cov: Coverage reporting - -**CI Matrix:** - -- Platforms: Linux, macOS, Windows -- Rust: stable, beta, MSRV (1.91+) -- Architectures: x86_64, ARM64 - -```mermaid -graph TD - subgraph "Unit Tests >80%" - U1[WriteAheadLog] - U2[EventBusConnector] - U3[RpcServiceHandler] - U4[RegistrationManager] - U5[ConfigurationManager] - U6[Actor Pattern] - end - - subgraph "Integration Tests" - I1[Event Bus Communication] - I2[RPC Communication] - I3[Cross-Platform] - I4[Lifecycle Tracking] - end - - subgraph "Chaos Tests" - C1[Connection Failures] - C2[Backpressure] - C3[Resource Limits] - C4[Concurrent Operations] - end - - subgraph "Security Tests" - S1[Privilege Escalation] - S2[Injection Attacks] - S3[DoS Attacks] - S4[Data Sanitization] - end - - U1 --> I1 - U2 --> I1 - U3 --> I2 - U4 --> I2 - U5 --> I2 - U6 --> I2 - - I1 --> C1 - I1 --> C2 - I2 --> C4 - - I1 --> S2 - I2 --> S3 - I4 --> S4 -``` - -## Dependencies - -**Requires:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 1] - WAL and EventBusConnector must exist -- 
ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 2] - Actor pattern must exist -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 3] - RPC service must exist -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 4] - Agent loading state must exist - -**Blocks:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 6] - Security hardening needs test baseline -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 7] - FreeBSD support needs test framework -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 8] - Performance validation needs baseline tests - -## Acceptance Criteria - -### Unit Test Coverage - -- [ ] WriteAheadLog: >80% coverage -- [ ] EventBusConnector: >80% coverage -- [ ] RpcServiceHandler: >80% coverage -- [ ] RegistrationManager: >80% coverage -- [ ] ConfigurationManager: >80% coverage -- [ ] Actor Pattern: >80% coverage -- [ ] Overall unit test coverage: >80% - -### Critical Path Coverage (>90%) - -- [ ] Process enumeration on all platforms: >90% coverage -- [ ] Event bus communication (publish/subscribe/reconnection): >90% coverage -- [ ] Core monitoring loop and lifecycle detection: >90% coverage -- [ ] All error handling and recovery paths: >90% coverage -- [ ] Security boundaries (privilege management, data sanitization): >90% coverage - -### Integration Tests - -- [ ] Event bus communication tests pass on Linux, macOS, Windows -- [ ] RPC communication tests pass on Linux, macOS, Windows -- [ ] Cross-platform tests pass on Linux, macOS, Windows -- [ ] Lifecycle tracking tests pass on all platforms - -### Chaos Tests - -- [ ] Connection failure tests validate resilience -- [ ] Backpressure tests validate adaptive behavior -- [ ] Resource limit tests validate constraints -- [ ] Concurrent operation tests validate correctness - -### Security Tests - -- [ ] Privilege escalation tests pass (no unauthorized access) -- [ ] Injection attack tests pass (sanitization works) -- [ ] DoS attack tests pass (rate limiting/backpressure works) -- 
[ ] Data sanitization tests pass (secrets not leaked) - -### Test Infrastructure - -- [ ] cargo-nextest configured for parallel execution -- [ ] insta configured for snapshot testing -- [ ] criterion configured for performance baselines -- [ ] llvm-cov configured for coverage reporting -- [ ] CI matrix configured for Linux, macOS, Windows - -### Documentation - -- [ ] Test strategy documented -- [ ] Coverage targets documented -- [ ] Test execution instructions documented -- [ ] CI/CD integration documented - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 3, Testing Strategy) -- **Testing Standards:** file:.cursor/rules/testing/testing-standards.mdc -- **Existing Tests:** file:procmond/tests/ diff --git a/spec/procmond/tickets/Implement_RPC_Service_and_Registration_Manager_(procmond).md b/spec/procmond/tickets/Implement_RPC_Service_and_Registration_Manager_(procmond).md deleted file mode 100644 index 5f08dd45..00000000 --- a/spec/procmond/tickets/Implement_RPC_Service_and_Registration_Manager_(procmond).md +++ /dev/null @@ -1,216 +0,0 @@ -# Implement RPC Service and Registration Manager (procmond) - -## Overview - -Implement RPC service handling and collector registration for procmond. This ticket enables lifecycle management via RPC (health checks, config updates, graceful shutdown) and establishes registration/heartbeat communication with daemoneye-agent.
- -## Scope - -**In Scope:** - -- RpcServiceHandler component with actor coordination -- RegistrationManager component for registration and heartbeat -- RPC operation handling: HealthCheck, UpdateConfig, GracefulShutdown -- Subscription to `control.collector.procmond` topic -- Registration via RPC on startup -- "Ready" status reporting after registration -- Periodic heartbeat publishing (every 30 seconds) -- Deregistration on graceful shutdown -- Unit tests for RPC and registration - -**Out of Scope:** - -- Agent-side loading state management (Ticket 4) -- Agent-side heartbeat detection (Ticket 4) -- Comprehensive integration testing (Ticket 5) -- Security hardening (Ticket 6) - -## Technical Details - -### RpcServiceHandler Component - -**Location:** `file:procmond/src/rpc_service.rs` - -**Key Responsibilities:** - -- Subscribe to `control.collector.procmond` topic for RPC requests -- Parse incoming RPC requests -- Send ActorMessage to ProcmondMonitorCollector via mpsc channel -- Wait for responses via oneshot channels -- Publish RPC responses with appropriate status codes -- Handle channel full errors gracefully -- Serialize concurrent RPC requests (process one at a time) - -**Supported Operations:** - -- **HealthCheck**: Query collector health and status -- **UpdateConfig**: Apply configuration changes at cycle boundary -- **GracefulShutdown**: Initiate clean shutdown with event flush - -### RegistrationManager Component - -**Location:** `file:procmond/src/registration.rs` - -**Key Responsibilities:** - -- Register with daemoneye-agent on startup via RPC -- Report "ready" status after successful registration -- Publish periodic heartbeats to `control.health.heartbeat.procmond` (every 30 seconds) -- Include health status in heartbeat: Healthy/Degraded/Unhealthy -- Track registration state and heartbeat sequence number -- Deregister on graceful shutdown - -**Registration Message Schema:** - -```rust -struct RegistrationRequest { - collector_id: String, // 
"procmond" - collector_type: String, // "process-monitor" - version: String, - capabilities: Vec<String>, - pid: u32, -} - -struct RegistrationResponse { - status: RegistrationStatus, // Accepted/Rejected - message: Option<String>, -} -``` - -**Heartbeat Message Schema:** - -```rust -struct HeartbeatMessage { - collector_id: String, - sequence: u64, - timestamp: DateTime<Utc>, - health_status: HealthStatus, // Healthy/Degraded/Unhealthy - metrics: HeartbeatMetrics, -} - -struct HeartbeatMetrics { - processes_collected: u64, - events_published: u64, - buffer_level_percent: f64, - connection_status: ConnectionStatus, -} -``` - -```mermaid -sequenceDiagram - participant Main as main.rs - participant Reg as RegistrationManager - participant RPC as RpcServiceHandler - participant Actor as ProcmondMonitorCollector - participant EventBus as EventBusConnector - participant Agent as daemoneye-agent - - Note over Main,Agent: Startup Registration - Main->>Reg: Initialize - Reg->>EventBus: Publish registration request (RPC) - EventBus->>Agent: Forward registration - Agent-->>EventBus: Registration accepted - EventBus->>Reg: Registration response - Reg->>EventBus: Publish "ready" status - Reg->>Reg: Start heartbeat task - - Note over Main,Agent: Heartbeat Loop - loop Every 30 seconds - Reg->>Actor: Query health metrics - Actor-->>Reg: Health data - Reg->>EventBus: Publish heartbeat - end - - Note over Main,Agent: RPC Request Handling - Agent->>EventBus: Send health check RPC - EventBus->>RPC: Receive request - RPC->>RPC: Parse request - RPC->>Actor: Send HealthCheck message (actor) - Actor-->>RPC: Health data via oneshot - RPC->>EventBus: Publish RPC response - EventBus->>Agent: Forward response - - Note over Main,Agent: Configuration Update - Agent->>EventBus: Send config update RPC - EventBus->>RPC: Receive request - RPC->>RPC: Validate config - RPC->>Actor: Send UpdateConfig message (actor) - Note over Actor: Config applied at cycle boundary - Actor-->>RPC: Update result via 
oneshot - RPC->>EventBus: Publish RPC response - - Note over Main,Agent: Graceful Shutdown - Agent->>EventBus: Send graceful shutdown RPC - EventBus->>RPC: Receive request - RPC->>Actor: Send GracefulShutdown message (actor) - Actor->>Actor: Complete current cycle - Actor->>EventBus: Flush buffered events + WAL - Actor-->>RPC: Shutdown ready via oneshot - RPC->>EventBus: Publish RPC response - RPC->>Reg: Deregister - Reg->>EventBus: Publish deregistration - RPC->>Main: Signal shutdown - Main->>Main: Exit -``` - -## Dependencies - -**Requires:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 2] - Actor pattern must exist for message coordination - -**Blocks:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 4] - Agent needs registration/heartbeat handling -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 5] - Integration tests need RPC functionality - -## Acceptance Criteria - -### RpcServiceHandler - -- [ ] Subscribes to `control.collector.procmond` topic on startup -- [ ] Parses incoming RPC requests correctly -- [ ] Sends ActorMessage to ProcmondMonitorCollector via mpsc channel -- [ ] Waits for responses via oneshot channels -- [ ] Publishes RPC responses with correct status codes -- [ ] Handles channel full errors gracefully (logs warning, returns error) -- [ ] Serializes concurrent RPC requests (processes one at a time) -- [ ] Unit tests cover: request parsing, actor coordination, response handling, error cases - -### RegistrationManager - -- [ ] Registers with daemoneye-agent on startup via RPC -- [ ] Reports "ready" status after successful registration -- [ ] Publishes heartbeats every 30 seconds to `control.health.heartbeat.procmond` -- [ ] Includes health status in heartbeat: Healthy/Degraded/Unhealthy -- [ ] Includes metrics in heartbeat: processes collected, events published, buffer level, connection status -- [ ] Tracks registration state and heartbeat sequence number -- [ ] Deregisters on graceful shutdown -- [ ] Unit tests 
cover: registration, heartbeat publishing, deregistration, state tracking - -### RPC Operations - -- [ ] **HealthCheck**: Returns accurate health data including event bus connectivity -- [ ] **UpdateConfig**: Validates config, sends to actor, returns success/failure -- [ ] **GracefulShutdown**: Coordinates with actor, waits for completion, signals main - -### Integration with Actor - -- [ ] RPC operations correctly coordinate with actor via messages -- [ ] Oneshot channels used for request/response patterns -- [ ] No race conditions or deadlocks -- [ ] Graceful handling of actor channel full errors - -### main.rs Updates - -- [ ] RpcServiceHandler initialized and started -- [ ] RegistrationManager initialized and started -- [ ] Graceful shutdown coordination includes RPC and registration cleanup - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Core Flows:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f086f464-1e81-42e8-89f5-74a8638360d1 (Flow 5: Configuration Update, Flow 7: Graceful Shutdown) -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 2, RPC Service) -- **RPC Patterns:** file:daemoneye-eventbus/docs/rpc-patterns.md -- **Topic Hierarchy:** file:daemoneye-eventbus/docs/topic-hierarchy.md diff --git a/spec/procmond/tickets/Implement_Security_Hardening_and_Data_Sanitization.md b/spec/procmond/tickets/Implement_Security_Hardening_and_Data_Sanitization.md deleted file mode 100644 index c5874df7..00000000 --- a/spec/procmond/tickets/Implement_Security_Hardening_and_Data_Sanitization.md +++ /dev/null @@ -1,251 +0,0 @@ -# Implement Security Hardening and Data Sanitization - -## Overview - -Implement privilege management and data sanitization for procmond. This ticket ensures procmond operates with appropriate privileges, detects privilege requirements at startup, and sanitizes sensitive data before logging or publishing. 
- -## Scope - -**In Scope:** - -- Privilege detection at startup (capabilities on Linux, tokens on Windows, entitlements on macOS) -- Data sanitization for command-line arguments and environment variables -- Security boundary validation between procmond and agent -- Security test suite (privilege escalation, injection, DoS) -- Security documentation and threat analysis - -**Out of Scope:** - -- FreeBSD privilege management (Ticket 7) -- Performance optimization (Ticket 8) -- Advanced security features (kernel monitoring, sandboxing) - -## Technical Details - -### Privilege Detection - -**Linux:** - -- Detect CAP_SYS_PTRACE capability for full process access -- Detect CAP_DAC_READ_SEARCH for reading /proc -- Log detected capabilities at startup -- Gracefully degrade if capabilities insufficient (basic enumeration only) - -**macOS:** - -- Detect task_for_pid() entitlements -- Check for root privileges -- Log detected privileges at startup -- Gracefully degrade if privileges insufficient - -**Windows:** - -- Detect SeDebugPrivilege token -- Check for Administrator privileges -- Log detected privileges at startup -- Gracefully degrade if privileges insufficient - -**Implementation:** - -```rust -struct PrivilegeStatus { - platform: Platform, - has_full_access: bool, - capabilities: Vec<String>, - degraded_mode: bool, -} - -fn detect_privileges() -> Result<PrivilegeStatus> { - #[cfg(target_os = "linux")] - return detect_linux_capabilities(); - - #[cfg(target_os = "macos")] - return detect_macos_privileges(); - - #[cfg(target_os = "windows")] - return detect_windows_privileges(); -} -``` - -### Data Sanitization - -**Sensitive Data Patterns:** - -- Environment variables: `PASSWORD`, `SECRET`, `TOKEN`, `KEY`, `API_KEY`, `AUTH` -- Command-line arguments: `--password`, `--secret`, `--token`, `--api-key` -- File paths: `/home/*/.ssh/`, `/home/*/.aws/`, `C:\Users\*\.ssh\` - -**Sanitization Strategy:** - -- Replace sensitive values with `[REDACTED]` -- Log sanitization events 
at DEBUG level -- Apply sanitization before logging and before publishing to event bus - -**Implementation:** - -```rust -fn sanitize_command_line(cmd: &str) -> String { - let sensitive_patterns = [ - "--password", - "--secret", - "--token", - "--api-key", - "-p", - "-s", - "-t", - "-k", - ]; - - // Replace sensitive argument values with [REDACTED] - // Example: "--password secret123" -> "--password [REDACTED]" -} - -fn sanitize_env_vars(env: &HashMap<String, String>) -> HashMap<String, String> { - let sensitive_keys = ["PASSWORD", "SECRET", "TOKEN", "KEY", "API_KEY", "AUTH"]; - - // Replace sensitive values with [REDACTED] - // Example: {"API_KEY": "abc123"} -> {"API_KEY": "[REDACTED]"} -} -``` - -### Security Boundaries - -**Validation:** - -- procmond runs with elevated privileges (full process access) -- daemoneye-agent runs with minimal privileges (dropped after spawning collectors) -- Event bus communication uses Unix domain sockets (Linux/macOS) or named pipes (Windows) -- No network communication from procmond (only local IPC) -- WAL files protected with appropriate permissions (0600) - -**Threat Model:** - -- **Threat 1**: Attacker gains access to procmond process → Limited impact (no network, read-only process data) -- **Threat 2**: Attacker gains access to agent process → Cannot access privileged process data (privilege separation) -- **Threat 3**: Attacker intercepts event bus communication → Mitigated by local IPC (no network exposure) -- **Threat 4**: Attacker reads WAL files → Mitigated by file permissions (0600) - -### Security Test Suite - -**Privilege Escalation Tests:** - -- Attempt to gain unauthorized access to processes -- Verify privilege detection works correctly -- Verify graceful degradation when privileges insufficient - -**Injection Attack Tests:** - -- Malicious process names with special characters -- Malicious command lines with SQL injection attempts -- Malicious environment variables with code injection attempts - -**DoS Attack 
Tests:** - -- Excessive RPC requests (rate limiting) -- Event flooding (backpressure) -- Resource exhaustion (memory/CPU limits) - -**Data Sanitization Tests:** - -- Verify sensitive data sanitized in logs -- Verify sensitive data sanitized in published events -- Verify sanitization patterns cover common secrets - -```mermaid -graph TD - subgraph "Privilege Detection" - P1[Linux: CAP_SYS_PTRACE] - P2[macOS: task_for_pid] - P3[Windows: SeDebugPrivilege] - P1 --> P4[Log Capabilities] - P2 --> P4 - P3 --> P4 - P4 --> P5{Full Access?} - P5 -->|Yes| P6[Full Enumeration] - P5 -->|No| P7[Degraded Mode] - end - - subgraph "Data Sanitization" - S1[Command-Line Args] - S2[Environment Variables] - S3[File Paths] - S1 --> S4[Detect Sensitive Patterns] - S2 --> S4 - S3 --> S4 - S4 --> S5[Replace with REDACTED] - S5 --> S6[Log Sanitization] - S5 --> S7[Publish Sanitized Data] - end - - subgraph "Security Boundaries" - B1[procmond: Elevated] - B2[agent: Minimal] - B3[Event Bus: Local IPC] - B4[WAL: 0600 Permissions] - B1 --> B3 - B2 --> B3 - B3 --> B5[No Network Exposure] - B4 --> B5 - end -``` - -## Dependencies - -**Requires:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 5] - Test framework must exist - -**Blocks:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 8] - Performance validation needs security baseline - -## Acceptance Criteria - -### Privilege Detection - -- [ ] Linux: CAP_SYS_PTRACE and CAP_DAC_READ_SEARCH detected correctly -- [ ] macOS: task_for_pid() entitlements and root privileges detected correctly -- [ ] Windows: SeDebugPrivilege and Administrator privileges detected correctly -- [ ] Detected privileges logged at startup (INFO level) -- [ ] Graceful degradation when privileges insufficient (basic enumeration only) -- [ ] Degraded mode logged at WARN level - -### Data Sanitization - -- [ ] Command-line arguments sanitized before logging -- [ ] Command-line arguments sanitized before publishing to event bus -- [ ] Environment variables 
sanitized before logging -- [ ] Environment variables sanitized before publishing to event bus -- [ ] Sensitive patterns detected: PASSWORD, SECRET, TOKEN, KEY, API_KEY, AUTH -- [ ] Sanitization events logged at DEBUG level -- [ ] Sanitized values replaced with `[REDACTED]` - -### Security Boundaries - -- [ ] procmond runs with elevated privileges (full process access) -- [ ] daemoneye-agent runs with minimal privileges (dropped after spawning) -- [ ] Event bus communication uses local IPC (no network) -- [ ] WAL files protected with 0600 permissions -- [ ] No network communication from procmond - -### Security Test Suite - -- [ ] Privilege escalation tests pass (no unauthorized access) -- [ ] Injection attack tests pass (malicious data sanitized) -- [ ] DoS attack tests pass (rate limiting/backpressure works) -- [ ] Data sanitization tests pass (secrets not leaked in logs or events) - -### Documentation - -- [ ] Privilege detection documented for all platforms -- [ ] Data sanitization patterns documented -- [ ] Security boundaries documented with threat model -- [ ] Security test suite documented -- [ ] Threat analysis documented - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 4, Security Hardening) -- **Security Standards:** file:.cursor/rules/security/security-standards.mdc -- **Security Design:** file:docs/src/technical/security_design_overview.md diff --git a/spec/procmond/tickets/Implement_Write-Ahead_Log_and_Event_Bus_Connector.md b/spec/procmond/tickets/Implement_Write-Ahead_Log_and_Event_Bus_Connector.md deleted file mode 100644 index 16c942c8..00000000 --- a/spec/procmond/tickets/Implement_Write-Ahead_Log_and_Event_Bus_Connector.md +++ /dev/null @@ -1,154 +0,0 @@ -# Implement Write-Ahead Log and Event Bus Connector - -## Overview - -Implement durable event persistence and broker 
connectivity for procmond. This ticket establishes the foundation for reliable event delivery with crash recovery by creating the Write-Ahead Log (WAL) component and EventBusConnector that integrates with daemoneye-eventbus. - -## Scope - -**In Scope:** - -- WriteAheadLog component with bincode serialization -- Sequence-numbered WAL files with rotation at 80% capacity -- CRC32 corruption detection and recovery -- EventBusConnector with WAL integration -- Event buffering (10MB limit) with replay capability -- Connection to broker via `DAEMONEYE_BROKER_SOCKET` environment variable -- Dynamic backpressure monitoring (70% threshold) -- Unit tests for WAL and EventBusConnector - -**Out of Scope:** - -- Actor pattern implementation (Ticket 2) -- RPC service handling (Ticket 3) -- Agent-side changes (Ticket 4) -- Integration testing (Ticket 5) - -## Technical Details - -### WriteAheadLog Component - -**Location:** `file:procmond/src/wal.rs` - -**Key Responsibilities:** - -- Persist events to disk before buffering (durability guarantee) -- Use sequence-numbered files: `procmond-{sequence:05}.wal` -- Rotate when file reaches 80MB (80% of 100MB max) -- Replay events on startup for crash recovery -- Delete WAL files after successful publish -- Handle corruption with CRC32 validation - -**File Format:** - -```rust -// Bincode-serialized records with CRC32 checksums -struct WalEntry { - sequence: u64, - timestamp: DateTime<Utc>, - event: ProcessEvent, - crc32: u32, -} -``` - -### EventBusConnector Component - -**Location:** `file:procmond/src/event_bus_connector.rs` - -**Key Responsibilities:** - -- Connect to daemoneye-agent's embedded broker -- Integrate with WriteAheadLog for event persistence -- Buffer events (10MB limit) when connection lost -- Replay buffered events from WAL on reconnection -- Publish to topic hierarchy: `events.process.*` -- Monitor buffer level for backpressure (70% threshold) -- Provide shared channel reference for backpressure signaling - 
-**Backpressure Strategy:** - -- Trigger at 70% buffer capacity -- Release at 50% buffer capacity -- Signal via shared mpsc channel (to be used by Ticket 2) - -```mermaid -sequenceDiagram - participant WAL as WriteAheadLog - participant Connector as EventBusConnector - participant Broker as DaemoneyeBroker - - Note over WAL,Broker: Normal Operation - Connector->>WAL: Write event to WAL - WAL-->>Connector: Persisted (sequence number) - Connector->>Broker: Publish event - Broker-->>Connector: Acknowledged - Connector->>WAL: Mark for deletion - - Note over WAL,Broker: Connection Lost - Connector->>WAL: Write event to WAL - WAL-->>Connector: Persisted - Connector->>Connector: Buffer event (in-memory) - Note over Connector: Buffer reaches 70% - Connector->>Connector: Trigger backpressure - - Note over WAL,Broker: Reconnection - Connector->>Broker: Reconnect - Connector->>WAL: Read unpublished events - WAL-->>Connector: Event list - Connector->>Broker: Replay events - Broker-->>Connector: Acknowledged - Connector->>WAL: Delete WAL files - Note over Connector: Buffer drops below 50% - Connector->>Connector: Release backpressure -``` - -## Dependencies - -**Requires:** - -- daemoneye-eventbus client library -- Existing ProcessEvent data model from `file:daemoneye-lib/src/models/process.rs` - -**Blocks:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 2] - Actor pattern needs EventBusConnector -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 3] - RPC service needs event bus connectivity - -## Acceptance Criteria - -### WriteAheadLog - -- [ ] Events persisted to disk using bincode serialization -- [ ] Sequence-numbered files created: `procmond-00001.wal`, `procmond-00002.wal`, etc. 
-- [ ] File rotation occurs at 80MB (80% of 100MB max) -- [ ] WAL replay works correctly on startup (all unpublished events recovered) -- [ ] Corrupted entries detected via CRC32 and skipped with warning log -- [ ] WAL files deleted after all events successfully published -- [ ] Unit tests cover: persistence, rotation, replay, corruption recovery - -### EventBusConnector - -- [ ] Connects to broker using `DAEMONEYE_BROKER_SOCKET` environment variable -- [ ] Events written to WAL before buffering -- [ ] Events buffered (10MB limit) when connection lost -- [ ] Buffered events replayed on reconnection -- [ ] Events published to correct topics: `events.process.start`, `events.process.stop`, `events.process.modify` -- [ ] Backpressure triggered at 70% buffer capacity -- [ ] Backpressure released at 50% buffer capacity -- [ ] Shared channel reference provided for backpressure signaling -- [ ] Unit tests cover: connection, WAL integration, buffering, replay, backpressure - -### Integration - -- [ ] EventBusConnector successfully integrates with WriteAheadLog -- [ ] Events survive procmond crash and are replayed on restart -- [ ] No data loss during connection failures -- [ ] Backpressure mechanism ready for actor integration (Ticket 2) - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 1, Component Architecture) -- **Event Bus Architecture:** file:docs/embedded-broker-architecture.md -- **Topic Hierarchy:** file:daemoneye-eventbus/docs/topic-hierarchy.md -- **Process Models:** file:daemoneye-lib/src/models/process.rs diff --git a/spec/procmond/tickets/Validate_FreeBSD_Platform_Support.md b/spec/procmond/tickets/Validate_FreeBSD_Platform_Support.md deleted file mode 100644 index 465fc111..00000000 --- a/spec/procmond/tickets/Validate_FreeBSD_Platform_Support.md +++ /dev/null @@ -1,187 +0,0 @@ -# Validate 
FreeBSD Platform Support - -## Overview - -Validate basic process enumeration on FreeBSD and document platform limitations. This ticket ensures procmond works on FreeBSD 13+ with basic metadata collection using the FallbackProcessCollector. - -## Scope - -**In Scope:** - -- Test FallbackProcessCollector on FreeBSD 13+ -- Document FreeBSD limitations (basic metadata only, no enhanced features) -- Add platform detection and capability reporting -- Create FreeBSD-specific tests -- Update documentation with FreeBSD support status - -**Out of Scope:** - -- Enhanced metadata collection for FreeBSD (deferred to future work) -- FreeBSD-specific privilege management (basic only) -- Performance optimization for FreeBSD - -## Technical Details - -### FreeBSD Support Status - -**Current State:** - -- FallbackProcessCollector uses sysinfo crate for basic enumeration -- Basic metadata: PID, PPID, name, executable path, CPU usage, memory usage -- No enhanced metadata: network connections, file descriptors, security contexts - -**Limitations:** - -- No platform-specific collector (unlike Linux, macOS, Windows) -- Limited metadata compared to primary platforms -- Performance may be lower than platform-specific collectors - -**Acceptance:** - -- FreeBSD support is "best-effort" with documented limitations -- Basic enumeration is sufficient for FreeBSD use cases -- Enhanced features deferred to future work - -### Platform Detection - -**Implementation:** - -```rust -#[cfg(target_os = "freebsd")] -fn detect_platform_capabilities() -> PlatformCapabilities { - PlatformCapabilities { - platform: Platform::FreeBSD, - collector_type: CollectorType::Fallback, - enhanced_metadata: false, - network_connections: false, - file_descriptors: false, - security_contexts: false, - } -} -``` - -**Capability Reporting:** - -- Report platform capabilities at startup -- Log degraded status for FreeBSD (INFO level) -- Include capabilities in registration message to agent - -### FreeBSD-Specific Tests - 
-**Test Coverage:** - -- Basic process enumeration works -- PID, PPID, name, executable path collected correctly -- CPU usage and memory usage collected correctly -- Process lifecycle detection works (start/stop/modify) -- Event publishing works correctly -- No crashes or panics on FreeBSD - -**Test Environment:** - -- FreeBSD 13.0+ (latest stable) -- x86_64 and ARM64 architectures -- CI/CD integration (if FreeBSD runner available) - -```mermaid -graph TD - subgraph "Platform Detection" - D1[Detect OS: FreeBSD] - D1 --> D2[Use FallbackProcessCollector] - D2 --> D3[Report Capabilities] - D3 --> D4{Enhanced Metadata?} - D4 -->|No| D5[Log Degraded Status] - D4 -->|Yes| D6[Full Features] - end - - subgraph "FreeBSD Testing" - T1[Basic Enumeration] - T2[Lifecycle Detection] - T3[Event Publishing] - T1 --> T4[Validate Metadata] - T2 --> T4 - T3 --> T4 - T4 --> T5{Tests Pass?} - T5 -->|Yes| T6[FreeBSD Supported] - T5 -->|No| T7[Document Issues] - end - - subgraph "Documentation" - DOC1[Support Status] - DOC2[Limitations] - DOC3[Capabilities] - DOC1 --> DOC4[Update Docs] - DOC2 --> DOC4 - DOC3 --> DOC4 - end -``` - -## Dependencies - -**Requires:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 5] - Test framework must exist - -**Blocks:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 8] - Performance validation includes FreeBSD - -## Acceptance Criteria - -### Platform Detection - -- [ ] FreeBSD detected correctly at runtime -- [ ] FallbackProcessCollector used on FreeBSD -- [ ] Platform capabilities reported at startup -- [ ] Degraded status logged for FreeBSD (INFO level) -- [ ] Capabilities included in registration message - -### Basic Enumeration - -- [ ] Process enumeration works on FreeBSD 13+ -- [ ] PID collected correctly -- [ ] PPID collected correctly -- [ ] Process name collected correctly -- [ ] Executable path collected correctly -- [ ] CPU usage collected correctly -- [ ] Memory usage collected correctly - -### Lifecycle Detection - -- [ 
] Process start events detected -- [ ] Process stop events detected -- [ ] Process modification events detected -- [ ] Events published to event bus correctly - -### FreeBSD-Specific Tests - -- [ ] Basic enumeration tests pass on FreeBSD -- [ ] Lifecycle detection tests pass on FreeBSD -- [ ] Event publishing tests pass on FreeBSD -- [ ] No crashes or panics on FreeBSD -- [ ] Tests run on x86_64 and ARM64 (if available) - -### Documentation - -- [ ] FreeBSD support status documented (best-effort, basic metadata only) -- [ ] Limitations documented clearly: - - No enhanced metadata - - No network connections - - No file descriptors - - No security contexts -- [ ] Platform capabilities documented -- [ ] Future work documented (enhanced FreeBSD support) - -### CI/CD Integration - -- [ ] FreeBSD tests added to CI/CD pipeline (if runner available) -- [ ] FreeBSD tests run on pull requests -- [ ] FreeBSD test failures reported clearly - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Core Flows:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f086f464-1e81-42e8-89f5-74a8638360d1 (Flow 10: Cross-Platform Behavior) -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 5, FreeBSD Support) -- **Process Collector:** file:procmond/src/process_collector.rs -- **Existing Tests:** file:procmond/tests/os_compatibility_tests.rs diff --git a/spec/procmond/tickets/Validate_Performance_and_Optimize.md b/spec/procmond/tickets/Validate_Performance_and_Optimize.md deleted file mode 100644 index 47e53f75..00000000 --- a/spec/procmond/tickets/Validate_Performance_and_Optimize.md +++ /dev/null @@ -1,226 +0,0 @@ -# Validate Performance and Optimize - -## Overview - -Validate procmond performance against targets and optimize if needed. 
This ticket ensures procmond meets performance requirements: enumerate 1,000 processes in \<100ms, support 10,000+ processes, use \<100MB memory, and maintain \<5% CPU usage. - -## Scope - -**In Scope:** - -- Benchmark process enumeration (1,000 processes target: \<100ms) -- Load testing with 10,000+ processes -- Memory profiling (target: \<100MB sustained) -- CPU monitoring (target: \<5% sustained) -- Regression testing to prevent degradation -- Performance optimization if targets not met -- Performance documentation - -**Out of Scope:** - -- Advanced performance features (kernel monitoring, eBPF) -- Performance tuning for specific workloads -- Distributed performance testing - -## Technical Details - -### Performance Targets - -| Metric | Target | Measurement Method | -| ----------------------- | ------------------------------- | --------------------------------- | -| Process Enumeration | \<100ms for 1,000 processes | Criterion benchmark | -| Large-Scale Support | 10,000+ processes without issue | Load testing with synthetic procs | -| Memory Usage | \<100MB sustained | Memory profiler (heaptrack, etc.) 
| -| CPU Usage | \<5% sustained | System monitoring (top, htop) | -| Event Publishing | >1,000 events/sec | Throughput benchmark | -| WAL Write Performance | >500 writes/sec | WAL-specific benchmark | -| Backpressure Activation | \<1s to adjust interval | Chaos test measurement | - -### Benchmarking Strategy - -**Criterion Benchmarks:** - -- Process enumeration on all platforms (Linux, macOS, Windows, FreeBSD) -- Event publishing throughput -- WAL write performance -- Configuration hot-reload latency -- RPC request/response latency - -**Load Testing:** - -- Spawn 10,000+ synthetic processes -- Monitor procmond behavior under load -- Validate no degradation or crashes -- Measure memory and CPU usage - -**Memory Profiling:** - -- Use heaptrack (Linux), Instruments (macOS), or similar tools -- Identify memory leaks or excessive allocations -- Validate \<100MB sustained usage -- Profile WAL and event buffer memory usage - -**CPU Monitoring:** - -- Monitor CPU usage during continuous operation -- Validate \<5% sustained usage -- Identify CPU hotspots with profiler -- Optimize hot paths if needed - -### Optimization Strategies - -**If Targets Not Met:** - -1. **Process Enumeration Optimization:** - - - Reduce syscall overhead - - Batch process queries - - Cache frequently accessed data - - Use platform-specific optimizations - -2. **Memory Optimization:** - - - Reduce event buffer size if excessive - - Optimize WAL file rotation - - Use more efficient data structures - - Profile and eliminate memory leaks - -3. **CPU Optimization:** - - - Reduce collection frequency if needed - - Optimize hot paths (profiler-guided) - - Use more efficient algorithms - - Reduce logging overhead - -4. 
**Event Publishing Optimization:** - - - Batch event publishing - - Optimize serialization (bincode) - - Reduce event size if possible - - Optimize topic matching - -```mermaid -graph TD - subgraph "Benchmarking" - B1[Process Enumeration] - B2[Event Publishing] - B3[WAL Performance] - B4[RPC Latency] - B1 --> B5[Criterion Results] - B2 --> B5 - B3 --> B5 - B4 --> B5 - end - - subgraph "Load Testing" - L1[Spawn 10k Processes] - L2[Monitor Memory] - L3[Monitor CPU] - L4[Monitor Throughput] - L1 --> L5[Load Test Results] - L2 --> L5 - L3 --> L5 - L4 --> L5 - end - - subgraph "Profiling" - P1[Memory Profiler] - P2[CPU Profiler] - P3[Identify Hotspots] - P1 --> P3 - P2 --> P3 - P3 --> P4[Optimization Targets] - end - - B5 --> O1{Targets Met?} - L5 --> O1 - O1 -->|Yes| O2[Document Performance] - O1 -->|No| P4 - P4 --> O3[Optimize] - O3 --> B1 -``` - -### Regression Testing - -**Strategy:** - -- Establish performance baselines with criterion -- Run benchmarks on every pull request -- Fail CI if performance regresses >10% -- Document acceptable performance ranges - -**Baseline Storage:** - -- Store criterion baselines in repository -- Update baselines when intentional changes made -- Track performance trends over time - -## Dependencies - -**Requires:** - -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 6] - Security hardening must be complete -- ticket:54226c8a-719a-479a-863b-9c91f43717a9/[Ticket 7] - FreeBSD support must be validated - -**Blocks:** - -- None (final ticket in Epic) - -## Acceptance Criteria - -### Process Enumeration Performance - -- [ ] Enumerate 1,000 processes in \<100ms (average) on Linux -- [ ] Enumerate 1,000 processes in \<100ms (average) on macOS -- [ ] Enumerate 1,000 processes in \<100ms (average) on Windows -- [ ] Enumerate 1,000 processes in \<200ms (average) on FreeBSD (degraded acceptable) - -### Large-Scale Support - -- [ ] Support 10,000+ processes without crashes -- [ ] Support 10,000+ processes without memory leaks -- [ ] Support 
10,000+ processes without performance degradation -- [ ] Load testing validates stability under high process count - -### Memory Usage - -- [ ] Memory usage \<100MB during normal operation (1,000 processes) -- [ ] Memory usage \<200MB during high load (10,000 processes) -- [ ] No memory leaks detected by profiler -- [ ] WAL and event buffer memory usage within limits - -### CPU Usage - -- [ ] CPU usage \<5% during continuous monitoring (1,000 processes) -- [ ] CPU usage \<10% during high load (10,000 processes) -- [ ] No CPU hotspots identified by profiler -- [ ] Collection interval adjustment reduces CPU usage under backpressure - -### Event Publishing Performance - -- [ ] Publish >1,000 events/sec to event bus -- [ ] WAL write performance >500 writes/sec -- [ ] Backpressure activation \<1s to adjust interval -- [ ] Event publishing throughput validated by benchmark - -### Regression Testing - -- [ ] Criterion baselines established for all benchmarks -- [ ] Benchmarks run on every pull request -- [ ] CI fails if performance regresses >10% -- [ ] Performance trends tracked over time - -### Documentation - -- [ ] Performance targets documented -- [ ] Benchmarking methodology documented -- [ ] Optimization strategies documented -- [ ] Performance baselines documented -- [ ] Regression testing documented - -## References - -- **Epic Brief:** spec:54226c8a-719a-479a-863b-9c91f43717a9/0fc3298b-37df-4722-a761-66a5a0da16b3 -- **Tech Plan:** spec:54226c8a-719a-479a-863b-9c91f43717a9/f70103e2-e7ef-494f-8638-5a7324565f28 (Phase 6, Performance Validation) -- **Performance Standards:** file:.cursor/rules/rust/performance-optimization.mdc -- **Existing Benchmarks:** file:procmond/benches/process_collector_benchmarks.rs diff --git a/spec/product.md b/spec/product.md deleted file mode 100644 index ea43b28b..00000000 --- a/spec/product.md +++ /dev/null @@ -1,78 +0,0 @@ -# DaemonEye Product Overview - -DaemonEye is a security-focused, high-performance process monitoring system 
built for cybersecurity professionals, threat hunters, and security operations centers. **Its primary purpose is to detect suspicious activity on systems by monitoring abnormal process behavior and patterns.** This is a complete Rust 2024 rewrite of a proven Python prototype, delivering enterprise-grade performance with audit-grade integrity. - -## Core Mission - -Detect and alert on suspicious system activity through continuous process monitoring, behavioral analysis, and pattern recognition. Provide security operations teams with a reliable, high-performance threat detection solution that operates independently of external dependencies while maintaining audit-grade integrity and operator-centric workflows. - -## Key Value Propositions - -- **Audit-Grade Integrity**: Tamper-evident, cryptographically chained logs suitable for compliance and forensics -- **Offline-First Operation**: Full functionality without internet access, perfect for airgapped environments -- **Security-First Architecture**: Privilege separation, sandboxed execution, and minimal attack surface -- **High Performance**: \<5% CPU overhead while monitoring 10,000+ processes with sub-second enumeration -- **Operator-Centric Design**: Built for operators, by operators, with workflows optimized for contested environments - -## Three-Component Architecture - -1. **procmond** (Privileged Process Collector): Runs with elevated privileges, focused solely on process monitoring with minimal attack surface. Communicates via protobuf IPC with daemoneye-agent. -2. **daemoneye-agent** (User-Space Orchestrator): Operates in user space with minimal privileges for alerting and network delivery. Manages procmond lifecycle and translates SQL rules to simple detection tasks. -3. **daemoneye-cli** (Command-Line Interface): Local interface for data queries, result exports, and service configuration. Communicates with daemoneye-agent for all operations. 
- -## Tiered Deployment Architecture - -### Free Tier - -- Standalone agent deployment (procmond + daemoneye-agent + daemoneye-cli) -- Local process monitoring and detection -- Basic alerting and data export - -### Business Tier - -- **Security Center**: Centralized management and aggregation -- **Enterprise Integrations**: Splunk, Elasticsearch, Kafka connectors -- **Curated Rule Packs**: Pre-built detection rules for common threats -- **Container Support**: Kubernetes DaemonSet deployment -- **Code Signing**: Signed installers for Windows/macOS - -### Enterprise Tier (Custom Pricing) - -- **Kernel-Level Monitoring**: Real-time eBPF/ETW/EndpointSecurity -- **Federated Architecture**: Multi-tier Security Center hierarchy -- **Advanced SIEM Integration**: STIX/TAXII, compliance mappings -- **Hardened Security**: SLSA provenance, Cosign signatures -- **Network Correlation**: Process-to-network event correlation - -## Target Users - -- SOC Analysts monitoring fleet infrastructure for process anomalies -- Security Operations & Incident Response Teams investigating compromised systems -- System Reliability Engineers requiring low-overhead monitoring -- Blue Team Security Engineers integrating with existing security infrastructure -- DevSecOps Teams embedding security monitoring in deployments - -## Key Features - -### Threat Detection Capabilities - -- **Process Behavior Analysis**: Detect process hollowing, executable integrity violations, suspicious parent-child relationships -- **Anomaly Detection**: Identify unusual resource consumption patterns and suspicious process name duplications -- **SQL-Based Detection Engine**: Flexible rule creation using standard SQL queries with sandboxed execution -- **Built-in Detection Rules**: Comprehensive library covering common threat patterns and attack techniques - -### System Integration - -- Cross-platform support (Linux, macOS, Windows) with native OS integration -- Multi-channel alerting (stdout, syslog, webhooks, email) for 
SIEM integration -- Tamper-evident audit logging with BLAKE3 cryptographic integrity for forensic analysis -- Resource-bounded operation with graceful degradation under load -- Offline-first operation with bundle-based configuration distribution - -### Enterprise Features - -- **Federated Management**: Multi-tier Security Center architecture for large deployments -- **Real-time Monitoring**: Kernel-level event subscription (eBPF, ETW, EndpointSecurity) -- **Advanced Integrations**: STIX/TAXII feeds, compliance framework mappings -- **Hardened Security**: SLSA Level 3 provenance, hardware-backed code signing -- **Network Correlation**: Process-to-network event correlation for lateral movement detection diff --git a/spec/product_strategy.md b/spec/product_strategy.md deleted file mode 100644 index 29c87a38..00000000 --- a/spec/product_strategy.md +++ /dev/null @@ -1,1241 +0,0 @@ -# Product Strategy - -## Overview - -### Naming Glossary - -- Product: DaemonEye -- Components: - - procmond (Privileged Process Collector) - - daemoneye-agent (Detection Orchestrator) - - daemoneye-cli (Operator CLI) - - daemoneye-lib (Shared library) - - collector-core (Collector framework) - -DaemonEye is an agent-centric system monitoring tool that operates autonomously on each host to collect and analyze process information locally. The daemoneye-agent component runs independently on each system, collecting process data and executing detection rules without requiring a central server for core functionality. - -For enterprise deployments, an optional central server can be deployed to aggregate data from multiple agents, provide centralized management, and enable fleet-wide analysis. However, the core monitoring and detection capabilities function entirely within the local agent, ensuring the system remains operational even in airgapped or isolated environments. 
- -## Core Functionality - -Setting aside the high-level mission statement, the core functionality of the product is to allow administrators to run queries against the running system to detect suspicious activity. This is done by using a SQL-like query language that is translated into a set of detection tasks for "monitoring" collectors that run on the system and watch processes, files, network connections, and other system resources. When these tasks detect the portion of the "query" they are responsible for, they will trigger additional enrichment by "triggerable" collectors that will run in parallel and provide additional context for the detection. Combining these multiple collectors to return results in the form of a virtual "table" that can be queried against using the same SQL-like language is the core of the system functionality. - -The daemoneye-agent is responsible for taking the "SQL queries" and turning them into "detection tasks" that are then sent to the appropriate collectors, and then collecting the results and maintaining the virtual database structure. 
- -## Prioritization - -Given the core functionality, the most important features are: - -- A robust SQL-like query language that is easy to learn and use and allows for complex queries with as much cross-platform consistency as possible -- A set of "monitoring" collectors that are responsible for watching the system and detecting suspicious activity, collecting as much as possible while remaining as small and efficient as possible -- A set of "triggerable" collectors that are responsible for providing additional context for the detection, gathering as much as possible while being consistent and easy to write queries against -- A set of "alerting" sinks that are responsible for delivering the results to the administrator - -From a more practical standpoint, that means that the only real way to truly enhance the project core is to continue to standardize procmond, the collector-core framework, and the agent. While we can always add more sinks, more central monitoring and management, and easier deployment, those features are generally not core to the project's success and are geared more for the business and enterprise customers. The emphasis for this repository is to define an extensive and robust collector-core framework that can be used to build a wide variety of collectors and agents, and then to build a wide variety of agents that can be used to detect suspicious activity on the system. While each platform does offer an open metadata value, which can contain platform-specific data, the goal is to find the best features for each platform and try to offer them in a consistent way across all platforms. We will be calling these "collector profiles" and they will be defined similarly to database schemas. - -### Collector Types - -- `monitor`: These collectors are responsible for monitoring the processes through either subscription or polling.
They will be responsible for collecting the most basic metadata about the process and then triggering the triggerable collectors to collect more detailed metadata. They will run as daemons, with their lifecycle managed by the daemoneye-agent, and will typically run for the entire time that the daemoneye-agent is running. They will only poll if they have alert tasks or if they do not have the ability to subscribe to the process events. -- `triggerable`: These collectors are responsible for collecting more detailed metadata about the process. They do not poll or subscribe to events and will only examine data when provided a task from the daemoneye-agent from a monitor collector. They will populate their own virtual tables in the agent's virtual database on demand. They will run as daemons, with their lifecycle managed by the daemoneye-agent, and will remain idle awaiting a task provided over their IPC channel. Their tasking is generally set up using the special DaemonEye SQL dialect syntax `AUTO JOIN` to automatically collect the data they need (see `daemon_eye_spec_sql_to_ipc_detection_architecture.md` for more details). - -### Collector Roadmap - -The collector framework exposes virtual tables that can be queried using SQL-like syntax. Each collector provides specific data schemas that represent system resources and analysis results.
- -#### Composite Primary Key - -- All process-scoped virtual tables MUST use a composite primary key of: (pid, \<table_primary_time_field>, process_instance_id) -- process_instance_id is a deterministic UUIDv5 computed as: - - Namespace: "daemoneye/process" - - Name string: "{boot_id}:{pid}:{process_start_time_ns}" -- Boot ID source (platform-specific): - - Linux: /proc/sys/kernel/random/boot_id (primary); fallback: deterministic UUIDv5 from normalized btime timestamp - - macOS: kern.bootsessionuuid (primary); fallback: deterministic UUIDv5 from normalized kern.boottime - - Windows: Win32_OperatingSystem.LastBootUpTime (WMI) rendered as RFC 3339 UTC; no fallback (avoid clock arithmetic) - - FreeBSD: deterministic UUIDv5 from normalized kern.boottime timestamp -- Timestamps: - - All times MUST be stored in UTC using RFC 3339 format - - Nanosecond precision MUST be used when available; otherwise, use the highest precision provided by the OS - - **Mixed-precision handling**: Normalize all timestamps to RFC 3339 with nanoseconds padded/truncated to consistent canonical form - - Record precision metadata when true nanoseconds are unavailable (e.g., `precision: "microseconds"`) -- Table requirements: - - Every virtual table MUST include process_instance_id - - Each table MUST explicitly specify its primary time field used in the composite key -- **Primary Time Field Mapping:** - - `processes`: `start_time` - - `network_connections`: `created_time` - - `file_events`: `event_time` - - `memory_patterns`: `scan_time` - - `heap_analysis`: `analysis_time` - - `exploit_detection`: `analysis_time` - - `protocol_analysis`: `analysis_time` - -Triggerable collectors are not allowed to trigger other triggerable collectors. The daemoneye-agent will be responsible for triggering the appropriate collectors based on the SQL query and, when all data is returned, the daemoneye-agent will then trigger the alerting sinks to deliver the results. 
The alerting sinks will be responsible for delivering the results to the administrator. This decoupling allows the individual components to be more focused and have a reduced attack surface. - -#### Monitor Collectors (Continuous System Monitoring) - -##### Processes Collector (procmond) - -Status: Core framework implemented, cross-platform optimizations in progress - -**Virtual Table: `processes`** - -Core fields available across all platforms (provided by `sysinfo`): - -- `pid` (integer): Process ID (effective PK) -- `ppid` (integer): Parent Process ID -- `name` (string): Process name -- `executable_path` (string): Full path to executable -- `command_line` (string): Complete command line arguments -- `state` (string): Process state (running, sleeping, stopped, zombie) -- `cpu_usage` (float): CPU usage percentage -- `accumulated_cpu_time` (float): Total accumulated CPU time in milliseconds -- `memory_rss` (integer): Resident Set Size in bytes -- `memory_vms` (integer): Virtual Memory Size in bytes -- `start_time` (timestamp): Process start time -- `run_time` (integer): Process run time in seconds -- `user_id` (integer): User ID of process owner -- `effective_user_id` (integer): Effective user ID of process owner -- `group_id` (integer): Group ID of process owner -- `effective_group_id` (integer): Effective group ID of process owner -- `working_directory` (string): Current working directory -- `environment_variables` (json): Process environment variables -- `disk_usage` (json): Disk usage statistics (read/write bytes) -- `thread_count` (integer): Number of threads in the process -- `priority` (integer): Process priority -- `nice_value` (integer): Process nice value -- `parent_process` (string): Parent process name -- `root_directory` (string): Process root directory -- `session_id` (integer): Process session ID - -**Table primary time field:** `start_time` (used in composite primary key: pid + start_time + process_instance_id) - -**Sensitive Data Handling:** - 
-Fields likely to contain PII/secrets (marked with `sensitive: true`): - -- `environment_variables` (json): Process environment variables (sensitive: true) -- `command_line` (string): Complete command line arguments (sensitive: true) -- `working_directory` (string): Current working directory (sensitive: true) -- `root_directory` (string): Process root directory (sensitive: true) - -**Privacy Controls:** - -- **Redaction Rules**: Mask API keys, tokens, passwords using regex patterns (e.g., `(?i)(password|token|key|secret)\s*[:=]\s*\S+`) -- **Encryption-at-Rest**: All sensitive fields must be encrypted in database storage -- **Access Controls**: Sensitive fields require elevated permissions and audit logging -- **Retention Policy**: Sensitive data retention limited to 30 days with automated purge -- **Compliance Mapping**: GDPR (right to erasure), CCPA (data minimization), HIPAA (PHI protection), SOC2 (data handling) - -**Platform-Specific Extensions:** - -Linux extensions (provided by `sysinfo` and `procfs` crates): - -- `memory_maps` (json): Memory mapping information from /proc/[pid]/maps -- `file_descriptors` (json): Open file descriptors from /proc/[pid]/fd -- `network_connections` (json): Network connections from /proc/[pid]/net -- `namespaces` (json): Process namespace information -- `capabilities` (json): Linux capability sets -- `security_context` (string): SELinux/AppArmor security context -- `io_statistics` (json): I/O statistics from /proc/[pid]/io -- `limits` (json): Process resource limits from /proc/[pid]/limits -- `oom_score` (integer): Out-of-memory score -- `oom_score_adj` (integer): OOM score adjustment -- `cgroup` (json): Control group information -- `autogroup` (json): Process autogroup information -- `clear_refs` (json): Clear references information -- `coredump_filter` (json): Core dump filter settings -- `mountinfo` (json): Mount information -- `mountstats` (json): Mount statistics -- `smaps` (json): Detailed memory mapping from 
/proc/[pid]/smaps -- `smaps_rollup` (json): Rolled-up memory mapping summary -- `wchan` (string): Wait channel information -- `task_stat` (json): Per-task statistics from /proc/[pid]/task/[tid]/stat -- `task_status` (json): Per-task status from /proc/[pid]/task/[tid]/status -- `task_io` (json): Per-task I/O statistics from /proc/[pid]/task/[tid]/io -- `task_children` (json): Child processes from /proc/[pid]/task/[tid]/children - -macOS extensions (provided by `sysinfo` and `mach2` crates): - -- `code_signing` (json): Code signing information and entitlements -- `bundle_info` (json): Application bundle metadata -- `sip_protected` (boolean): System Integrity Protection status -- `sandboxed` (boolean): Sandboxed process detection -- `integrity_level` (string): Process integrity level -- `task_info` (json): Mach task information including virtual memory statistics -- `thread_info` (json): Thread information and scheduling details -- `vm_region` (json): Virtual memory region information -- `vm_statistics` (json): Virtual memory statistics -- `thread_act` (json): Thread activation information -- `thread_policy` (json): Thread scheduling policy -- `thread_status` (json): Thread status and registers -- `port_info` (json): Mach port information -- `semaphore_info` (json): Semaphore information -- `clock_info` (json): Clock and timing information -- `exception_info` (json): Exception handling information -- `dyld_info` (json): Dynamic linker information -- `loader_info` (json): Mach-O loader information - -Windows extensions (provided by `sysinfo` and `windows-rs` crates): - -- `process_token` (json): Process security token information -- `protected_process` (boolean): Protected process detection -- `system_process` (boolean): System process identification -- `uac_elevated` (boolean): UAC elevation status -- `windows_service` (boolean): Windows service detection -- `session_id` (integer): Terminal session ID -- `integrity_level` (string): Process integrity level -- 
`handle_count` (integer): Number of open handles -- `page_faults` (integer): Page fault count -- `working_set` (integer): Working set size -- `peak_working_set` (integer): Peak working set size -- `gpu_usage` (float): GPU usage percentage -- `gpu_memory_used` (integer): GPU memory usage in bytes -- `thread_count` (integer): Number of threads -- `priority_class` (string): Process priority class -- `affinity_mask` (integer): CPU affinity mask -- `creation_time` (timestamp): Process creation time -- `exit_time` (timestamp): Process exit time -- `kernel_time` (integer): Kernel mode time -- `user_time` (integer): User mode time -- `security_descriptor` (json): Security descriptor information -- `job_object` (json): Job object information -- `window_station` (string): Window station name -- `desktop` (string): Desktop name -- `window_title` (string): Main window title -- `command_line` (string): Full command line -- `environment_variables` (json): Environment variables -- `dll_list` (json): Loaded DLL information -- `registry_access` (json): Registry access information -- `file_access` (json): File access information -- `network_access` (json): Network access information - -##### Network Collector (netmond) - -Status: Planned - -**Virtual Table: `network_connections`** - -Core fields available across all platforms (provided by `sysinfo`): - -- `pid` (integer): Process ID owning the connection (effective PK) -- `connection_id` (string): Unique connection identifier (RFC4122 v5 UUID derived from the connection 5-tuple (protocol + local_address + local_port + remote_address + remote_port) plus socket_inode + created_time_ms) -- `protocol` (string): Protocol (tcp, udp, unix) -- `local_address` (string): Local IP address -- `local_port` (integer): Local port number -- `remote_address` (string): Remote IP address -- `remote_port` (integer): Remote port number -- `state` (string): Connection state (established, listening, time_wait, etc.)
-- `process_name` (string): Name of the owning process -- `created_time` (timestamp): Connection creation time -- `last_activity` (timestamp): Last activity time -- `inode` (integer): Socket inode number -- `socket_type` (string): Socket type (stream, dgram, raw, etc.) -- `family` (string): Address family (inet, inet6, unix) -- `flags` (integer): Socket flags -- `backlog` (integer): Listen backlog queue length -- `receive_queue` (integer): Receive queue length -- `send_queue` (integer): Send queue length -- `uid` (integer): User ID of socket owner -- `gid` (integer): Group ID of socket owner - -**Table primary time field:** `created_time` (used in composite primary key: pid + connection_id + created_time) - -**Virtual Table: `network_interfaces`** - -Core fields available across all platforms (provided by `sysinfo`): - -- `interface_name` (string): Network interface name -- `interface_type` (string): Interface type (ethernet, wifi, loopback, etc.) -- `status` (string): Interface status (up, down, unknown) -- `bytes_sent` (integer): Total bytes sent -- `bytes_received` (integer): Total bytes received -- `packets_sent` (integer): Total packets sent -- `packets_received` (integer): Total packets received -- `errors_sent` (integer): Send errors -- `errors_received` (integer): Receive errors -- `drops_sent` (integer): Send drops -- `drops_received` (integer): Receive drops -- `mac_address` (string): MAC address of the interface -- `ip_networks` (json): IP networks associated with the interface -- `mtu` (integer): Maximum Transmission Unit -- `speed` (integer): Interface speed in Mbps -- `duplex` (string): Duplex mode (full, half, unknown) -- `carrier` (boolean): Carrier detection status -- `operstate` (string): Operational state -- `link_mode` (string): Link mode -- `address` (string): Interface address -- `broadcast` (string): Broadcast address -- `netmask` (string): Network mask -- `rx_bytes` (integer): Bytes received since last refresh -- `tx_bytes` (integer): Bytes 
transmitted since last refresh -- `rx_packets` (integer): Packets received since last refresh -- `tx_packets` (integer): Packets transmitted since last refresh -- `rx_errors` (integer): Receive errors since last refresh -- `tx_errors` (integer): Transmit errors since last refresh -- `rx_dropped` (integer): Receive drops since last refresh -- `tx_dropped` (integer): Transmit drops since last refresh - -**Platform-Specific Extensions:** - -Linux extensions (provided by `sysinfo` and `procfs` crates): - -- `namespace_id` (integer): Network namespace ID -- `container_id` (string): Container identifier -- `vlan_id` (integer): VLAN ID -- `bond_master` (string): Bonding master interface -- `mtu` (integer): Maximum Transmission Unit -- `arp_table` (json): ARP table entries from /proc/net/arp -- `route_table` (json): Routing table from /proc/net/route -- `tcp_connections` (json): TCP connections from /proc/net/tcp -- `udp_connections` (json): UDP connections from /proc/net/udp -- `unix_sockets` (json): Unix domain sockets from /proc/net/unix -- `snmp_stats` (json): SNMP statistics from /proc/net/snmp -- `snmp6_stats` (json): SNMP6 statistics from /proc/net/snmp6 -- `net_dev_stats` (json): Network device statistics from /proc/net/dev -- `tcp_mem` (json): TCP memory usage from /proc/net/tcp_mem -- `tcp_congestion` (string): TCP congestion control algorithm -- `tcp_window_scaling` (boolean): TCP window scaling enabled -- `tcp_timestamps` (boolean): TCP timestamps enabled -- `tcp_sack` (boolean): TCP SACK enabled -- `tcp_fack` (boolean): TCP FACK enabled -- `tcp_dsack` (boolean): TCP DSACK enabled -- `tcp_ecn` (boolean): TCP ECN enabled -- `tcp_abc` (boolean): TCP ABC enabled -- `tcp_syncookies` (boolean): TCP SYN cookies enabled -- `tcp_fastopen` (boolean): TCP Fast Open enabled -- `tcp_autocorking` (boolean): TCP auto corking enabled -- `tcp_no_delay_ack` (boolean): TCP no delay ACK enabled -- `tcp_thin_linear_timeouts` (boolean): TCP thin linear timeouts enabled -- 
`tcp_thin_dupack` (boolean): TCP thin duplicate ACK enabled -- `tcp_early_retrans` (boolean): TCP early retransmission enabled -- `tcp_reordering` (integer): TCP reordering threshold -- `tcp_retrans_collapse` (boolean): TCP retrans collapse enabled -- `tcp_keepalive_time` (integer): TCP keepalive time -- `tcp_keepalive_probes` (integer): TCP keepalive probes -- `tcp_keepalive_intvl` (integer): TCP keepalive interval -- `tcp_retries1` (integer): TCP retries 1 -- `tcp_retries2` (integer): TCP retries 2 -- `tcp_orphan_retries` (integer): TCP orphan retries -- `tcp_tw_reuse` (boolean): TCP TIME_WAIT socket reuse -- `tcp_fin_timeout` (integer): TCP FIN timeout -- `tcp_tw_recycle` (boolean): TCP TIME_WAIT socket recycling -- `tcp_max_tw_buckets` (integer): TCP maximum TIME_WAIT buckets -- `tcp_max_syn_backlog` (integer): TCP maximum SYN backlog -- `tcp_syn_retries` (integer): TCP SYN retries -- `tcp_synack_retries` (integer): TCP SYN-ACK retries -- `tcp_abort_on_overflow` (boolean): TCP abort on overflow -- `tcp_stdurg` (boolean): TCP strict URG -- `tcp_rfc1337` (boolean): TCP RFC 1337 - -macOS extensions (provided by `sysinfo` and `mach2` crates): - -- `service_name` (string): Network service name -- `bonjour_services` (json): Bonjour service discovery -- `airport_info` (json): WiFi airport information -- `energy_impact` (float): Network energy impact -- `network_service_order` (json): Network service order configuration -- `dns_configuration` (json): DNS configuration -- `proxy_settings` (json): Proxy settings -- `firewall_status` (string): Firewall status -- `stealth_mode` (boolean): Stealth mode enabled -- `block_all_incoming` (boolean): Block all incoming connections -- `application_firewall` (json): Application firewall rules -- `network_location` (string): Current network location -- `vpn_connections` (json): VPN connection information -- `wifi_networks` (json): Available WiFi networks -- `bluetooth_devices` (json): Bluetooth device information -- 
`network_interfaces_detailed` (json): Detailed interface information -- `routing_table` (json): Routing table information -- `arp_cache` (json): ARP cache entries -- `network_statistics` (json): Network statistics -- `socket_statistics` (json): Socket statistics -- `tcp_statistics` (json): TCP statistics -- `udp_statistics` (json): UDP statistics -- `icmp_statistics` (json): ICMP statistics -- `ip_statistics` (json): IP statistics -- `interface_statistics` (json): Interface statistics -- `network_quality` (json): Network quality metrics -- `bandwidth_usage` (json): Bandwidth usage statistics -- `connection_history` (json): Connection history -- `network_diagnostics` (json): Network diagnostics information - -Windows extensions (provided by `sysinfo` and `windows-rs` crates): - -- `adapter_guid` (string): Network adapter GUID -- `dhcp_enabled` (boolean): DHCP enabled status -- `dns_servers` (json): DNS server configuration -- `firewall_status` (string): Windows Firewall status -- `hyper_v_vswitch` (string): Hyper-V virtual switch -- `network_adapter_info` (json): Network adapter information -- `ip_configuration` (json): IP configuration -- `routing_table` (json): Routing table -- `arp_table` (json): ARP table -- `netstat_connections` (json): Netstat connection information -- `tcp_connections` (json): TCP connections -- `udp_connections` (json): UDP connections -- `network_interfaces` (json): Network interfaces -- `network_profiles` (json): Network profiles -- `wifi_profiles` (json): WiFi profiles -- `bluetooth_devices` (json): Bluetooth devices -- `vpn_connections` (json): VPN connections -- `network_bridge` (json): Network bridge information -- `network_team` (json): Network team information -- `network_lbfo` (json): Network LBFO information -- `network_qos` (json): Network QoS information -- `network_security` (json): Network security settings -- `network_monitoring` (json): Network monitoring information -- `network_diagnostics` (json): Network diagnostics -- 
`network_performance` (json): Network performance metrics -- `network_usage` (json): Network usage statistics -- `network_events` (json): Network events -- `network_logs` (json): Network logs -- `network_traces` (json): Network traces -- `network_captures` (json): Network captures -- `network_analysis` (json): Network analysis -- `network_forensics` (json): Network forensics -- `network_compliance` (json): Network compliance -- `network_audit` (json): Network audit information - -##### Filesystem Collector (fsmond) - -Status: Planned - -**Virtual Table: `file_events`** - -Core fields available across all platforms (provided by `sysinfo` and platform-specific crates): - -- `process_pid` (integer): Process ID that triggered the event (effective PK) -- `event_id` (string): Unique event identifier -- `canonical_id` (string): Cross-platform canonical identifier (foreign key to `file_metadata.canonical_id`) -- `event_type` (string): Event type (create, modify, delete, access, move) -- `file_path` (string): Full path to the file (foreign key to `file_metadata.file_path` on Windows) -- `file_name` (string): File name only -- `directory` (string): Directory containing the file -- `file_size` (integer): File size in bytes -- `file_type` (string): File type (regular, directory, symlink, etc.) 
-- `permissions` (string): File permissions (octal or symbolic) -- `owner_user` (string): File owner username -- `owner_group` (string): File owner group -- `created_time` (timestamp): File creation time -- `modified_time` (timestamp): File modification time -- `accessed_time` (timestamp): File access time -- `event_time` (timestamp): When the event occurred -- `process_name` (string): Process name that triggered the event -- `mount_point` (string): Mount point containing the file -- `filesystem_type` (string): Type of filesystem -- `device_id` (integer): Device ID containing the file -- `inode` (integer): Inode number (Linux/Unix) (foreign key to `file_metadata.inode` on Unix systems) -- `hard_links` (integer): Number of hard links -- `block_size` (integer): Filesystem block size -- `blocks_allocated` (integer): Number of blocks allocated -- `file_mode` (integer): File mode bits -- `file_uid` (integer): File owner UID -- `file_gid` (integer): File owner GID -- `file_flags` (integer): File flags (Linux) -- `file_generation` (integer): File generation number -- `file_version` (integer): File version -- `file_attributes` (integer): File attributes -- `file_creation_time` (timestamp): File creation time (Windows) -- `file_last_write_time` (timestamp): Last write time (Windows) -- `file_last_access_time` (timestamp): Last access time (Windows) -- `file_change_time` (timestamp): Last change time (Linux/Unix) - -**Join Logic for file_events ↔ file_metadata:** - -- **Cross-platform**: Use `file_events.canonical_id = file_metadata.canonical_id` -- **Unix/Linux**: Use `file_events.inode = file_metadata.inode AND file_events.device_id = file_metadata.device_id` -- **Windows**: Use `file_events.file_path = file_metadata.file_path` - -**Virtual Table: `file_metadata`** - -> [!NOTE] -> `file_metadata` is system-scoped (not process-scoped). 
Primary keys are platform-specific: -> -> - **Unix/Linux**: Primary key = `inode` + `device_id` -> - **Windows**: Primary key = `file_path` -> - **Cross-platform**: Use `canonical_id` (computed hash) for joins - -Core fields available across all platforms (provided by `sysinfo` and platform-specific crates): - -- `canonical_id` (string): Cross-platform canonical identifier (computed hash for joins) -- `file_path` (string): Full path to the file -- `file_name` (string): File name only -- `directory` (string): Directory containing the file -- `file_size` (integer): File size in bytes -- `file_type` (string): File type (regular, directory, symlink, etc.) -- `permissions` (string): File permissions -- `owner_user` (string): File owner username -- `owner_group` (string): File owner group -- `created_time` (timestamp): File creation time -- `modified_time` (timestamp): File modification time -- `accessed_time` (timestamp): File access time -- `sha256_hash` (string): SHA-256 hash of file contents -- `mount_point` (string): Mount point containing the file -- `filesystem_type` (string): Type of filesystem -- `device_id` (integer): Device ID containing the file -- `inode` (integer): Inode number (Linux/Unix) (effective primary key in file_metadata on Unix systems; referenced by file_events.inode as a foreign key) -- `hard_links` (integer): Number of hard links -- `block_size` (integer): Filesystem block size -- `blocks_allocated` (integer): Number of blocks allocated -- `file_mode` (integer): File mode bits -- `file_uid` (integer): File owner UID -- `file_gid` (integer): File owner GID -- `file_flags` (integer): File flags (Linux) -- `file_generation` (integer): File generation number -- `file_version` (integer): File version -- `file_attributes` (integer): File attributes -- `file_creation_time` (timestamp): File creation time (Windows) -- `file_last_write_time` (timestamp): Last write time (Windows) -- `file_last_access_time` (timestamp): Last access time (Windows) -- 
`file_change_time` (timestamp): Last change time (Linux/Unix) -- `symlink_target` (string): Symbolic link target (if applicable) -- `file_extension` (string): File extension -- `mime_type` (string): MIME type of the file -- `file_encoding` (string): File encoding -- `file_compression` (string): Compression type -- `file_encryption` (string): Encryption status -- `file_backup` (boolean): Backup status -- `file_archive` (boolean): Archive status -- `file_hidden` (boolean): Hidden file status -- `file_system` (boolean): System file status -- `file_readonly` (boolean): Read-only status -- `file_executable` (boolean): Executable status -- `file_directory` (boolean): Directory status -- `file_symlink` (boolean): Symbolic link status -- `file_socket` (boolean): Socket status -- `file_pipe` (boolean): Named pipe status -- `file_device` (boolean): Device file status -- `file_special` (boolean): Special file status -- `file_system_events` (json): Real-time file system events (via `notify` crate) -- `file_watcher_events` (json): File watcher events (via `notify` crate) -- `directory_changes` (json): Directory change notifications (via `notify` crate) -- `file_creation_events` (json): File creation events (via `notify` crate) -- `file_modification_events` (json): File modification events (via `notify` crate) -- `file_deletion_events` (json): File deletion events (via `notify` crate) -- `file_rename_events` (json): File rename events (via `notify` crate) -- `file_access_events` (json): File access events (via `notify` crate) -- `file_permission_events` (json): File permission change events (via `notify` crate) -- `file_ownership_events` (json): File ownership change events (via `notify` crate) -- `file_size_events` (json): File size change events (via `notify` crate) -- `file_content_events` (json): File content change events (via `notify` crate) -- `file_metadata_events` (json): File metadata change events (via `notify` crate) -- `file_symlink_events` (json): Symbolic link 
events (via `notify` crate) -- `file_hardlink_events` (json): Hard link events (via `notify` crate) -- `file_special_events` (json): Special file events (via `notify` crate) -- `file_device_events` (json): Device file events (via `notify` crate) -- `file_socket_events` (json): Socket file events (via `notify` crate) -- `file_pipe_events` (json): Named pipe events (via `notify` crate) -- `file_fifo_events` (json): FIFO events (via `notify` crate) -- `file_block_events` (json): Block device events (via `notify` crate) -- `file_character_events` (json): Character device events (via `notify` crate) -- `file_directory_events` (json): Directory events (via `notify` crate) -- `file_regular_events` (json): Regular file events (via `notify` crate) - -**Platform-Specific Extensions:** - -Linux extensions (provided by `sysinfo` and `procfs` crates): - -- `extended_attributes` (json): Extended attributes (xattr) -- `acl_entries` (json): Access Control List entries -- `capabilities` (json): Linux capabilities -- `inode` (integer): Inode number -- `device` (integer): Device ID -- `hard_links` (integer): Number of hard links -- `symlink_target` (string): Symbolic link target -- `mount_info` (json): Mount information from /proc/mounts -- `mount_stats` (json): Mount statistics from /proc/mountstats -- `disk_stats` (json): Disk statistics from /proc/diskstats -- `file_locks` (json): File locks from /proc/locks -- `dentry_state` (json): Dentry state from /proc/sys/fs/dentry-state -- `inode_state` (json): Inode state from /proc/sys/fs/inode-state -- `file_nr` (json): File descriptor usage from /proc/sys/fs/file-nr -- `file_max` (integer): Maximum file descriptors from /proc/sys/fs/file-max -- `inode_max` (integer): Maximum inodes from /proc/sys/fs/inode-max -- `inode_nr` (json): Current inode usage from /proc/sys/fs/inode-nr -- `super_max` (integer): Maximum superblocks from /proc/sys/fs/super-max -- `super_nr` (integer): Current superblocks from /proc/sys/fs/super-nr -- `dquot_max` 
(integer): Maximum disk quotas from /proc/sys/fs/dquot-max -- `dquot_nr` (json): Current disk quotas from /proc/sys/fs/dquot-nr -- `lease_break_time` (integer): Lease break time from /proc/sys/fs/lease-break-time -- `leases_enable` (boolean): Leases enabled from /proc/sys/fs/leases-enable -- `dir_notify_enable` (boolean): Directory notifications from /proc/sys/fs/dir-notify-enable -- `inotify_max_user_watches` (integer): Inotify max user watches -- `inotify_max_user_instances` (integer): Inotify max user instances -- `inotify_max_queued_events` (integer): Inotify max queued events -- `pipe_max_size` (integer): Maximum pipe size from /proc/sys/fs/pipe-max-size -- `pipe_user_pages_hard` (integer): Pipe user pages hard limit -- `pipe_user_pages_soft` (integer): Pipe user pages soft limit -- `protected_hardlinks` (boolean): Protected hardlinks from /proc/sys/fs/protected_hardlinks -- `protected_symlinks` (boolean): Protected symlinks from /proc/sys/fs/protected_symlinks -- `suid_dumpable` (integer): SUID dumpable from /proc/sys/fs/suid_dumpable -- `overflowgid` (integer): Overflow GID from /proc/sys/fs/overflowgid -- `overflowuid` (integer): Overflow UID from /proc/sys/fs/overflowuid -- `nr_open` (integer): Maximum open files from /proc/sys/fs/nr_open -- `mount_max` (integer): Maximum mounts from /proc/sys/fs/mount-max -- `binfmt_misc` (json): Binary format misc from /proc/sys/fs/binfmt_misc -- `epoll` (json): Epoll configuration from /proc/sys/fs/epoll - -macOS extensions (provided by `sysinfo`, `notify`, and platform-specific crates): - -- `spotlight_metadata` (json): Spotlight metadata (via `notify` crate with FSEvents API) -- `bundle_info` (json): Application bundle information (via `plist` crate for parsing .plist files) -- `quarantine_flags` (json): Quarantine flags (via `xattr` crate for extended attributes) -- `extended_attributes` (json): macOS extended attributes (via `xattr` crate) -- `time_machine_backup` (boolean): Time Machine backup status (via `notify` 
crate) - -Windows extensions (provided by `sysinfo`, `notify`, and platform-specific crates): - -- `file_attributes` (json): Windows file attributes (via `notify` crate with ReadDirectoryChangesW API) -- `alternate_data_streams` (json): NTFS alternate data streams (via `notify` crate) -- `security_descriptor` (json): Windows security descriptor (via `notify` crate) -- `volume_serial` (string): Volume serial number (via `notify` crate) -- `file_index` (integer): NTFS file index (via `notify` crate) -- `reparse_point` (boolean): Reparse point status (via `notify` crate) -- `file_archive_events` (json): Archive file events (via `notify` crate) -- `file_compressed_events` (json): Compressed file events (via `notify` crate) -- `file_encrypted_events` (json): Encrypted file events (via `notify` crate) -- `file_hidden_events` (json): Hidden file events (via `notify` crate) -- `file_system_events` (json): System file events (via `notify` crate) -- `file_readonly_events` (json): Read-only file events (via `notify` crate) -- `file_temporary_events` (json): Temporary file events (via `notify` crate) -- `file_offline_events` (json): Offline file events (via `notify` crate) -- `file_sparse_events` (json): Sparse file events (via `notify` crate) -- `file_reparse_events` (json): Reparse point events (via `notify` crate) -- `file_integrity_events` (json): Integrity file events (via `notify` crate) -- `file_virtual_events` (json): Virtual file events (via `notify` crate) -- `file_compression_events` (json): Compression events (via `notify` crate) -- `file_encryption_events` (json): Encryption events (via `notify` crate) -- `file_backup_events` (json): Backup events (via `notify` crate) -- `file_index_events` (json): Index events (via `notify` crate) -- `file_content_indexed_events` (json): Content indexed events (via `notify` crate) -- `file_not_content_indexed_events` (json): Not content indexed events (via `notify` crate) -- `file_recall_on_data_access_events` (json): Recall on 
data access events (via `notify` crate) -- `file_recall_on_open_events` (json): Recall on open events (via `notify` crate) -- `file_pin_events` (json): Pin events (via `notify` crate) -- `file_unpin_events` (json): Unpin events (via `notify` crate) - -##### Performance Collector (perfmond) - -Status: Planned - -**Virtual Table: `process_performance`** - -Core fields available across all platforms (provided by `sysinfo` crate): - -- `timestamp` (timestamp): Performance measurement time -- `pid` (integer): Process ID (effective PK) -- `process_name` (string): Process name -- `cpu_usage_percent` (float): Process CPU usage percentage -- `memory_rss` (integer): Resident Set Size in bytes -- `memory_vms` (integer): Virtual Memory Size in bytes -- `memory_usage_percent` (float): Memory usage percentage relative to system -- `thread_count` (integer): Number of threads -- `context_switches` (integer): Context switches -- `disk_read_bytes` (integer): Disk read bytes -- `disk_write_bytes` (integer): Disk write bytes -- `network_read_bytes` (integer): Network read bytes -- `network_write_bytes` (integer): Network write bytes -- `priority` (integer): Process priority -- `nice_value` (integer): Process nice value -- `cpu_time_user` (float): User CPU time in seconds -- `cpu_time_system` (float): System CPU time in seconds -- `cpu_time_total` (float): Total CPU time in seconds -- `memory_peak` (integer): Peak memory usage in bytes -- `memory_shared` (integer): Shared memory usage in bytes -- `memory_private` (integer): Private memory usage in bytes -- `io_read_ops` (integer): I/O read operations count -- `io_write_ops` (integer): I/O write operations count -- `io_read_bytes` (integer): I/O read bytes total -- `io_write_bytes` (integer): I/O write bytes total -- `io_wait_time` (float): I/O wait time in seconds -- `cpu_wait_time` (float): CPU wait time in seconds -- `cpu_idle_time` (float): CPU idle time in seconds -- `cpu_steal_time` (float): CPU steal time in seconds -- 
`cpu_guest_time` (float): CPU guest time in seconds -- `cpu_guest_nice_time` (float): CPU guest nice time in seconds -- `handle_count` (integer): Handle count (Windows/macOS) -- `page_faults` (integer): Page faults -- `working_set` (integer): Working set size (Windows/macOS) -- `peak_working_set` (integer): Peak working set size (Windows/macOS) -- `gpu_usage` (float): Process GPU usage percentage (macOS/Windows) -- `gpu_memory_used` (integer): Process GPU memory used in bytes (macOS/Windows) -- `energy_impact` (float): Energy impact score (macOS) -- `thermal_state` (string): Thermal state (macOS) -- `cpu_time` (integer): Total CPU time in milliseconds -- `user_time` (integer): User CPU time in milliseconds -- `system_time` (integer): System CPU time in milliseconds -- `memory_usage_percent` (float): Memory usage percentage -- `disk_usage_percent` (float): Disk usage percentage -- `network_usage_percent` (float): Network usage percentage -- `io_wait` (float): I/O wait percentage -- `steal_time` (float): Steal time percentage -- `disk_read_speed` (integer): Disk read speed in bytes/second -- `disk_write_speed` (integer): Disk write speed in bytes/second -- `network_receive_speed` (integer): Network receive speed in bytes/second -- `network_transmit_speed` (integer): Network transmit speed in bytes/second - -**Platform-Specific Extensions:** - -Linux extensions (provided by `sysinfo`, `procfs`, and platform-specific crates): - -- `cgroup_id` (string): Control group ID (via `procfs` crate) -- `container_id` (string): Container identifier (via `procfs` crate) -- `perf_events` (json): Hardware performance counters (via `perf_event_open` syscall) -- `io_wait` (float): Process I/O wait percentage (via `/proc/[pid]/stat`) -- `steal_time` (float): Process steal time percentage (via `/proc/[pid]/stat`) -- `handle_count` (integer): File descriptor count (via `/proc/[pid]/fd`) -- `page_faults` (integer): Page faults (via `/proc/[pid]/stat`) -- `working_set` (integer): Working 
set size (via `/proc/[pid]/status`) -- `peak_working_set` (integer): Peak working set size (via `/proc/[pid]/status`) -- `cpu_temperature` (float): Process CPU temperature (via `sysfs` thermal zones) -- `power_usage` (float): Process power usage in watts (via `sysfs` power management) -- `energy_impact` (float): Process energy impact score (via `sysfs` energy management) -- `thermal_state` (string): Process thermal state (via `sysfs` thermal management) - -macOS extensions (provided by `sysinfo`, `core-graphics`, and platform-specific crates): - -- `handle_count` (integer): Process handle count (via `proc_info` syscall) -- `page_faults` (integer): Process page faults (via `proc_info` syscall) -- `working_set` (integer): Process working set size (via `proc_info` syscall) -- `peak_working_set` (integer): Process peak working set size (via `proc_info` syscall) -- `gpu_usage` (float): Process GPU usage percentage (via `core-graphics` or `Metal` API) -- `gpu_memory_used` (integer): Process GPU memory used (via `core-graphics` or `Metal` API) -- `metal_performance` (json): Process Metal performance data (via `Metal` API) -- `battery_level` (float): Battery level percentage (via `IOKit`) -- `battery_status` (string): Battery status (via `IOKit`) - -Windows extensions (provided by `sysinfo`, `windows`, and platform-specific crates): - -- `handle_count` (integer): Process handle count (via `windows` crate) -- `page_faults` (integer): Process page faults (via `windows` crate) -- `working_set` (integer): Process working set size (via `windows` crate) -- `peak_working_set` (integer): Process peak working set size (via `windows` crate) -- `gpu_usage` (float): Process GPU usage percentage (via `windows` crate with WMI) -- `gpu_memory_used` (integer): Process GPU memory used (via `windows` crate with WMI) -- `battery_level` (float): Battery level percentage (via `windows` crate) -- `battery_status` (string): Battery status (via `windows` crate) - -#### Trigger Collectors 
(Event-Driven Enrichment) - -##### Binary Analysis Collector (binmond) - -The binary analysis collector will be responsible for examining the executable file associated with the process and, in some cases, the memory image of the process itself. It will collect general header fields, hash values, code signing information, and imports and exports, and it can accept YARA-like tasks to gather highly targeted data. - -Status: Planned - -**Virtual Table: `binary_analysis`** - -Core fields available across all platforms (provided by `goblin` crate): - -- `pid` (integer): Process ID (effective PK) -- `file_path` (string): Path to the analyzed binary -- `file_name` (string): Binary file name -- `file_size` (integer): File size in bytes -- `file_format` (string): Binary format (PE, ELF, Mach-O, etc.) -- `architecture` (string): Target architecture (x86, x64, ARM, etc.) -- `platform` (string): Target platform (Windows, Linux, macOS) -- `entry_point` (string): Entry point address -- `base_address` (string): Base address of binary -- `image_size` (integer): Size of loaded image -- `subsystem` (string): Subsystem type (console, gui, etc.) 
-- `machine_type` (string): Machine type identifier -- `characteristics` (json): Binary characteristics flags -- `sections` (json): Section headers and information -- `imports_count` (integer): Number of imported functions -- `exports_count` (integer): Number of exported functions -- `libraries_count` (integer): Number of linked libraries -- `compiler` (string): Compiler used to build the binary (via debug symbols) -- `compiler_version` (string): Compiler version (via debug symbols) -- `build_tool` (string): Build tool identification (via debug symbols) -- `build_date` (timestamp): Build date and time (via debug symbols) -- `debug_info` (json): Debug information sections -- `source_files` (json): Source file paths (via debug symbols) -- `panic_messages` (json): Embedded panic messages (Rust) -- `file_paths` (json): Embedded file paths (Rust/Go) -- `build_info` (json): Build information (Go binaries) -- `go_version` (string): Go runtime version (Go binaries) -- `go_modules` (json): Go module information (Go binaries) -- `rust_panic_info` (json): Rust panic metadata (Rust binaries) -- `packed` (boolean): Whether binary is packed/compressed -- `packer_type` (string): Type of packer used (UPX, PECompact, etc.) 
-- `upx_packed` (boolean): Whether binary is UPX-packed -- `upx_version` (string): UPX version used (if UPX-packed) -- `section_entropy` (json): Entropy values for each section -- `suspicious_sections` (json): Sections with high entropy or unusual names -- `entropy` (float): File entropy score -- `created_time` (timestamp): File creation time -- `modified_time` (timestamp): File modification time -- `sha256_hash` (string): SHA-256 hash of file contents -- `md5_hash` (string): MD5 hash of file contents -- `analysis_time` (timestamp): When analysis was performed - -**Virtual Table: `code_signing`** - -Core fields available across all platforms (provided by `goblin`, `x509-parser`, and platform-specific crates): - -- `pid` (integer): Process ID (effective PK) -- `file_path` (string): Path to the signed binary -- `signed` (boolean): Whether the binary is signed -- `signature_valid` (boolean): Whether the signature is valid -- `signature_type` (string): Signature type (authenticode, adhoc, etc.) 
-- `certificate_chain` (json): Certificate chain information (via `x509-parser` crate) -- `signing_authority` (string): Certificate authority -- `publisher` (string): Code publisher -- `subject_name` (string): Certificate subject name -- `issuer_name` (string): Certificate issuer name -- `serial_number` (string): Certificate serial number -- `timestamp` (timestamp): Code signing timestamp -- `timestamp_valid` (boolean): Whether timestamp is valid -- `revocation_status` (string): Certificate revocation status -- `trust_level` (string): Trust level assessment -- `signature_algorithm` (string): Signature algorithm used -- `hash_algorithm` (string): Hash algorithm used -- `certificate_valid_from` (timestamp): Certificate valid from date -- `certificate_valid_to` (timestamp): Certificate valid to date - -**Virtual Table: `imports_exports`** - -Core fields available across all platforms (provided by `goblin` crate): - -- `pid` (integer): Process ID (effective PK) -- `file_path` (string): Path to the binary -- `import_type` (string): Import or export -- `library_name` (string): Library name -- `function_name` (string): Function name -- `ordinal` (integer): Function ordinal -- `address` (string): Function address -- `rva` (string): Relative virtual address -- `api_category` (string): API category classification -- `suspicious` (boolean): Suspicious API usage flag -- `forwarded` (boolean): Whether function is forwarded -- `delayed_import` (boolean): Whether import is delayed -- `bound_import` (boolean): Whether import is bound -- `api_set` (string): API set name (Windows) -- `module_name` (string): Module name containing function -- `function_type` (string): Function type (stdcall, cdecl, etc.) 
-- `calling_convention` (string): Calling convention used -- `parameter_count` (integer): Number of parameters -- `return_type` (string): Return type of function - -**Virtual Table: `yara_matches`** - -Core fields available across all platforms (provided by `yara` crate): - -- `pid` (integer): Process ID (effective PK) -- `file_path` (string): Path to the scanned file -- `rule_name` (string): YARA rule name -- `rule_namespace` (string): YARA rule namespace -- `rule_category` (string): Rule category (malware, packer, etc.) -- `rule_severity` (string): Rule severity level -- `match_count` (integer): Number of matches -- `match_strings` (json): Matched strings -- `rule_metadata` (json): Rule metadata -- `match_offset` (integer): Offset of match in file -- `match_length` (integer): Length of matched content -- `rule_tags` (json): Rule tags -- `rule_author` (string): Rule author -- `rule_version` (string): Rule version -- `rule_description` (string): Rule description -- `scan_time` (timestamp): When scan was performed - -**Virtual Table: `debug_info`** - -Core fields available across all platforms (provided by `goblin` crate and debug symbol parsing): - -- `pid` (integer): Process ID (effective PK) -- `file_path` (string): Path to the binary -- `debug_format` (string): Debug format (DWARF, PDB, etc.) -- `compiler` (string): Compiler used (gcc, clang, msvc, rustc, etc.) -- `compiler_version` (string): Compiler version string -- `build_tool` (string): Build tool (make, cmake, cargo, etc.) 
-- `build_date` (timestamp): Build date and time -- `build_host` (string): Build host information -- `build_user` (string): Build user information -- `source_files` (json): Source file paths and line numbers -- `function_names` (json): Function names and addresses -- `variable_names` (json): Variable names and types -- `line_numbers` (json): Line number information -- `optimization_level` (string): Compiler optimization level -- `debug_level` (string): Debug information level -- `target_triple` (string): Target architecture triple -- `linker` (string): Linker used -- `linker_version` (string): Linker version -- `analysis_time` (timestamp): When analysis was performed - -**Virtual Table: `language_metadata`** - -Core fields available across all platforms (provided by `goblin` crate and language-specific parsing): - -- `pid` (integer): Process ID (effective PK) -- `file_path` (string): Path to the binary -- `language` (string): Programming language (Rust, Go, C, C++, etc.) -- `language_version` (string): Language version used -- `runtime_version` (string): Runtime version (Go, .NET, etc.) -- `framework` (string): Framework used (Rust std, Go std, etc.) 
-- `panic_messages` (json): Embedded panic messages (Rust) -- `file_paths` (json): Embedded file paths (Rust/Go) -- `build_info` (json): Build information (Go binaries) -- `go_version` (string): Go runtime version (Go binaries) -- `go_modules` (json): Go module information (Go binaries) -- `rust_panic_info` (json): Rust panic metadata (Rust binaries) -- `rust_std_version` (string): Rust standard library version -- `cargo_version` (string): Cargo version used (Rust) -- `go_build_tags` (json): Go build tags used -- `go_ldflags` (json): Go linker flags used -- `rust_features` (json): Rust features enabled -- `rust_target` (string): Rust target triple -- `analysis_time` (timestamp): When analysis was performed - -**Detection Methods:** - -- **Go Metadata**: Parse `.note.go.buildid` section for build info, extract version strings from `.rodata` section, analyze Go runtime symbols and string tables using `goblin` crate -- **Rust Metadata**: Scan binary strings for panic messages with file paths, extract Cargo metadata from debug sections, identify Rust std library symbols and panic handlers using `goblin` crate -- **Language Detection**: Analyze import table for language-specific APIs (Go runtime, Rust std), scan string sections for language signatures, examine symbol tables for framework-specific functions using `goblin` crate -- **Metadata Extraction**: Parse section headers for embedded build info, extract version strings from string tables, analyze symbol tables for runtime metadata using `goblin` crate - -**Virtual Table: `packer_analysis`** - -Core fields available across all platforms (provided by `goblin` crate and entropy analysis): - -- `pid` (integer): Process ID (effective PK) -- `file_path` (string): Path to the binary -- `packed` (boolean): Whether binary is packed/compressed -- `packer_type` (string): Type of packer used (UPX, PECompact, ASPack, etc.) 
-- `packer_version` (string): Version of the packer used -- `upx_packed` (boolean): Whether binary is UPX-packed -- `upx_version` (string): UPX version used (if UPX-packed) -- `upx_compression_ratio` (float): Compression ratio achieved by UPX -- `section_entropy` (json): Entropy values for each section -- `suspicious_sections` (json): Sections with high entropy or unusual names -- `upx_sections` (json): UPX-specific sections (UPX0, UPX1, etc.) -- `import_table_size` (integer): Size of import table (often minimal in packed binaries) -- `entry_point_entropy` (float): Entropy of entry point region -- `overlay_present` (boolean): Whether binary has overlay data -- `overlay_size` (integer): Size of overlay data -- `stub_size` (integer): Size of unpacking stub -- `compression_method` (string): Compression method used -- `compression_level` (integer): Compression level used -- `analysis_time` (timestamp): When analysis was performed - -**Detection Methods:** - -- **UPX Detection**: Analyze section names (`UPX0`, `UPX1`), section entropy, import table size, and UPX signatures in headers -- **Generic Packer Detection**: High section entropy (>7.0), minimal import table, unusual section names, entry point entropy analysis -- **Entropy Analysis**: Calculate Shannon entropy for each section to identify compressed/encrypted content -- **Section Analysis**: Inspect section headers for packer-specific characteristics using `goblin` crate - -##### Memory Analysis Collector (memmond) - -Status: Planned - -**Virtual Table: `memory_regions`** - -Core fields available across all platforms (provided by `sysinfo` crate and platform-specific extensions): - -- `pid` (integer): Process ID -- `process_name` (string): Process name -- `region_address` (string): Memory region start address -- `region_size` (integer): Memory region size in bytes -- `region_type` (string): Region type (heap, stack, code, data, mapped) -- `permissions` (string): Memory permissions (read, write, execute) -- 
`protection` (string): Memory protection flags -- `mapped_file` (string): Mapped file path (if applicable) -- `allocation_time` (timestamp): When region was allocated -- `access_count` (integer): Number of accesses to region -- `dirty_pages` (integer): Number of dirty pages -- `shared` (boolean): Whether region is shared -- `commit_charge` (integer): Committed memory in region -- `working_set` (integer): Working set size for region -- `analysis_time` (timestamp): When analysis was performed - -**Virtual Table: `memory_artifacts`** - -Core fields available across all platforms (provided by `regex` crate and memory scanning utilities): - -- `pid` (integer): Process ID -- `process_name` (string): Process name -- `artifact_type` (string): Artifact type (credential, key, connection, file, malware_signature, etc.) -- `artifact_value` (string): Artifact value -- `artifact_category` (string): Artifact category (suspicious, malicious, credential, network, file) -- `memory_address` (string): Memory address where artifact was found -- `confidence` (float): Detection confidence score -- `context` (json): Additional context information -- `pattern_used` (string): Regex pattern used for detection -- `match_length` (integer): Length of matched artifact -- `encoding` (string): Text encoding detected (UTF-8, ASCII, etc.) -- `extraction_time` (timestamp): When artifact was extracted - -**Virtual Table: `memory_patterns`** - -> **⚠️ LEGAL/COMPLIANCE WARNING**: Memory analysis involves accessing potentially sensitive data including PII, credentials, and proprietary information. Ensure compliance with CFAA, GDPR/CCPA, PCI-DSS, and obtain proper authorization/consent before deployment. Implement redaction/sanitization rules for extracted artifacts and maintain audit logs for all access. 
- -Core fields available across all platforms (provided by `regex` crate and pattern matching utilities): - -- `pid` (integer): Process ID -- `process_name` (string): Process name -- `pattern_type` (string): Pattern type (crypto_key, url, email, ip, malware_signature, shellcode, etc.) -- `pattern_value` (string): Pattern value -- `pattern_regex` (string): Regular expression used -- `match_count` (integer): Number of matches found -- `memory_addresses` (json): Memory addresses where pattern was found -- `confidence` (float): Pattern matching confidence -- `entropy_score` (float): Entropy of matched content -- `false_positive_rate` (float): Estimated false positive rate -- `rop_chain_detected` (boolean): Summary flag for ROP chain detection (detailed analysis in exploit_detection table) -- `buffer_overflow_detected` (boolean): Summary flag for buffer overflow detection (detailed analysis in exploit_detection table) -- `stack_corruption_detected` (boolean): Summary flag for stack corruption detection (detailed analysis in exploit_detection table) -- `scan_time` (timestamp): When pattern scan was performed - -**Privacy & Compliance Controls:** - -- **Redaction Rules**: Mask PII, credentials, keys using regex patterns; replace with hashed tokens -- **Retention Policy**: Memory artifacts retained for 7 days with secure deletion; audit logs for 90 days -- **Access Controls**: Memory analysis requires elevated permissions and multi-factor authentication -- **Scope Restrictions**: Default scanning excludes heap/user-data regions; limit to code/stack/mapped regions -- **Operational Checklist**: Obtain explicit consent, document legal basis, implement audit logging, establish data handling procedures - -**Table Boundaries:** - -- `memory_patterns`: Generic pattern-based detections (crypto keys, URLs, emails, IPs) -- `exploit_detection`: Deep exploit-specific analysis (ROP chains, buffer overflows, shellcode) - -**Virtual Table: `heap_analysis`** - -Core fields available 
across all platforms (provided by `sysinfo` crate and platform-specific memory analysis): - -- `pid` (integer): Process ID -- `process_name` (string): Process name -- `heap_address` (string): Heap base address -- `heap_size` (integer): Heap size in bytes -- `allocation_count` (integer): Number of allocations -- `free_count` (integer): Number of frees -- `memory_leak_count` (integer): Number of potential memory leaks detected -- `fragmentation` (float): Heap fragmentation percentage; fragmentation is allocator- and workload-dependent and not by itself indicative of malicious behavior -- `largest_allocation` (integer): Size of largest allocation -- `average_allocation` (float): Average allocation size -- `peak_memory_usage` (integer): Peak memory usage -- `current_memory_usage` (integer): Current memory usage -- `memory_growth_rate` (float): Memory growth rate over time -- `analysis_time` (timestamp): When heap analysis was performed - -**Virtual Table: `exploit_detection`** - -Core fields available across all platforms (provided by `regex` crate, `sysinfo` crate, and custom exploit analysis logic): - -- `pid` (integer): Process ID -- `process_name` (string): Process name -- `exploit_type` (string): Type of exploit detected (ROP, buffer_overflow, stack_overflow, heap_overflow, format_string, etc.) 
-- `exploit_confidence` (float): Confidence score for exploit detection -- `rop_chain_length` (integer): Length of detected ROP chain -- `rop_gadgets` (json): ROP gadgets found in the chain -- `buffer_size` (integer): Size of overflowed buffer -- `overflow_offset` (integer): Offset where overflow occurred -- `stack_pointer` (string): Stack pointer value at time of detection -- `return_address` (string): Return address value -- `canary_detected` (boolean): Whether stack canary was present -- `canary_value` (string): Stack canary value -- `aslr_enabled` (boolean): Whether ASLR was enabled -- `dep_enabled` (boolean): Whether DEP/NX was enabled -- `exploit_technique` (string): Specific technique used (ret2libc, ret2syscall, etc.) -- `shellcode_detected` (boolean): Whether shellcode was detected -- `shellcode_size` (integer): Size of detected shellcode -- `analysis_time` (timestamp): When exploit analysis was performed - -**Detection Methods:** - -- **Memory Region Analysis**: Use `sysinfo` crate for process memory information, `procfs` crate for parsing `/proc/[pid]/maps` (Linux), `mach2` crate for macOS memory regions, and `windows` crate for `VirtualQueryEx` (Windows) region details -- **Pattern Matching**: Use `regex` crate for pattern matching in memory dumps, scan for URLs, emails, IPs, crypto keys, and other artifacts using compiled regex patterns -- **Artifact Extraction**: Scan memory regions for strings, extract text using encoding detection, identify structured data patterns using entropy analysis -- **Heap Analysis**: Use `sysinfo` crate for process memory statistics, analyze allocation patterns, detect memory leaks through allocation/free tracking, calculate fragmentation metrics -- **Security Analysis**: Instrument memory activity via first-party telemetry built on `sysinfo`, `procfs`, `mach2`, and the `windows` crate (ToolHelp + ETW providers). 
Develop and ship an in-house `heap_sentinel` module for sustained allocation/leak detection so that no runtime profile depends on experimental crates; keep optional debug profiling behind cargo features. -- **Threat Detection**: Combine maintained tooling—`capstone` for disassembly, `goblin` for PE/ELF parsing, and OS-backed memory snapshots—with custom heuristics that score injection indicators. Document fallback logic and regression tests for each heuristic so the heuristics can evolve without reintroducing unstable dependencies. -- **Exploit Detection**: Use `regex` crate for pattern matching to detect ROP gadgets, shellcode signatures, and exploit patterns in memory dumps -- **ROP Analysis**: Scan memory for ROP gadget patterns using regex, analyze return address chains for suspicious patterns, detect gadget sequences -- **Buffer Overflow Detection**: Use `sysinfo` and platform-specific crates to analyze stack frames, detect canary violations, identify return address corruption patterns -- **Shellcode Detection**: Use `regex` crate to scan for shellcode signatures, detect encoded payloads, identify suspicious code patterns - -Mitigation note: any auxiliary instrumentation that requires vendor or experimental APIs will ship behind an `experimental_telemetry` feature flag. The production build matrix forbids enabling that flag, and security review tickets track removal timelines whenever we prototype with non-hardened crates. - -##### Network Analysis Collector (netanalymond) - -Status: Planned - -**Virtual Table: `network_traffic`** - -Core fields available across all platforms (provided by `sysinfo` crate, `netstat` crate, and `huginn-net` crate): - -- `connection_id` (string): Unique connection identifier -- `protocol` (string): Protocol (tcp, udp, http, https, dns, etc.) 
-- `source_ip` (string): Source IP address -- `source_port` (integer): Source port number -- `destination_ip` (string): Destination IP address -- `destination_port` (integer): Destination port number -- `packet_count` (integer): Number of packets -- `bytes_transferred` (integer): Total bytes transferred -- `start_time` (timestamp): Connection start time -- `end_time` (timestamp): Connection end time -- `duration` (integer): Connection duration in seconds -- `direction` (string): Traffic direction (inbound, outbound) -- `pid` (integer): Process ID associated with connection -- `process_name` (string): Process name associated with connection -- `connection_state` (string): Connection state (established, listening, etc.) -- `bytes_sent` (integer): Bytes sent by process -- `bytes_received` (integer): Bytes received by process -- `tcp_ttl` (integer): IP TTL value observed on the connection's TCP packets (via huginn-net) -- `tcp_window_size` (integer): TCP window size (via huginn-net) -- `tcp_mtu` (integer): Link MTU inferred from the observed TCP traffic (via huginn-net) -- `analysis_time` (timestamp): When analysis was performed - -**Virtual Table: `protocol_analysis`** - -Core fields available across all platforms (provided by `huginn-net` crate for protocol fingerprinting and analysis): - -- `connection_id` (string): Connection identifier -- `protocol` (string): Protocol name -- `version` (string): Protocol version -- `method` (string): HTTP method (for HTTP/HTTPS) -- `url` (string): Requested URL (for HTTP/HTTPS) -- `status_code` (integer): HTTP status code -- `user_agent` (string): User agent string -- `content_type` (string): Content type -- `content_length` (integer): Content length -- `server` (string): Server header -- `dns_query` (string): DNS query name -- `dns_response` (string): DNS response -- `ssl_version` (string): SSL/TLS version -- `cipher_suite` (string): Cipher suite used -- `certificate_issuer` (string): Certificate issuer -- `http_headers` (json): HTTP headers as key-value pairs -- `dns_record_type` (string): DNS record 
type (A, AAAA, CNAME, etc.) -- `dns_response_code` (integer): DNS response code -- `pid` (integer): Process ID associated with connection -- `process_name` (string): Process name associated with connection -- `tls_handshake_complete` (boolean): Whether TLS handshake completed -- `tls_certificate_valid` (boolean): Whether TLS certificate is valid -- `ja4_fingerprint` (string): JA4 TLS client fingerprint (via huginn-net) -- `http_user_agent` (string): HTTP User-Agent string (via huginn-net) -- `http_accept_language` (string): HTTP Accept-Language header (via huginn-net) -- `tcp_signature` (string): TCP signature fingerprint (via huginn-net) -- `analysis_time` (timestamp): When analysis was performed - -**Privacy & Compliance Controls:** - -**PII and Sensitive Data Handling:** - -- **URL/DNS PII Redaction**: Strip or canonicalize query parameters, remove path segments resembling usernames/IDs (e.g., `/user/12345/profile` → `/user/[REDACTED]/profile`) -- **Token/Credential Detection**: Mask Authorization, Cookie, and API key values in `http_headers` using regex patterns (e.g., `(?i)(authorization|cookie|x-api-key):\s*[^\r\n]+` → `$1: [MASKED]`; the pattern consumes the entire header value so that scheme-prefixed credentials such as `Bearer <token>` and multi-cookie values are fully masked) -- **Data Retention Policy**: Protocol analysis records are retained for 30 days with automated purge; audit logs are retained for 1 year -- **Access Controls**: Protocol analysis requires elevated permissions and audit logging for all access -- **Legal/Operational Notes**: HTTP/HTTPS inspection requires TLS interception capabilities, explicit consent/notice, and jurisdiction-specific compliance (check local laws for network monitoring requirements) - -**Affected Fields**: `url`, `dns_query`, `http_headers`, `user_agent`, `http_user_agent`, `certificate_issuer` - -**Virtual Table: `traffic_patterns`** - -Core fields available across all platforms (provided by `sysinfo` crate for basic network statistics and `pnet` crate for detailed traffic analysis): - -- `pattern_id` (string): Unique pattern identifier -- `interface_name` (string): Network interface name (via 
sysinfo) -- `packets_received` (integer): Packets received since last refresh (via sysinfo) -- `packets_transmitted` (integer): Packets transmitted since last refresh (via sysinfo) -- `total_packets_received` (integer): Total packets received (via sysinfo) -- `total_packets_transmitted` (integer): Total packets transmitted (via sysinfo) -- `errors_received` (integer): Errors on received packets (via sysinfo) -- `errors_transmitted` (integer): Errors on transmitted packets (via sysinfo) -- `mac_address` (string): MAC address of interface (via sysinfo) -- `source_ip` (string): Source IP address (via pnet packet analysis) -- `destination_ip` (string): Destination IP address (via pnet packet analysis) -- `source_port` (integer): Source port (via pnet packet analysis) -- `destination_port` (integer): Destination port (via pnet packet analysis) -- `protocol` (string): Protocol (tcp, udp, icmp, etc.) (via pnet packet analysis) -- `packet_size` (integer): Packet size in bytes (via pnet packet analysis) -- `timestamp` (timestamp): Packet timestamp (via pnet packet analysis) -- `pid` (integer): Process ID associated with connection (via sysinfo + netstat) -- `process_name` (string): Process name associated with connection (via sysinfo + netstat) -- `analysis_time` (timestamp): When analysis was performed - -**Detection Methods:** - -- **Network Interface Monitoring**: Use `sysinfo` crate to collect basic network interface statistics (packets received/transmitted, errors, MAC addresses) and track process network connections -- **Packet Analysis**: Use `pnet` crate for detailed packet inspection, protocol identification, and traffic analysis at the packet level -- **Process Association**: Use `netstat` crate to associate network connections with specific processes and track connection states -- **Protocol Fingerprinting**: Use `huginn-net` crate for multi-protocol passive fingerprinting, JA4 TLS client fingerprinting, TCP signature analysis, and HTTP fingerprinting - -##### 
Registry/Configuration Collector (regmond) - -Status: Planned - -**Virtual Table: `system_configuration`** - -Core fields available across all platforms (provided by `sysinfo` crate and configuration parsing crates): - -- `pid` (integer): Process ID associated with configuration (effective PK) -- `config_path` (string): Configuration file or registry path -- `config_type` (string): Configuration type (file, registry, plist, ini, toml, yaml) -- `setting_name` (string): Configuration setting name -- `setting_value` (string): Configuration setting value -- `setting_type` (string): Value type (string, integer, boolean, json) -- `category` (string): Configuration category (security, network, system, etc.) -- `modified_time` (timestamp): When setting was last modified -- `file_size` (integer): Configuration file size in bytes -- `file_permissions` (string): File permissions (Unix) or security descriptor (Windows) -- `relevance_reason` (string): Why this config is relevant to the process (process_uses, system_security, recent_change) -- `analysis_time` (timestamp): When analysis was performed - -**Virtual Table: `configuration_changes`** - -Core fields available across all platforms (provided by file system monitoring and configuration parsing): - -- `change_id` (string): Unique change identifier -- `pid` (integer): Process ID that triggered the change -- `config_path` (string): Configuration path -- `setting_name` (string): Setting name -- `old_value` (string): Previous value -- `new_value` (string): New value -- `change_type` (string): Change type (create, modify, delete) -- `change_time` (timestamp): When change occurred -- `file_event_type` (string): File system event type (via `notify` crate) -- `process_name` (string): Process name that made the change -- `user_id` (integer): User ID that made the change - -**Platform-Specific Extensions:** - -Windows extensions (provided by `winreg` crate and `windows` crate): - -- `registry_hive` (string): Registry hive 
(HKEY_LOCAL_MACHINE, HKEY_CURRENT_USER, etc.) -- `registry_key` (string): Registry key path -- `registry_value` (string): Registry value name -- `registry_type` (string): Registry value type (REG_SZ, REG_DWORD, etc.) -- `group_policy` (boolean): Whether setting is from Group Policy -- `security_descriptor` (json): Security descriptor information -- `registry_timestamp` (timestamp): Registry modification timestamp - -macOS extensions (provided by `plist` crate and `xattr` crate): - -- `plist_domain` (string): Property list domain (user, system, etc.) -- `launchd_service` (boolean): Whether setting is for launchd service -- `sandbox_entitlement` (boolean): Whether setting is sandbox entitlement -- `privacy_setting` (boolean): Whether setting is privacy-related -- `system_integrity` (boolean): Whether setting affects system integrity -- `extended_attributes` (json): Extended attributes (via `xattr` crate) -- `quarantine_flags` (json): Quarantine flags (via `xattr` crate) - -Linux extensions (provided by `procfs` crate and configuration parsing): - -- `config_file` (string): Configuration file path -- `systemd_unit` (string): systemd unit name -- `selinux_context` (string): SELinux security context -- `apparmor_profile` (string): AppArmor profile name -- `kernel_parameter` (boolean): Whether setting is kernel parameter -- `file_owner` (string): Configuration file owner -- `file_group` (string): Configuration file group -- `inode` (integer): File inode number - -**Detection Methods:** - -- **Process-Specific Config Discovery**: Use `sysinfo` crate to get process working directory, environment variables, and executable path to identify relevant configuration files (e.g., config files in working directory, environment-specified config paths) -- **Process-Specific System Configs**: Examine system configurations that directly affect the process (user permissions, resource limits, security contexts, firewall rules for process's network connections) using platform-specific 
crates -- **Recent Configuration Changes**: Use `notify` crate to identify configuration files that have been modified recently and associate them with the suspicious process -- **Configuration File Parsing**: Use `ini` crate for INI files, `toml` crate for TOML files, `serde_yaml` crate for YAML files, `plist` crate for macOS property lists, `winreg` crate for Windows registry access -- **Process Association**: Use `sysinfo` crate to associate configuration changes with specific processes and track modification sources