From c8f622626797eb4f6867571cfb48a4e2f04e0475 Mon Sep 17 00:00:00 2001 From: "David J. Bianco" Date: Thu, 14 May 2026 15:44:20 -0400 Subject: [PATCH] feat: add host activity realism profiles --- TODO.md | 17 +- commands/eforge/config.md | 1 + .../references/config-dependency-graph.md | 8 + .../eforge/references/config-host-activity.md | 56 +++- .../eforge/references/config-validation.md | 3 +- docs/reference/CUSTOMIZING_CONFIG.md | 1 + scenarios/COVERAGE-TEST-PROMPT.md | 18 +- scenarios/ITERATION-TEST-PROMPT.md | 18 +- scenarios/LARGE-SCALE-COVERAGE-TEST-PROMPT.md | 25 +- src/evidenceforge/cli/validate_config.py | 23 ++ src/evidenceforge/config/activity/README.md | 1 + .../activity/host_activity_profiles.yaml | 199 ++++++++++++ src/evidenceforge/config/schemas.py | 164 ++++++++++ src/evidenceforge/events/contexts.py | 2 + .../activity/host_activity_profiles.py | 281 +++++++++++++++++ .../generation/activity/suspicious_benign.py | 54 ++-- .../generation/emitters/cisco_asa.py | 6 +- .../generation/engine/baseline.py | 284 ++++++++++++++++-- tests/unit/test_baseline_canonical.py | 4 +- tests/unit/test_cisco_asa_emitter.py | 3 + tests/unit/test_host_activity_profiles.py | 141 +++++++++ tests/unit/test_validate_config.py | 28 ++ 22 files changed, 1252 insertions(+), 85 deletions(-) create mode 100644 src/evidenceforge/config/activity/host_activity_profiles.yaml create mode 100644 src/evidenceforge/generation/activity/host_activity_profiles.py create mode 100644 tests/unit/test_host_activity_profiles.py diff --git a/TODO.md b/TODO.md index 663a96e6..b97146cf 100644 --- a/TODO.md +++ b/TODO.md @@ -2,7 +2,7 @@ **Status:** Phase 8.5 (Dual src/dst HostContext) COMPLETE; Pre-MVP quality fixes ongoing **Started:** 2026-03-11 -**Last Updated:** 2026-04-29 +**Last Updated:** 2026-05-14 See [CHANGELOG.md](CHANGELOG.md) for detailed development history of completed phases. 
@@ -241,7 +241,7 @@ Replaced manual per-emitter field coordination with SecurityEvent intermediate r - [x] **P1** Web application response/session realism follow-up — Added data-driven inbound `web_server` visitor profiles so human visitors consume `traffic_rates.web` as top-level actions, then fan out into required page assets/API calls through `site_maps.yaml`; crawler, health-check, API-client, and opportunistic-probe traffic now uses source-native configured request/status/User-Agent profiles. Static resource sizes are stable per host/path, human navigation and render fanout timing use `timing_profiles.yaml`, and docs/skill references now explain the budget and config ownership. Verification passed: focused web/timing/baseline tests (`107 passed, 1 skipped`), config-related tests (`64 passed`), `uv run eforge validate-config`, repo-wide Ruff checks/format checks, full normal `uv run pytest -q` (`3012 passed, 15 skipped`), and `git diff --check`. - [x] **P1** Well-synced network sensor timing follow-up — Replaced hardcoded multi-sensor Zeek +/-400ms skew plus broad path delay with a validated `network_sensor_observation` timing profile. The default `well_synced` profile keeps stable per-sensor clock skew within +/-1.5ms and per-flow capture/path delay within 50-2000us while preserving canonical packet/byte truth unless source-native observation variance is explicitly enabled. Verification passed with focused Zeek/timing tests, `uv run eforge validate-config`, repo-wide Ruff checks/format checks, full normal `uv run pytest -q` (`3012 passed, 15 skipped`), and `git diff --check`. - [x] **P1** Source identity and endpoint baseline realism sprint — completed TLS/X.509 issuer-compatible chain signatures, Sysmon Event 7 native third-party module identity, config-driven Windows scheduled-process timing, and DHCP registry emission policy tied to lease activity. 
Verified with `uv run eforge validate-config`, focused regressions, Ruff, normal pytest, and slow-inclusive pytest. -- [ ] **P2** Endpoint/eCAR baseline variance follow-up — Loop 96 found workstation eCAR category volumes and Linux process lifecycle evidence too uniform and complete. The realistic endpoint observation-gap portion is now handled by named observation profiles; remaining work should focus on host/persona-specific volume variance, long-lived process state, and benign unmatched endpoint artifacts. +- [x] **P2** Endpoint/eCAR baseline variance follow-up — addressed through the host/activity profile realism layer. Host family, role, persona, and stable per-host multipliers now shape endpoint, process, registry, scheduled-task, syslog, bash, eCAR, Windows, Zeek, firewall, IDS, web, and proxy rates; config-driven encoded PowerShell variants and benign endpoint texture reduce repeated per-host artifacts. Verification passed with focused host-activity/config/ASA/baseline tests, `uv run eforge validate-config`, Ruff checks/format checks, full normal `uv run pytest -v`, and slow-inclusive `uv run pytest -v --include-slow --no-cov` (`3057 passed, 1 skipped`). - [x] **Later architectural sprint: imperfect observation and source coverage** — implemented a training-friendly `complete` default plus overlay-compatible named observation profiles that apply deterministic source-level drop/delay/coverage semantics without modeling contradictions. The policy covers endpoint, network, proxy/web, firewall, IDS, Windows, Sysmon, Zeek, syslog, bash history, and eCAR source families, while ground truth preserves canonical truth and records source evidence status. Verification passed: focused observation/config/ground-truth tests, `uv run eforge validate-config`, Ruff checks/format checks, full normal `uv run pytest -v` (`3036 passed, 15 skipped`), and slow-inclusive `uv run pytest -v --include-slow` (`3050 passed, 1 skipped`). 
- [x] Full slow-suite regression cleanup after loop-65 merge — explicit-proxy storyline beacons now preserve authored hostname+destination IP pairs only when the storyline marks that pair as intentional, normal proxy-origin DNS resolution remains intact, and the parallel-generation LogonID assertion treats Type 7 unlock reuse as valid slice-of-time Windows behavior. Verified with targeted proxy/parallel tests, `uv run ruff check .`, `uv run ruff format --check .`, and `uv run pytest -v --include-slow` (`2875 passed, 23 skipped`). Detection Engineer blind review completed for the regenerated Loop 61 dataset at `scenarios/iteration-test/data`; reviewer verdict: Synthetic, 63/100 confidence. Main findings: one PROXY-01 sshd accepted-login lifecycle gap/self-source artifact and Windows 4648 explicit-credential caller PID/image provenance ambiguity around `WS-MCHEN-01`. @@ -437,7 +437,7 @@ Data works but experienced analysts spot tells. Grouped by format for efficient - [x] Event 10 source/target pairs too narrow — fixed by widening `process_access_patterns.yaml` and seeded long-lived process actors. Verification audit output: 950 Event 10 records used 16 source/target pairs. - [x] Registry writer processes too narrow — fixed with key-family-aware writer selection. Verification audit output: Event 12/13 records used 12 writer process images and 1,968 unique TargetObject paths with 0 template artifacts. - [x] Event 7 residual attribution issues — tightened generic module/process matching and retained process-aware DLL materialization. Verification audit output: 380 Event 7 records used 42 unique ImageLoaded paths. -- [ ] Cross-source distribution realism layer — defer until data-source reviews are complete. Independent Sysmon reviews found that field-level realism improved, but per-host event volumes and recipe selection remain too uniform. 
Design a deterministic host/activity profile layer derived from scenario facts (host type, roles, assigned_user, persona, services, stable seed) and use it to shape Sysmon, Windows Security, Zeek, syslog, firewall, web, proxy, and eCAR/EDR rates. Avoid implementing Sysmon-only profile logic unless needed as a narrow bug fix. +- [x] Cross-source distribution realism layer — implemented a deterministic, overlay-capable host/activity profile layer derived from host family, roles, persona/risk, services, and stable per-host variance. Baseline generation now uses these profiles to scale Windows Security/Sysmon/eCAR, Zeek/network/web/proxy, Linux syslog/bash, firewall/ASA, IDS, auth, endpoint registry, scheduled process, and service-noise volumes without requiring scenario YAML changes. **Zeek:** - [x] Zeek DNS / network support log review — fixed DNS/TLS PTR coherence, added realistic TXT lookup variety, prevented CDN-hostname MX artifacts, increased file-server SMB target coverage, and made SSH pivot UIDs respect sensor visibility. Tests, docs, skills, and skill references updated where needed. @@ -583,8 +583,8 @@ Data works but experienced analysts spot tells. Grouped by format for efficient - [x] Security: bound threat-detection deny timestamp tracking window to prevent unbounded memory/CPU growth - [x] ASA imperfect-observation realism — addressed by the general observation profile layer. `complete` preserves paired training-friendly firewall evidence, while non-default profiles can apply deterministic ASA source-family gaps that create realistic missing/partial firewall evidence without rewriting canonical truth. - [ ] ASA message type diversity limited to 106023/302013-16/305011-12 — missing 111008, 113004, 733100, 106001, 725001, 304001 -- [ ] ASA deny baseline burstiness/profile variance — defer to a general per-source activity profile rather than a one-off ASA fix. 
Current deny events are uniformly spaced (3-7s); real scans should have configurable burst/quiet periods, campaign-level cadence, and source-specific variance. -- [ ] ASA deny metadata diversity — defer to a general field-distribution realism layer. Current deny events use `[0x0, 0x0]` hash values uniformly; a later profile should model when hashes remain zero vs vary by platform/message/context. +- [x] ASA deny baseline burstiness/profile variance — fixed through host activity profiles and firewall-deny burst scheduling. Baseline denies now use deterministic burst/quiet periods and host/profile variance instead of uniform 3-7 second spacing. +- [x] ASA deny metadata diversity — fixed by carrying deny hash metadata on canonical firewall context and rendering stable varied ASA hash values where appropriate instead of hardcoded `[0x0, 0x0]`. - [ ] Recognizable 45.33.32.x public IPs remain in built-in scan/attacker pools — the original `45.33.32.1` NAT PAT finding is stale, but code still uses `45.33.32.156` in scan/attacker pools. Move these values into data/config or replace them with less recognizable public-looking lab addresses during the broader public-IP/profile cleanup. **eCAR:** @@ -598,10 +598,11 @@ Data works but experienced analysts spot tells. Grouped by format for efficient **Cross-Source / General:** - [x] Configurable cross-source evidence disagreement — implemented as named observation profiles with `complete` as the default. Non-default profiles can introduce deterministic dropped/delayed/filtered/out-of-window evidence across Zeek, web, proxy, firewall, IDS, Windows, Sysmon, syslog, bash history, and eCAR without contradictions or ambiguous rewrites; ground truth retains source evidence status for traceability. 
- [x] Cross-sensor timestamp precision identical to 15+ decimal places — microsecond jitter added in snort.py, windows.py, and storyline.py -- [ ] **P2** Per-host-type event rate multiplier — Domain controllers generate ~50 events/hr but real DCs running AD/DNS/DFS/GPO produce thousands/hr. `system.type` is used for routing but never for volume scaling. Need `event_rate_multiplier` on System model (or implicit per-type defaults) applied in `_calculate_events_for_hour()` and `_generate_system_traffic()`. DCs should be 3-5x workstation baseline; file servers and web servers similarly elevated. -- [ ] Configurable per-entity artifact variation — deferred to the general host/activity profile layer. Encoded PowerShell baseline noise is currently identical across hosts (same Get-Service blob); later profiles should derive stable per-host command variants, encoded payloads, tool versions, and operator habits. -- [ ] Configurable per-host volume variance — deferred to the general host/activity profile layer. Workstation connection counts are suspiciously uniform (808-1068 range); later profiles should widen variance by role, persona, weekday, installed apps, and stable host-specific multipliers. +- [x] **P2** Per-host-type event rate multiplier — implemented as implicit host/activity profile defaults rather than scenario YAML fields. Domain controllers, file servers, web servers, proxies, Linux servers, and workstations now receive role/family/persona-specific multipliers across baseline activity, auth, endpoint, network, and source-specific noise. +- [x] Configurable per-entity artifact variation — implemented in the host/activity profile layer for baseline artifact texture, including stable per-host encoded PowerShell variants and profile-owned endpoint activity scaling. 
+- [x] Configurable per-host volume variance — implemented via stable host/persona/role multipliers applied across major activity families so hosts no longer share narrow uniform volume bands by construction. - [ ] Configurable per-host/source log deployment coverage — observation profiles now support source-family gaps and host-scoped missingness multipliers, but explicit per-host source enablement/disablement remains future work. A later setting should model named host groups, disabled sensors, partial deployments, and collection windows when users need topology-level telemetry coverage differences rather than event-level missingness. +- [ ] **P2** Generation speed and efficiency follow-up — Sprint 4 host/activity realism is functionally verified, but the slow-inclusive suite exposed that `pytest-cov` plus `tracemalloc` can make the medium dataset memory test pathological. A future sprint should profile generation without instrumentation noise, identify hot paths introduced by richer host activity/web fanout/firewall texture, and decide whether to optimize generation, mark the memory test `--no-cov`, or relax/update stale performance assertions. - [x] DNS IP pool reuse causes cross-provider resolution (CloudFront→Microsoft IPs, etc.) — domain-first selection ensures consistent domain→IP mapping via FORWARD_DNS - [x] AWS region mismatch between DNS PTR and SSL SNI for same IP — AWS hostname/PTR generation now derives a stable per-IP region/edge identity and PTR generation respects known forward hostname context. - [x] TLS volume clustering design — added data-driven TLS destination profiles with overlay support and `eforge validate-config` schema/tag checks. Auto-generated external TLS now uses weighted enterprise, certificate-infra, package-update, developer-tool, and long-tail browsing profiles with stable per-host preferences. Smoke output had 28,544 TLS SNI rows, 116 distinct names, top SNI share 5.5%, and top-5 share 18.0%. 
diff --git a/commands/eforge/config.md b/commands/eforge/config.md index b2d8b88b..17a026e3 100644 --- a/commands/eforge/config.md +++ b/commands/eforge/config.md @@ -70,6 +70,7 @@ When writing to the overlay, files are partial — they contain ONLY the user's | Modify Windows auth realism | `windows_auth_realism.yaml` | (standalone — Security log auth timing and failed-logon profile knobs) | | Modify baseline auth noise | `auth_noise.yaml` | (standalone — stale scheduled-credential accounts and irregular recurrence timing) | | Modify endpoint background noise | `endpoint_noise.yaml` | (standalone — scheduled-process timing and DHCP registry emission policy) | +| Modify host activity distribution | `host_activity_profiles.yaml` | (standalone — host/persona/role rate-family multipliers, firewall deny bursts, and artifact variants) | | Modify source observation coverage | `observation_profiles.yaml` | Scenario `observation_profile` selects the named profile; keep `complete` as the default training profile | | Modify causal/source timing | `timing_profiles.yaml` | (standalone — causal prerequisite, source latency, teardown, and Windows/Sysmon collision-spacing knobs) | | ~~Format definitions~~ | Not user-customizable | Engine internals — requires code changes | diff --git a/commands/eforge/references/config-dependency-graph.md b/commands/eforge/references/config-dependency-graph.md index 95a720b2..c3ee6dd8 100644 --- a/commands/eforge/references/config-dependency-graph.md +++ b/commands/eforge/references/config-dependency-graph.md @@ -49,6 +49,14 @@ Each row is a file; columns show what it depends on and what depends on it. 
| depends on | nothing | Standalone rate table | | **depended on by** | Engine (runtime) | Drives all baseline traffic rate calculations (user activity, web top-level actions, DNS, SMB, Kerberos, LDAP, persona connections) | +### host_activity_profiles.yaml +| Direction | File | Relationship | +|-----------|------|-------------| +| depends on | scenario host metadata | Uses system type, roles, assigned users, primary systems, and user personas to resolve coarse activity multipliers | +| depends on | `traffic_rates.yaml` | Multiplies resolved baseline rates after global intensity and scenario `baseline_activity.traffic_rates` overrides are applied | +| **depended on by** | Engine (runtime) | Shapes host/persona/role baseline volume, endpoint noise, Linux/syslog shell activity, firewall deny bursts, IDS/ICMP rates, and encoded PowerShell artifact variation | +| validated by | `eforge validate-config` | Enforces known rate-family names, ordered positive bounds, core host types, firewall deny burst settings, and artifact variant pools | + ### web_session_profiles.yaml | Direction | File | Relationship | |-----------|------|-------------| diff --git a/commands/eforge/references/config-host-activity.md b/commands/eforge/references/config-host-activity.md index fae076df..33634892 100644 --- a/commands/eforge/references/config-host-activity.md +++ b/commands/eforge/references/config-host-activity.md @@ -15,9 +15,10 @@ Schema documentation for host-level activity config files. User customizations g 5. [windows_auth_realism.yaml](#windows_auth_realismyaml) 6. [auth_noise.yaml](#auth-noise-auth_noiseyaml) 7. [endpoint_noise.yaml](#endpoint-noise-endpoint_noiseyaml) -8. [observation_profiles.yaml](#observation-profiles-observation_profilesyaml) -9. [timing_profiles.yaml](#timing_profilesyaml) -10. [Domain Controller Baseline Activity](#domain-controller-baseline-activity) +8. [host_activity_profiles.yaml](#host-activity-profiles-host_activity_profilesyaml) +9. 
[observation_profiles.yaml](#observation-profiles-observation_profilesyaml) +10. [timing_profiles.yaml](#timing_profilesyaml) +11. [Domain Controller Baseline Activity](#domain-controller-baseline-activity) --- @@ -350,6 +351,55 @@ registry_noise: --- +## Host Activity Profiles (`host_activity_profiles.yaml`) + +Controls coarse host/persona/role volume multipliers for baseline realism. This layer is intentionally rate-family based rather than event-type based: it keeps scenario authors from managing per-emitter matrices while still making domain controllers, servers, workstations, sysadmins, developers, and exposed roles produce distinct volumes. + +```yaml +rate_families: + default_bounds: [0.25, 6.0] + bounds: + windows_machine_auth: [0.5, 8.0] + firewall_deny: [0.4, 5.0] + +host_types: + workstation: + base_multiplier: 1.0 + variance: [0.75, 1.35] + families: + inbound_network: 0.65 + server: + base_multiplier: 1.8 + variance: [0.85, 1.45] + families: + windows_service_process: 1.15 + domain_controller: + base_multiplier: 4.0 + variance: [0.9, 1.3] + families: + dc_kerberos: 1.5 + +role_profiles: + web_server: + families: + inbound_network: 2.0 + firewall_deny: 1.35 + +persona_profiles: + sysadmin: + families: + linux_remote_admin: 1.45 + windows_remote_admin: 1.35 +``` + +Resolved multipliers apply after global intensity defaults and scenario `baseline_activity.traffic_rates` overrides. Use `traffic_rates.yaml` for global low/medium/high defaults; use `host_activity_profiles.yaml` when the rate should differ by host type, role, persona, or deterministic per-host variance. 
+ +Valid rate families are: `user_activity`, `web`, `dns_interval`, `ntp`, `smb_interval`, `kerberos`, `ldap`, `persona_connections`, `role_network`, `inbound_network`, `windows_service_process`, `windows_registry`, `windows_scheduled_task`, `windows_remote_thread`, `windows_process_access`, `windows_module_load`, `windows_remote_admin`, `windows_service_logon`, `windows_machine_auth`, `dc_kerberos`, `linux_syslog`, `linux_remote_admin`, `linux_shell`, `firewall_deny`, `ids_alert`, and `icmp_monitoring`. + +`artifact_variants.powershell_encoded` provides data-driven benign encoded PowerShell payload templates and parameter pools. The `firewall_deny` settings (separate from the `firewall_deny` rate family above) control ASA deny burst windows, quiet periods, and how often the deny metadata hash pair stays zero (the common case) rather than rendering a varied value. Run `eforge validate-config` after overlay changes; it rejects unknown rate-family names, missing core host types, inverted ranges, invalid probabilities, and empty artifact pools. + +--- + ## Observation Profiles (`observation_profiles.yaml`) Defines named source-observation profiles selected by scenario `observation_profile`. Keep `complete` as the default for training-friendly perfect source coverage and correlation. Use non-default profiles only when a scenario intentionally needs realistic source gaps or ingestion delays. 
diff --git a/commands/eforge/references/config-validation.md b/commands/eforge/references/config-validation.md index a0aa6ac9..86db24c5 100644 --- a/commands/eforge/references/config-validation.md +++ b/commands/eforge/references/config-validation.md @@ -85,7 +85,8 @@ Run `eforge info ` to get specific values (e.g., `eforge info paths.activ | 38 | auth_noise.yaml structure | ERROR | Invalid stale scheduled-credential account pool, host-count range, recurrence interval range, jitter range, skip probability, or backoff bounds | | 39 | endpoint_noise.yaml structure | ERROR | Invalid Windows scheduled-process timing bounds, skip probability, or DHCP registry emission policy | | 40 | observation_profiles.yaml structure | ERROR | Invalid source-family name, missing `complete` profile, invalid missingness probability, or inverted delay/host multiplier range | -| 41 | tls_realism.yaml chain metadata | ERROR | Invalid TLS subject-key profile fields or RSA/ECDSA child signature algorithm mismatch | +| 41 | host_activity_profiles.yaml structure | ERROR | Invalid host/persona/role rate-family name, missing core host type, malformed multiplier/bounds range, malformed firewall deny burst settings, or invalid artifact variant pools | +| 42 | tls_realism.yaml chain metadata | ERROR | Invalid TLS subject-key profile fields or RSA/ECDSA child signature algorithm mismatch | ## Scenario Validation: traffic_rates diff --git a/docs/reference/CUSTOMIZING_CONFIG.md b/docs/reference/CUSTOMIZING_CONFIG.md index c2d0a76d..286baf38 100644 --- a/docs/reference/CUSTOMIZING_CONFIG.md +++ b/docs/reference/CUSTOMIZING_CONFIG.md @@ -163,6 +163,7 @@ Configuration files are interconnected. 
When you add an entry to one file, other | Windows auth realism | `windows_auth_realism.yaml` (`workstation_lock.min_unlock_gap_seconds`, failed-logon local/network profiles, and optional companion network connection rates) | | Baseline auth noise | `auth_noise.yaml` (stale scheduled-credential account pools, host counts, recurrence intervals, jitter, skips, and backoff) | | Endpoint background noise | `endpoint_noise.yaml` (Windows scheduled-process trigger windows, host drift, skip probability, and DHCP registry emission policy) | +| Host/persona/role volume realism | `host_activity_profiles.yaml` (coarse rate-family multipliers, firewall deny burst shaping, and data-driven artifact variants) | | Observation/source coverage | `observation_profiles.yaml` (named source-level missingness/delay profiles selected by scenario `observation_profile`; default `complete` keeps perfect coverage) | | Causal/source-native timing | `timing_profiles.yaml` (`relationships` for causal prerequisites, source latency, teardown margins, Zeek analyzer offsets and TLS duration floors, plus Windows/Sysmon collision spacing) | | Public NTP fallback servers and DNS tunnel timing | `network_params.yaml` (`public_ntp_servers`, `dns_tunnel_rtt`; scenario-defined internal/domain NTP servers still take precedence) | diff --git a/scenarios/COVERAGE-TEST-PROMPT.md b/scenarios/COVERAGE-TEST-PROMPT.md index 44637578..200e0d0f 100644 --- a/scenarios/COVERAGE-TEST-PROMPT.md +++ b/scenarios/COVERAGE-TEST-PROMPT.md @@ -8,6 +8,8 @@ first minute of output is realistic rather than cold-start). logon_grace_period: "30m" (suppresses "no prior logon" warnings for users assumed already at their desk at time_window.start). + observation_profile: complete (explicit default — preserves training-friendly complete source + coverage; use non-default profiles only when specifically testing collection gaps). 
Systems (mix of Windows and Linux, ~20+ total): - One workstation per user, distributed across departments: dev, IT, @@ -80,7 +82,7 @@ - Service account (svc_backup) authenticating from an unusual host (not its normal server) — legitimate scheduled task migration, but looks like lateral movement. - All 10 log format groups: windows, zeek, ecar, syslog, bash_history, snort_alert, cisco_asa, + All 9 log format groups: windows, zeek, ecar, syslog, bash_history, snort_alert, cisco_asa, web_access, proxy_access. (Note: "windows" expands to windows_event_security + windows_event_sysmon; "zeek" expands to zeek_conn, zeek_dns, zeek_http, zeek_ssl, zeek_files, zeek_dhcp, zeek_ntp, zeek_weird, @@ -238,6 +240,7 @@ - 4634 logoff pairs with 4624 on matching TargetLogonId, including type 3 network logons and DC machine-account logons (after short delays) - Certificate validity periods match issuer (Let's Encrypt = 90 days, DigiCert = 397 days) + - X.509 child certificate signatures are compatible with the issuer key family and CA profile - Certificate chain depth and CA reuse driven by tls_realism.yaml/tls_issuers.yaml — intermediate CAs appear as shared profiles, not unique per leaf - MAC addresses use diverse OUI prefixes from network_params.yaml (Dell, HP, Lenovo, @@ -288,7 +291,8 @@ process terminations with realistic delays (recon: 0.3-5s, attack tools: 5-30s, persistent/C2: no termination); paired 1:1 with Security 4689 + eCAR PROCESS/TERMINATE for the same exit - Event 7 (ImageLoad): baseline DLL loads (ntdll.dll, kernel32.dll, etc.) with - signing status and signature details + signing status and signature details. Third-party DLLs preserve source-native signer, + company, product, and version metadata instead of falling back to Microsoft identity. - Event 8 (CreateRemoteThread): baseline benign pairs 1-3/hr (MsMpEng->explorer, csrss->svchost, etc.) 
plus storyline mimikatz create_remote_thread targeting lsass; correlated with eCAR THREAD/REMOTE_CREATE @@ -315,6 +319,9 @@ - Correct interface resolution: internal IPs -> "inside", DMZ IPs -> "dmz", external IPs -> "outside" - Per-sensor directory output: fw-perimeter/cisco_asa.log - Deny baseline volume proportional to deny_ratio (~5x allows) + - Deny baseline timing uses burst/quiet cadence from host_activity_profiles.yaml, not evenly + spaced attempts; 106023 hash pairs should vary when the profile calls for it, not always + render as [0x0, 0x0] - Firewall policy enforcement: external -> corporate_lan denied, external -> dmz:80/443 allowed - Storyline connections through the firewall produce ASA allow records correlated with Zeek conn records - 305011 (Built NAT translation) present when nat_rules configured @@ -334,6 +341,9 @@ Verify DNS-to-TCP offsets are not uniform; verify Sysmon Events 1/5/8/10 for the same process chain are not bucketed at identical timestamps. - Hawkes temporal model: user events show bursty clusters (CV > 1.0 in eval), not uniform spacing + - Host activity profiles: host type, roles, and persona shape broad rate families after + traffic_rates/scenario overrides. Verify DC/file/web/proxy/server hosts and user workstations + have distinct event-volume profiles rather than uniform per-host counts. 
- Typing cadence: multi-event storyline steps (e.g., step 4 discovery commands, step 10 AD enum) have 1-15 second gaps between events, not identical timestamps - Day-of-week variation: if scenario spans a weekend, Saturday/Sunday activity near-zero @@ -353,6 +363,8 @@ - Entity lifecycle: no process_access events targeting PIDs that don't exist in running_processes - Workstation lock/unlock (4800/4801): persona-driven lock frequency during work hours - Explicit credentials (4648): RunAs and scheduled task execution with alternate credentials + - Observation profile: `complete` keeps cross-source coverage training-friendly; source gaps, + delays, and partial collection belong to named non-default profiles and should not appear here. Proxy coverage (verify in generated data): - Forward proxy (PROXY-01 with roles: [forward_proxy]) routes web traffic for internal systems @@ -377,6 +389,8 @@ dirb/nmap_http always blank - Nikto User-Agent rotates per request via @NIKTO_TESTID@ token (6-digit IDs unique per request), not a single static string + - Browser-like page loads fan out into realistic CSS/JS/image/API subresource requests; the + top-level request budget counts user-driven page/tool requests, not every render component - Event-specific jitter defaults: beacon 0.15 (tight), web_scan 0.4 (wide), credential_spray 0.5 (self-pacing), dga_queries 0.3, dns_tunnel 0.25 — can be overridden per event diff --git a/scenarios/ITERATION-TEST-PROMPT.md b/scenarios/ITERATION-TEST-PROMPT.md index c21d49ca..199cf680 100644 --- a/scenarios/ITERATION-TEST-PROMPT.md +++ b/scenarios/ITERATION-TEST-PROMPT.md @@ -10,6 +10,8 @@ warmup: "2h" (minimum viable to pre-populate DNS cache, process trees, and sessions — cold-start artifacts are immediately visible to forensic reviewers). logon_grace_period: "30m" + observation_profile: enterprise_standard (intentionally exercises realistic source-level + observation gaps, delays, and coverage variation for blind-review improvement loops). 
Systems (mix of Windows and Linux, ~15 total): - 8 workstations, one per user (1:1 mapping — create one workstation per user): @@ -253,6 +255,7 @@ LDAP/RPC connections to DC, type 3 logon on DC — all within seconds - 4634 logoff pairs with 4624 on matching TargetLogonId - Certificate validity periods match issuer (Let's Encrypt = 90 days, DigiCert = 397 days) + - X.509 child certificate signatures are compatible with the issuer key family and CA profile - PID 4 resolves to "System" in parent process lookups - NAT rules produce: dynamic PAT for outbound (mapped_src_ip + translated port), static NAT for WEB-EXT-01 VIP. Outside Zeek sensors see post-NAT IPs; inside sensors see real IPs @@ -284,7 +287,9 @@ command line; ParentImage reflects spawn_rules.yaml chains - Event 3 (NetworkConnect): outbound connections attributed to originating process - Event 5 (ProcessTerminate): paired 1:1 with Security 4689 + eCAR PROCESS/TERMINATE - - Event 7 (ImageLoad): baseline DLL loads with signing status + - Event 7 (ImageLoad): baseline DLL loads with signing status. Third-party DLLs preserve + source-native signer, company, product, and version metadata instead of falling back to + Microsoft identity. 
- Event 8 (CreateRemoteThread): baseline benign pairs (1-3/hr) plus storyline mimikatz - Event 10 (ProcessAccess): baseline benign pairs (3-8/hr) plus storyline mimikatz on lsass - Event 11/12/13: emitted for persistence steps (service install, scheduled task) @@ -296,6 +301,9 @@ - Built/Teardown pairs (302013/302014) for permitted TCP connections - Built/Teardown pairs (302015/302016) for permitted UDP connections (DNS, NTP) - Deny records (106023) for blocked traffic + - Deny baseline timing uses burst/quiet cadence from host_activity_profiles.yaml, not evenly + spaced attempts; 106023 hash pairs should vary when the profile calls for it, not always + render as [0x0, 0x0] - 733100 threat-detection alerts during port_scan and web_scan phases (burst exceeds threat_detection_rate of 10 drops/sec). Verify rate_id, current_burst, max_burst, total_count fields present. @@ -309,6 +317,9 @@ - Causal expansion: DNS queries precede TCP connections; Kerberos 4768/4769 precede 4624 domain logons; process_access follows create_remote_thread targeting lsass - Hawkes temporal model: user events show bursty clusters (CV > 1.0), not uniform spacing + - Host activity profiles: host type, roles, and persona shape broad rate families after + traffic_rates/scenario overrides. Verify DC/file/web/proxy/server hosts and user workstations + have distinct event-volume profiles rather than uniform per-host counts. 
- Typing cadence: multi-event storyline steps have 1-15 second gaps, not identical timestamps - Process→network correlation: chrome.exe/git/sqlcmd baseline processes produce matching connections - Stale account enrichment: Kerberos 4771 (0x12) failures plus failed batch and service logons @@ -321,6 +332,9 @@ - Workstation lock/unlock (4800/4801): workstation_lock always precedes workstation_unlock for the same session — semantic ordering enforced - Explicit credentials (4648): RunAs and scheduled task execution with alternate credentials + - Observation profile: `enterprise_standard` introduces realistic source-level gaps, delays, + and coverage variation without contradictions. Ground truth should still preserve canonical + truth and source-evidence status for reviewer traceability. Proxy coverage (verify in generated data): - PROXY-01 (forward_proxy) routes web traffic for internal systems @@ -337,6 +351,8 @@ - Nikto User-Agent rotates per request via @NIKTO_TESTID@ token (unique 6-digit IDs), not a single static string - Web-scan Referer for nikto: ~30% same-origin; for sqlmap/dirb/nmap_http: always blank + - Browser-like page loads fan out into realistic CSS/JS/image/API subresource requests; the + top-level request budget counts user-driven page/tool requests, not every render component Ground truth / answer key: - GROUND_TRUTH.md generated automatically from storyline events diff --git a/scenarios/LARGE-SCALE-COVERAGE-TEST-PROMPT.md b/scenarios/LARGE-SCALE-COVERAGE-TEST-PROMPT.md index 12a66769..8657aa9f 100644 --- a/scenarios/LARGE-SCALE-COVERAGE-TEST-PROMPT.md +++ b/scenarios/LARGE-SCALE-COVERAGE-TEST-PROMPT.md @@ -8,6 +8,8 @@ Duration: 72 hours (3 full business days), starting 2024-03-18T06:00:00Z (Monday morning). Timezone: America/Chicago. This spans Monday–Wednesday, exercising day-of-week variation with full business-day cycles including morning ramp-up, lunch dips, and evening wind-down. 
+ observation_profile: complete (explicit default — preserves training-friendly complete source + coverage; use non-default profiles only when specifically testing collection gaps). Scenario name: apt-healthcare-breach-large @@ -249,10 +251,12 @@ Key requirements: - Exercise all typed event types: process, logon, failed_logon, logoff (baseline), connection, ssh_session, rdp_session, account_created, account_deleted, group_member_added, service_installed, - scheduled_task_created, log_cleared, create_remote_thread, dhcp_lease, port_scan, beacon, dns_query, - web_scan, credential_spray, dga_queries, dns_tunnel, raw - - NOTE: process_access is NOT a scenario event type — it is auto-generated by create_remote_thread - targeting lsass.exe via the causal expansion engine. Do not declare it in the YAML. + scheduled_task_created, log_cleared, create_remote_thread, process_access, dhcp_lease, + port_scan, beacon, dns_query, web_scan, credential_spray, dga_queries, dns_tunnel, raw + - NOTE: process_access IS a valid scenario event type and can be declared directly for a standalone + Sysmon Event 10. However, create_remote_thread targeting lsass.exe auto-generates correlated + process_access via the causal expansion engine. Do not declare a second process_access on lsass + in the same step. 
- Use connection events with HTTP fields (method, uri, status_code, user_agent) for web access log entries showing the SQLi, web shell access, and failed exploit attempts — NOT raw events - All base64 payloads must be real (generated via Bash tool) @@ -266,6 +270,7 @@ - DHCP events are routed to sensors by segment visibility (not duplicated across all sensors) - Windows service account events (SYSTEM, NETWORK SERVICE) show "NT AUTHORITY" as SubjectDomainName - Certificate validity periods match issuer (Let's Encrypt = 90 days, DigiCert = 397 days) + - X.509 child certificate signatures are compatible with the issuer key family and CA profile - MAC addresses use diverse OUI prefixes (Dell, HP, Lenovo, Intel, VMware) - PID 4 resolves to "System" in parent process lookups @@ -288,6 +293,8 @@ Sysmon coverage (verify in generated data): - Event 1 (ProcessCreate): baseline + storyline process events - Event 5 (ProcessTerminate): baseline process terminations plus storyline with realistic delays + - Event 7 (ImageLoad): third-party DLLs preserve source-native signer, company, product, and + version metadata instead of falling back to Microsoft identity - Event 8 (CreateRemoteThread): baseline benign pairs plus storyline mimikatz - Event 10 (ProcessAccess): baseline benign pairs plus storyline mimikatz on lsass - Baseline Event 8/10 noise ensures storyline attack events are not instant red flags @@ -302,6 +309,9 @@ - Correct interface resolution per firewall: fw-external uses inside/dmz/outside; fw-internal uses db-zone/mgmt-zone/outside - Deny baseline proportional to deny_ratio: ~8x for external firewall, ~3x for internal + - Deny baseline timing uses burst/quiet cadence from host_activity_profiles.yaml, not evenly + spaced attempts; 106023 hash pairs should vary when the profile calls for it, not always + render as [0x0, 0x0] - Policy enforcement: external → corporate_lan denied, external → dmz:80/443 allowed, app_vlan → database_vlan:3306 allowed, corporate_lan → 
database_vlan denied - Storyline step 23 (failed exfil from DC-01) should produce a firewall deny record since @@ -317,6 +327,9 @@ - Causal expansion: DNS queries precede TCP connections; Kerberos precede domain logons; process_access follows create_remote_thread targeting lsass - Hawkes temporal model: user events show bursty clusters (CV > 1.0), not uniform spacing + - Host activity profiles: host type, roles, and persona shape broad rate families after + traffic_rates/scenario overrides. Verify DC/file/web/proxy/server hosts and user workstations + have distinct event-volume profiles rather than uniform per-host counts. - Typing cadence: multi-event storyline steps have 1-15 second gaps between events - Day-of-week variation: 3-day span exercises full weekday patterns - Lateral movement: backup/monitoring/AD replication/mail routing between servers @@ -326,5 +339,9 @@ - Linux syslog depth: SSH login messages, package management, systemd timers, logrotate, journald - Command diversification: user-specific paths and varied project/document names - Entity lifecycle: no process_access targeting nonexistent PIDs + - Browser-like page loads fan out into realistic CSS/JS/image/API subresource requests; the + top-level request budget counts user-driven page/tool requests, not every render component + - Observation profile: `complete` keeps cross-source coverage training-friendly; source gaps, + delays, and partial collection belong to named non-default profiles and should not appear here. Save to scenarios/apt-healthcare-breach-large/scenario.yaml with accompanying ENVIRONMENT.md. 
diff --git a/src/evidenceforge/cli/validate_config.py b/src/evidenceforge/cli/validate_config.py index 80ac0aaf..42fad68d 100644 --- a/src/evidenceforge/cli/validate_config.py +++ b/src/evidenceforge/cli/validate_config.py @@ -233,6 +233,16 @@ def validate_config() -> ValidationResult: "activity/endpoint_noise.yaml": { "dict_fields": {"windows_scheduled_processes", "registry_noise"}, }, + "activity/host_activity_profiles.yaml": { + "dict_fields": { + "rate_families", + "host_types", + "role_profiles", + "persona_profiles", + "artifact_variants", + "firewall_deny", + }, + }, "activity/ids_signatures.yaml": { "list_fields": {"signatures": None}, }, @@ -450,6 +460,9 @@ def validate_config() -> ValidationResult: ) from evidenceforge.generation.activity.dns_registry import load_dns_registry from evidenceforge.generation.activity.endpoint_noise import load_endpoint_noise + from evidenceforge.generation.activity.host_activity_profiles import ( + load_host_activity_profiles, + ) from evidenceforge.generation.activity.ids_signatures import load_ids_signatures from evidenceforge.generation.activity.process_access_patterns import ( load_process_access_patterns, @@ -481,6 +494,7 @@ def validate_config() -> ValidationResult: site_data = load_site_maps() sys_proc_data = load_system_processes() endpoint_noise_data = load_endpoint_noise() + host_activity_profiles_data = load_host_activity_profiles() observation_profiles_data = load_observation_profiles() tls_realism_data = load_tls_realism() windows_auth_data = load_windows_auth_realism() @@ -1697,6 +1711,7 @@ def _record_ids_rule_identity( DnsTunnelTtlEntry, EdrFileSideEffectProfile, EndpointNoiseConfig, + HostActivityProfilesConfig, KerberosRealismConfig, ObservationProfilesConfig, OuiEntry, @@ -1830,6 +1845,14 @@ def _record_ids_rule_identity( _SCHEMA_CHECKS.append( ([observation_profiles_data], ObservationProfilesConfig, "observation_profiles.yaml") ) + if host_activity_profiles_data: + _SCHEMA_CHECKS.append( + ( + 
[host_activity_profiles_data], + HostActivityProfilesConfig, + "host_activity_profiles.yaml", + ) + ) # traffic_profiles.yaml: connection entries all_traffic_connection_entries = [] diff --git a/src/evidenceforge/config/activity/README.md b/src/evidenceforge/config/activity/README.md index 84f8050b..684bbb1a 100644 --- a/src/evidenceforge/config/activity/README.md +++ b/src/evidenceforge/config/activity/README.md @@ -23,6 +23,7 @@ caches data after first load. Two files (`network_params.yaml`, | `windows_auth_realism.yaml` | `windows_auth_realism.py` | Windows Security authentication realism knobs such as minimum 4800→4801 lock/unlock gap, failed-logon validation paths, companion network evidence, and 4672 privilege profiles. | | `auth_noise.yaml` | `auth_noise.py` | Baseline authentication-noise profiles such as stale scheduled-credential account pools and irregular recurrence timing. | | `endpoint_noise.yaml` | `endpoint_noise.py` | Endpoint background timing and registry-emission policies for Windows scheduled processes and DHCP interface registry writes. | +| `host_activity_profiles.yaml` | `host_activity_profiles.py` | Coarse host/persona/role rate multipliers for baseline volume, endpoint noise, firewall deny bursts, and data-driven artifact variation. | | `observation_profiles.yaml` | `config/observation_profiles.py` | Named source-observation profiles for optional source-level missingness and delays. Scenario `observation_profile` defaults to `complete`. | | `proxy_uri_templates.yaml` | `proxy_uri.py` | Per-domain URI path templates for proxy logs (Windows Update, CRL, OCSP, Azure AD, etc.). | | `network_params.yaml` | `network_params.py`, `engine/emitter_setup.py` | MAC address OUI prefixes, public NTP fallback servers, and DNS tunnel RTT bounds. 
| diff --git a/src/evidenceforge/config/activity/host_activity_profiles.yaml b/src/evidenceforge/config/activity/host_activity_profiles.yaml new file mode 100644 index 00000000..fed3eb39 --- /dev/null +++ b/src/evidenceforge/config/activity/host_activity_profiles.yaml @@ -0,0 +1,199 @@ +# Host/persona/role activity multipliers for baseline realism. +# +# These profiles are intentionally coarse. They shape broad source families +# without forcing every emitter/event type to carry its own micro-profile. +# +# Overridable via .eforge/config/activity/host_activity_profiles.yaml. +# +# Depended on by: baseline generation engine, suspicious benign activity +# Depends on: scenario system.type, roles, assigned_user, user.persona + +rate_families: + default_bounds: [0.25, 6.0] + bounds: + web: [0.4, 2.5] + dns_interval: [0.5, 4.0] + smb_interval: [0.4, 5.0] + kerberos: [0.5, 6.0] + ldap: [0.5, 6.0] + windows_machine_auth: [0.5, 8.0] + dc_kerberos: [0.8, 8.0] + linux_syslog: [0.4, 5.0] + firewall_deny: [0.4, 5.0] + +host_types: + workstation: + base_multiplier: 1.0 + variance: [0.75, 1.35] + families: + user_activity: 0.8 + role_network: 0.85 + inbound_network: 0.65 + windows_service_logon: 0.75 + windows_machine_auth: 0.9 + linux_syslog: 0.85 + firewall_deny: 0.8 + + server: + base_multiplier: 1.8 + variance: [0.85, 1.45] + families: + user_activity: 0.45 + persona_connections: 0.55 + web: 0.65 + dns_interval: 0.8 + smb_interval: 0.85 + kerberos: 0.9 + ldap: 0.9 + windows_service_process: 1.15 + windows_registry: 1.25 + windows_scheduled_task: 1.15 + windows_process_access: 1.15 + windows_module_load: 1.2 + windows_service_logon: 1.25 + windows_machine_auth: 1.0 + linux_syslog: 1.25 + linux_remote_admin: 1.2 + linux_shell: 0.8 + firewall_deny: 1.1 + + domain_controller: + base_multiplier: 4.0 + variance: [0.9, 1.3] + families: + user_activity: 0.2 + persona_connections: 0.25 + web: 0.35 + dns_interval: 0.45 + smb_interval: 0.65 + kerberos: 1.15 + ldap: 1.05 + role_network: 
1.35 + inbound_network: 1.35 + windows_service_process: 1.35 + windows_registry: 1.35 + windows_scheduled_task: 1.2 + windows_process_access: 1.25 + windows_module_load: 1.3 + windows_service_logon: 1.4 + windows_machine_auth: 1.7 + dc_kerberos: 1.5 + firewall_deny: 1.1 + +role_profiles: + file_server: + families: + role_network: 1.35 + inbound_network: 2.2 + smb_interval: 1.8 + windows_registry: 1.1 + windows_service_logon: 1.2 + + web_server: + families: + web: 1.2 + role_network: 1.25 + inbound_network: 2.0 + linux_syslog: 1.45 + firewall_deny: 1.35 + + database: + families: + role_network: 1.3 + inbound_network: 1.8 + linux_syslog: 1.25 + windows_service_process: 1.15 + + app_server: + families: + role_network: 1.25 + inbound_network: 1.6 + windows_service_process: 1.1 + linux_syslog: 1.15 + + log_server: + families: + role_network: 1.2 + inbound_network: 2.1 + linux_syslog: 1.7 + + forward_proxy: + families: + role_network: 1.35 + inbound_network: 1.7 + linux_syslog: 1.35 + firewall_deny: 1.2 + + dns_server: + families: + dns_interval: 1.7 + role_network: 1.25 + inbound_network: 1.8 + linux_syslog: 1.2 + + domain_controller: + families: + dns_interval: 1.4 + kerberos: 1.25 + ldap: 1.25 + role_network: 1.35 + inbound_network: 1.5 + windows_machine_auth: 1.35 + dc_kerberos: 1.35 + +persona_profiles: + developer: + families: + persona_connections: 1.25 + linux_shell: 1.35 + + sysadmin: + families: + user_activity: 1.05 + persona_connections: 1.15 + linux_remote_admin: 1.45 + linux_shell: 1.45 + windows_remote_admin: 1.35 + + security_analyst: + families: + user_activity: 1.05 + persona_connections: 1.2 + linux_remote_admin: 1.2 + windows_remote_admin: 1.2 + + executive: + families: + user_activity: 0.8 + persona_connections: 0.9 + linux_shell: 0.6 + +artifact_variants: + powershell_encoded: + host_preferred_template_count: 3 + templates: + - "Get-Service -Name {svc}" + - "Get-EventLog -LogName {log} -Newest {n}" + - "Test-NetConnection {host} -Port {port}" + - 
"Get-Process -Name {proc}" + - "Get-ChildItem -Path C:\\{dir} -Recurse | Measure-Object" + - "Get-WmiObject Win32_LogicalDisk | Select-Object DeviceID, FreeSpace" + - "Get-HotFix | Sort-Object InstalledOn -Descending | Select-Object -First {n}" + - "Get-CimInstance Win32_Service | Where-Object {$_.State -eq '{svc_state}'}" + - "Get-ScheduledTask | Where-Object {$_.State -eq '{task_state}'}" + params: + svc: ["Spooler", "W32Time", "wuauserv", "BITS", "WinRM", "Dhcp", "Dnscache", "EventLog"] + svc_state: ["Running", "Stopped"] + task_state: ["Ready", "Running", "Disabled"] + log: ["System", "Application", "Security", "Setup"] + n: ["10", "25", "50", "100"] + host: ["dc01", "fileserver", "10.0.0.1", "localhost", "gateway"] + port: ["80", "443", "3389", "5985", "22"] + proc: ["svchost", "explorer", "chrome", "outlook", "code", "winlogon"] + dir: ["Logs", "Temp", "Reports", "Users\\Public"] + +firewall_deny: + burst_window_count: [2, 5] + burst_width_seconds: [20, 180] + quiet_probability: 0.08 + metadata_hash_nonzero_probability: 0.18 diff --git a/src/evidenceforge/config/schemas.py b/src/evidenceforge/config/schemas.py index 99862ea6..66fcfd08 100644 --- a/src/evidenceforge/config/schemas.py +++ b/src/evidenceforge/config/schemas.py @@ -1413,6 +1413,170 @@ def validate_rate_range(cls, v: Any) -> Any: return v +# --- Host Activity Profiles --- + + +_HOST_ACTIVITY_RATE_FAMILIES = frozenset( + { + "user_activity", + "web", + "dns_interval", + "ntp", + "smb_interval", + "kerberos", + "ldap", + "persona_connections", + "role_network", + "inbound_network", + "windows_service_process", + "windows_registry", + "windows_scheduled_task", + "windows_remote_thread", + "windows_process_access", + "windows_module_load", + "windows_remote_admin", + "windows_service_logon", + "windows_machine_auth", + "dc_kerberos", + "linux_syslog", + "linux_remote_admin", + "linux_shell", + "firewall_deny", + "ids_alert", + "icmp_monitoring", + } +) + + +class 
HostActivityRateFamiliesConfig(BaseModel, extra="forbid"): + """Rate-family bounds for host_activity_profiles.yaml.""" + + default_bounds: list[float] + bounds: dict[str, list[float]] = Field(default_factory=dict) + + @field_validator("default_bounds") + @classmethod + def default_bounds_valid(cls, v: list[float]) -> list[float]: + return _validate_positive_pair(v, "default_bounds") + + @field_validator("bounds") + @classmethod + def bounds_valid(cls, v: dict[str, list[float]]) -> dict[str, list[float]]: + unknown = sorted(set(v) - _HOST_ACTIVITY_RATE_FAMILIES) + if unknown: + raise ValueError(f"unknown rate family bounds: {unknown}") + for family, bounds in v.items(): + _validate_positive_pair(bounds, f"bounds.{family}") + return v + + +def _validate_positive_pair(v: list[float], field_name: str) -> list[float]: + """Validate a two-value positive numeric range.""" + if len(v) != 2: + raise ValueError(f"{field_name} must be a two-value [min, max] list") + if not all(isinstance(item, int | float) and item > 0 for item in v): + raise ValueError(f"{field_name} values must be positive numbers") + if v[0] > v[1]: + raise ValueError(f"{field_name} min must be <= max") + return v + + +class HostActivityProfileEntry(BaseModel, extra="forbid"): + """Host type, role, or persona multiplier profile.""" + + base_multiplier: float = Field(default=1.0, gt=0) + variance: list[float] | None = None + families: dict[str, float] = Field(default_factory=dict) + + @field_validator("variance") + @classmethod + def variance_valid(cls, v: list[float] | None) -> list[float] | None: + if v is None: + return v + return _validate_positive_pair(v, "variance") + + @field_validator("families") + @classmethod + def families_valid(cls, v: dict[str, float]) -> dict[str, float]: + unknown = sorted(set(v) - _HOST_ACTIVITY_RATE_FAMILIES) + if unknown: + raise ValueError(f"unknown activity families: {unknown}") + for family, multiplier in v.items(): + if not isinstance(multiplier, int | float) or 
multiplier <= 0: + raise ValueError(f"family multiplier {family!r} must be positive") + return v + + +class PowerShellEncodedVariantsConfig(BaseModel, extra="forbid"): + """Data-driven encoded PowerShell command variants.""" + + host_preferred_template_count: int = Field(default=3, gt=0) + templates: list[str] + params: dict[str, list[str]] = Field(default_factory=dict) + + @field_validator("templates") + @classmethod + def templates_non_empty(cls, v: list[str]) -> list[str]: + if not v or any(not template for template in v): + raise ValueError("templates must contain non-empty strings") + return v + + @field_validator("params") + @classmethod + def params_non_empty(cls, v: dict[str, list[str]]) -> dict[str, list[str]]: + for key, values in v.items(): + if not key or not values or any(not value for value in values): + raise ValueError("params keys and values must be non-empty") + return v + + +class HostActivityArtifactVariantsConfig(BaseModel, extra="forbid"): + """Artifact variation config for host_activity_profiles.yaml.""" + + powershell_encoded: PowerShellEncodedVariantsConfig + + +class HostActivityFirewallDenyConfig(BaseModel, extra="forbid"): + """Firewall deny burst and metadata knobs.""" + + burst_window_count: list[int] + burst_width_seconds: list[int] + quiet_probability: float = Field(ge=0.0, le=1.0) + metadata_hash_nonzero_probability: float = Field(ge=0.0, le=1.0) + + @field_validator("burst_window_count", "burst_width_seconds") + @classmethod + def integer_range_valid(cls, v: list[int]) -> list[int]: + if len(v) != 2: + raise ValueError("must be a two-value [min, max] list") + if not all(isinstance(item, int) and item > 0 for item in v): + raise ValueError("values must be positive integers") + if v[0] > v[1]: + raise ValueError("min must be <= max") + return v + + +class HostActivityProfilesConfig(BaseModel, extra="forbid"): + """Root schema for host_activity_profiles.yaml.""" + + rate_families: HostActivityRateFamiliesConfig + host_types: dict[str, 
HostActivityProfileEntry] + role_profiles: dict[str, HostActivityProfileEntry] = Field(default_factory=dict) + persona_profiles: dict[str, HostActivityProfileEntry] = Field(default_factory=dict) + artifact_variants: HostActivityArtifactVariantsConfig + firewall_deny: HostActivityFirewallDenyConfig + + @field_validator("host_types") + @classmethod + def required_host_types_present( + cls, v: dict[str, HostActivityProfileEntry] + ) -> dict[str, HostActivityProfileEntry]: + missing = sorted({"workstation", "server", "domain_controller"} - set(v)) + if missing: + raise ValueError(f"missing host type profiles: {missing}") + return v + + # --- Validation helper --- diff --git a/src/evidenceforge/events/contexts.py b/src/evidenceforge/events/contexts.py index cce47207..48574b55 100644 --- a/src/evidenceforge/events/contexts.py +++ b/src/evidenceforge/events/contexts.py @@ -547,6 +547,8 @@ class FirewallContext: access_group: str = "" # ACL name for deny logs bytes_sent: int = 0 # For teardown records duration: str = "" # "H:MM:SS" for teardown + deny_hash_a: str = "0x0" # ASA deny metadata hash field + deny_hash_b: str = "0x0" # ASA deny metadata hash field @dataclass(slots=True) diff --git a/src/evidenceforge/generation/activity/host_activity_profiles.py b/src/evidenceforge/generation/activity/host_activity_profiles.py new file mode 100644 index 00000000..5d7e1c0a --- /dev/null +++ b/src/evidenceforge/generation/activity/host_activity_profiles.py @@ -0,0 +1,281 @@ +# Copyright (c) 2026 Cisco Systems, Inc. and its affiliates +# SPDX-License-Identifier: MIT + +"""Host/persona/role activity profile loader and resolver. + +The resolver intentionally works at coarse rate-family granularity. This keeps +baseline realism configurable without making every emitter and event subtype +carry its own profile knobs. 
+""" + +from __future__ import annotations + +import base64 +import random +from dataclasses import dataclass +from typing import Any + +from evidenceforge.config import get_activity_directory +from evidenceforge.config.overlay import deep_merge_dict, load_with_overlay +from evidenceforge.utils.rng import _stable_seed + +_PROFILES_PATH = get_activity_directory() / "host_activity_profiles.yaml" +_CACHED_DATA: dict[str, Any] | None = None + +RATE_FAMILIES = frozenset( + { + "user_activity", + "web", + "dns_interval", + "ntp", + "smb_interval", + "kerberos", + "ldap", + "persona_connections", + "role_network", + "inbound_network", + "windows_service_process", + "windows_registry", + "windows_scheduled_task", + "windows_remote_thread", + "windows_process_access", + "windows_module_load", + "windows_remote_admin", + "windows_service_logon", + "windows_machine_auth", + "dc_kerberos", + "linux_syslog", + "linux_remote_admin", + "linux_shell", + "firewall_deny", + "ids_alert", + "icmp_monitoring", + } +) + + +@dataclass(frozen=True) +class HostActivityProfile: + """Resolved activity multipliers for one host/persona view.""" + + hostname: str + multipliers: dict[str, float] + + def multiplier(self, family: str) -> float: + """Return a bounded multiplier for a rate family.""" + return self.multipliers.get(family, 1.0) + + +def load_host_activity_profiles() -> dict[str, Any]: + """Load host activity profiles, merged with overlay. 
Cached after first call.""" + global _CACHED_DATA # noqa: PLW0603 + if _CACHED_DATA is not None: + return _CACHED_DATA + _CACHED_DATA = load_with_overlay( + _PROFILES_PATH, + "activity/host_activity_profiles.yaml", + deep_merge_dict, + ) + return _CACHED_DATA + + +def reset_cache() -> None: + """Clear cached data for tests.""" + global _CACHED_DATA # noqa: PLW0603 + _CACHED_DATA = None + + +def _as_float(value: Any, default: float) -> float: + try: + return float(value) + except (TypeError, ValueError): + return default + + +def _range_pair(value: Any, default: tuple[float, float]) -> tuple[float, float]: + if not isinstance(value, list | tuple) or len(value) != 2: + return default + lo = _as_float(value[0], default[0]) + hi = _as_float(value[1], default[1]) + if lo <= 0 or hi <= 0: + return default + if lo > hi: + return (hi, lo) + return (lo, hi) + + +def _family_multiplier(profile: dict[str, Any] | None, family: str) -> float: + if not isinstance(profile, dict): + return 1.0 + families = profile.get("families", {}) + if not isinstance(families, dict): + return 1.0 + return max(0.0, _as_float(families.get(family), 1.0)) + + +def _bounds_for_family(data: dict[str, Any], family: str) -> tuple[float, float]: + rate_families = data.get("rate_families", {}) + if not isinstance(rate_families, dict): + return (0.25, 6.0) + default_bounds = _range_pair(rate_families.get("default_bounds"), (0.25, 6.0)) + bounds = rate_families.get("bounds", {}) + if isinstance(bounds, dict) and family in bounds: + return _range_pair(bounds[family], default_bounds) + return default_bounds + + +def resolve_host_activity_profile( + *, + scenario_name: str, + system: Any, + roles: list[str] | None = None, + persona: str | None = None, +) -> HostActivityProfile: + """Resolve deterministic activity multipliers for a host/persona combination.""" + data = load_host_activity_profiles() + host_type = str(getattr(system, "type", "workstation") or "workstation").lower() + hostname = 
str(getattr(system, "hostname", "") or "") + normalized_roles = [role.lower() for role in roles or getattr(system, "roles", []) or []] + if host_type == "domain_controller" and "domain_controller" not in normalized_roles: + normalized_roles.append("domain_controller") + + host_profiles = data.get("host_types", {}) if isinstance(data, dict) else {} + role_profiles = data.get("role_profiles", {}) if isinstance(data, dict) else {} + persona_profiles = data.get("persona_profiles", {}) if isinstance(data, dict) else {} + host_profile = ( + host_profiles.get(host_type) + if isinstance(host_profiles, dict) and isinstance(host_profiles.get(host_type), dict) + else {} + ) + base_multiplier = max(0.0, _as_float(host_profile.get("base_multiplier"), 1.0)) + variance_min, variance_max = _range_pair(host_profile.get("variance"), (1.0, 1.0)) + persona_profile = ( + persona_profiles.get(str(persona).lower()) + if persona and isinstance(persona_profiles, dict) + else None + ) + + multipliers: dict[str, float] = {} + for family in RATE_FAMILIES: + host_variance_rng = random.Random( + _stable_seed(f"host_activity:{scenario_name}:{hostname}:{family}") + ) + multiplier = base_multiplier * host_variance_rng.uniform(variance_min, variance_max) + multiplier *= _family_multiplier(host_profile, family) + if isinstance(role_profiles, dict): + for role in normalized_roles: + role_profile = role_profiles.get(role) + multiplier *= _family_multiplier(role_profile, family) + multiplier *= _family_multiplier(persona_profile, family) + + low, high = _bounds_for_family(data, family) + multipliers[family] = max(low, min(high, multiplier)) + + return HostActivityProfile(hostname=hostname, multipliers=multipliers) + + +def scale_count_range(lo: int, hi: int, multiplier: float) -> tuple[int, int]: + """Scale a randint-style count range while preserving a nonzero range.""" + lo = int(lo) + hi = int(hi) + if hi < lo: + lo, hi = hi, lo + scaled_lo = int(round(lo * multiplier)) + scaled_hi = int(round(hi * 
multiplier)) + if lo > 0: + scaled_lo = max(1, scaled_lo) + scaled_hi = max(scaled_lo, scaled_hi) + else: + scaled_lo = max(0, scaled_lo) + scaled_hi = max(scaled_lo, scaled_hi) + return scaled_lo, scaled_hi + + +def scale_interval_range(lo: int, hi: int, multiplier: float) -> tuple[int, int]: + """Scale seconds-between-events ranges; higher multiplier means shorter intervals.""" + lo = int(lo) + hi = int(hi) + if hi < lo: + lo, hi = hi, lo + divisor = max(0.01, multiplier) + scaled_lo = max(1, int(round(lo / divisor))) + scaled_hi = max(scaled_lo, int(round(hi / divisor))) + return scaled_lo, scaled_hi + + +def pick_firewall_deny_offset( + *, + rng: random.Random, + sensor_name: str, + current_hour_epoch: int, + generated_index: int, + multiplier: float, +) -> float | None: + """Pick a bursty deny-event offset for an ASA/firewall baseline record.""" + data = load_host_activity_profiles() + config = data.get("firewall_deny", {}) if isinstance(data, dict) else {} + quiet_probability = _as_float(config.get("quiet_probability"), 0.08) + if rng.random() < quiet_probability / max(0.5, multiplier): + return None + + count_lo, count_hi = _range_pair(config.get("burst_window_count"), (2.0, 5.0)) + width_lo, width_hi = _range_pair(config.get("burst_width_seconds"), (20.0, 180.0)) + burst_count = max(1, int(round(rng.randint(int(count_lo), int(count_hi)) * multiplier))) + burst_index = generated_index % burst_count + burst_rng = random.Random( + _stable_seed(f"firewall_deny_burst:{sensor_name}:{current_hour_epoch}:{burst_index}") + ) + center = burst_rng.uniform(120, 3480) + width = burst_rng.uniform(width_lo, width_hi) + return max(0.0, min(3599.0, center + rng.gauss(0, width / 3.0))) + + +def firewall_deny_hash_values(rng: random.Random) -> tuple[str, str]: + """Return ASA deny hash values with realistic mostly-zero behavior.""" + data = load_host_activity_profiles() + config = data.get("firewall_deny", {}) if isinstance(data, dict) else {} + probability = max( + 0.0, 
min(1.0, _as_float(config.get("metadata_hash_nonzero_probability"), 0.18)) + ) + if rng.random() >= probability: + return ("0x0", "0x0") + return (f"0x{rng.getrandbits(16):04x}", f"0x{rng.getrandbits(16):04x}") + + +def generate_encoded_powershell_command( + *, + rng: random.Random, + hostname: str, + username: str, +) -> str: + """Generate a host-biased UTF-16LE PowerShell EncodedCommand payload.""" + data = load_host_activity_profiles() + variants = data.get("artifact_variants", {}) if isinstance(data, dict) else {} + ps_config = variants.get("powershell_encoded", {}) if isinstance(variants, dict) else {} + templates = ps_config.get("templates", []) + if not isinstance(templates, list) or not templates: + templates = ["Get-Service -Name {svc}"] + + preferred_count = max(1, int(ps_config.get("host_preferred_template_count", 3))) + host_rng = random.Random(_stable_seed(f"ps_encoded_templates:{hostname}:{username}")) + preferred = list(templates) + if len(preferred) > preferred_count: + preferred = host_rng.sample(preferred, preferred_count) + template = str(rng.choice(preferred)) + + params = ps_config.get("params", {}) + if not isinstance(params, dict): + params = {} + command = template + for key, values in params.items(): + placeholder = "{" + str(key) + "}" + if placeholder not in command: + continue + if not isinstance(values, list) or not values: + continue + param_rng = random.Random( + _stable_seed(f"ps_encoded_param:{hostname}:{username}:{key}:{rng.random()}") + ) + command = command.replace(placeholder, str(param_rng.choice(values))) + + return base64.b64encode(command.encode("utf-16-le")).decode("ascii") diff --git a/src/evidenceforge/generation/activity/suspicious_benign.py b/src/evidenceforge/generation/activity/suspicious_benign.py index 52af7722..98a73566 100644 --- a/src/evidenceforge/generation/activity/suspicious_benign.py +++ b/src/evidenceforge/generation/activity/suspicious_benign.py @@ -30,11 +30,13 @@ low=~1/hr, medium=~2/hr, high=~3/hr, 
ludicrous=~5/hr """ -import base64 import logging import random from datetime import datetime, timedelta +from evidenceforge.generation.activity.host_activity_profiles import ( + generate_encoded_powershell_command, +) from evidenceforge.models.scenario import Persona, System, User logger = logging.getLogger(__name__) @@ -523,43 +525,22 @@ def generate_temp_dir_execution( } -# Benign PowerShell command templates for base64-encoded commands. -# Each invocation picks a template, substitutes parameters, then encodes -# as UTF-16LE + base64 (matching real PowerShell -EncodedCommand format). -_ENCODED_PS_TEMPLATES = [ - "Get-Service -Name {svc}", - "Get-EventLog -LogName {log} -Newest {n}", - "Test-NetConnection {host} -Port {port}", - "Get-Process -Name {proc}", - "Get-ChildItem -Path C:\\{dir} -Recurse | Measure-Object", - "Get-WmiObject Win32_LogicalDisk | Select-Object DeviceID, FreeSpace", - "Get-HotFix | Sort-Object InstalledOn -Descending | Select-Object -First {n}", -] - -_ENCODED_PS_PARAMS: dict[str, list[str]] = { - "svc": ["Spooler", "W32Time", "wuauserv", "BITS", "WinRM", "Dhcp", "Dnscache", "EventLog"], - "log": ["System", "Application", "Security", "Setup"], - "n": ["10", "25", "50", "100"], - "host": ["dc01", "fileserver", "10.0.0.1", "localhost", "gateway"], - "port": ["80", "443", "3389", "5985", "22"], - "proc": ["svchost", "explorer", "chrome", "outlook", "code", "winlogon"], - "dir": ["Logs", "Temp", "Reports", "Users\\Public"], -} - - -def _generate_encoded_command(rng: random.Random) -> str: +def _generate_encoded_command( + rng: random.Random, + *, + hostname: str = "", + username: str = "", +) -> str: """Generate a unique base64-encoded benign PowerShell command. - Picks a random template, substitutes parameters, then encodes as - UTF-16LE base64 — matching real Windows PowerShell -EncodedCommand format. + Uses data-driven host-biased templates and encodes as UTF-16LE base64, + matching real Windows PowerShell -EncodedCommand format. 
""" - template = rng.choice(_ENCODED_PS_TEMPLATES) - cmd = template - for key, values in _ENCODED_PS_PARAMS.items(): - placeholder = "{" + key + "}" - if placeholder in cmd: - cmd = cmd.replace(placeholder, rng.choice(values)) - return base64.b64encode(cmd.encode("utf-16-le")).decode("ascii") + return generate_encoded_powershell_command( + rng=rng, + hostname=hostname or "unknown", + username=username or "unknown", + ) def generate_unusual_powershell( @@ -603,7 +584,8 @@ def generate_unusual_powershell( suspicious_ps = [ rf'powershell.exe -WindowStyle Hidden -Command "Get-WinEvent -LogName Security -MaxEvents {rng.choice([50, 100, 200, 500])} | Export-Csv C:\Reports\{report}.csv"', - f"powershell.exe -EncodedCommand {_generate_encoded_command(rng)}", + "powershell.exe -EncodedCommand " + f"{_generate_encoded_command(rng, hostname=system.hostname, username=user.username)}", rf"powershell.exe -Exec Bypass -File C:\Scripts\{script}", rf'powershell.exe -NonInteractive -Command "Invoke-RestMethod -Uri https://{internal_api}{api_path}"', rf'powershell.exe -WindowStyle Hidden -Command "Compress-Archive -Path C:\{log_dir}\*.log -DestinationPath C:\Backups\{backup}.zip"', diff --git a/src/evidenceforge/generation/emitters/cisco_asa.py b/src/evidenceforge/generation/emitters/cisco_asa.py index f05b15b0..0233cd0f 100644 --- a/src/evidenceforge/generation/emitters/cisco_asa.py +++ b/src/evidenceforge/generation/emitters/cisco_asa.py @@ -522,6 +522,8 @@ def _emit_deny( """Emit a Deny record (106023).""" protocol = (net.protocol or "tcp").lower() acl_name = (fw.access_group if fw else "") or "outside_access_in" + deny_hash_a = getattr(fw, "deny_hash_a", "0x0") if fw else "0x0" + deny_hash_b = getattr(fw, "deny_hash_b", "0x0") if fw else "0x0" if protocol == "icmp": icmp_type = net.dst_port if net.dst_port else 8 @@ -530,13 +532,13 @@ def _emit_deny( f"Deny {protocol} src {src_iface}:{net.src_ip} " f"dst {dst_iface}:{net.dst_ip} " f"(type {icmp_type}, code {icmp_code}) " - f'by 
access-group "{acl_name}" [0x0, 0x0]' + f'by access-group "{acl_name}" [{deny_hash_a}, {deny_hash_b}]' ) else: message = ( f"Deny {protocol} src {src_iface}:{net.src_ip}/{net.src_port} " f"dst {dst_iface}:{net.dst_ip}/{net.dst_port} " - f'by access-group "{acl_name}" [0x0, 0x0]' + f'by access-group "{acl_name}" [{deny_hash_a}, {deny_hash_b}]' ) event_data = { diff --git a/src/evidenceforge/generation/engine/baseline.py b/src/evidenceforge/generation/engine/baseline.py index 14a3b784..762133f7 100644 --- a/src/evidenceforge/generation/engine/baseline.py +++ b/src/evidenceforge/generation/engine/baseline.py @@ -54,6 +54,13 @@ _windows_foreground_lifetime, ) from evidenceforge.generation.activity.helpers import _get_os_category +from evidenceforge.generation.activity.host_activity_profiles import ( + firewall_deny_hash_values, + pick_firewall_deny_offset, + resolve_host_activity_profile, + scale_count_range, + scale_interval_range, +) from evidenceforge.generation.activity.ids_signatures import ( load_ids_signatures, render_dns_query_template, @@ -525,6 +532,7 @@ def _windows_scheduled_task_offsets( current_hour: datetime, system: Any, rng: random.Random, + count_multiplier: float = 1.0, ) -> list[float]: """Return config-driven Windows scheduled/background task offsets for this hour.""" from evidenceforge.generation.activity.endpoint_noise import windows_scheduled_process_config @@ -532,6 +540,7 @@ def _windows_scheduled_task_offsets( cfg = windows_scheduled_process_config() count_min = max(0, int(cfg.get("count_min", 2))) count_max = max(count_min, int(cfg.get("count_max", 5))) + count_min, count_max = scale_count_range(count_min, count_max, count_multiplier) start = max(0, min(3599, int(cfg.get("trigger_window_start_seconds", 90)))) end = max(start + 1, min(3599, int(cfg.get("trigger_window_end_seconds", 3510)))) spacing = max(1, int(cfg.get("slot_spacing_seconds", 300))) @@ -726,6 +735,92 @@ def _resolve_traffic_rate(self, traffic_type: str) -> tuple[int, int]: 
rate = defaults[traffic_type] return (rate[0], rate[1]) + def _activity_roles_for_system(self, system: Any) -> list[str]: + """Return canonical roles for host activity profile resolution.""" + if hasattr(self, "world_model") and system.hostname in self.world_model.hosts: + roles = list(self.world_model.hosts[system.hostname].canonical_roles) + else: + roles = [r.lower() for r in (getattr(system, "roles", None) or [])] + host_type = (getattr(system, "type", None) or "workstation").lower() + if host_type == "domain_controller" and "domain_controller" not in roles: + roles.append("domain_controller") + return roles + + def _resolve_activity_profile(self, system: Any, persona: str | None = None) -> Any: + """Resolve and cache host activity profile multipliers.""" + cache = getattr(self, "_host_activity_profile_cache", None) + if cache is None: + cache = {} + self._host_activity_profile_cache = cache + key = (getattr(system, "hostname", ""), persona or "") + if key not in cache: + cache[key] = resolve_host_activity_profile( + scenario_name=getattr(self.scenario, "name", "scenario"), + system=system, + roles=self._activity_roles_for_system(system), + persona=persona, + ) + return cache[key] + + def _activity_multiplier( + self, + system: Any | None, + family: str, + persona: str | None = None, + ) -> float: + """Return host/persona multiplier for a broad activity family.""" + if system is None: + return 1.0 + return self._resolve_activity_profile(system, persona).multiplier(family) + + def _scaled_count_range( + self, + system: Any | None, + family: str, + lo: int, + hi: int, + *, + persona: str | None = None, + ) -> tuple[int, int]: + """Scale a count range for the host activity profile.""" + return scale_count_range(lo, hi, self._activity_multiplier(system, family, persona)) + + def _scaled_randint( + self, + rng: random.Random, + system: Any | None, + family: str, + lo: int, + hi: int, + *, + persona: str | None = None, + ) -> int: + """Draw from a count range after 
applying host activity profile scaling.""" + scaled_lo, scaled_hi = self._scaled_count_range(system, family, lo, hi, persona=persona) + return rng.randint(scaled_lo, scaled_hi) + + def _scaled_interval_range( + self, + system: Any | None, + family: str, + lo: int, + hi: int, + ) -> tuple[int, int]: + """Scale a seconds-between-events range for a host activity profile.""" + return scale_interval_range(lo, hi, self._activity_multiplier(system, family)) + + def _activity_system_for_user(self, user: User) -> Any | None: + """Return the primary host whose profile should shape user activity.""" + systems = self.scenario.environment.systems + if user.primary_system: + primary = next((s for s in systems if s.hostname == user.primary_system), None) + if primary is not None: + return primary + assigned = next((s for s in systems if s.assigned_user == user.username), None) + if assigned is not None: + return assigned + return systems[0] if systems else None + def _emit_dhcp_registry_side_effect( self, *, @@ -2234,8 +2329,25 @@ def _pick_public_scan_target( offset = rng.randint(1, cidr.num_addresses - 2) return str(cidr.network_address + offset) - # Estimate allow traffic: ~10-20 connections per internal system per hour - estimated_allows = len(internal_ips) * rng.randint(10, 20) + sensor_systems = [] + for candidate in self.scenario.environment.systems: + try: + candidate_ip = ipaddress.ip_address(candidate.ip) + except ValueError: + continue + if any( + seg_name in sensor.monitoring_segments and candidate_ip in cidr + for seg_name, cidr in segment_cidrs.items() + ): + sensor_systems.append(candidate) + sensor_systems = sensor_systems or self.scenario.environment.systems + avg_multiplier = sum( + self._activity_multiplier(system, "firewall_deny") for system in sensor_systems + ) / max(1, len(sensor_systems)) + + # Estimate allow traffic: ~10-20 connections per internal system per hour. 
+ allows_lo, allows_hi = scale_count_range(10, 20, avg_multiplier) + estimated_allows = len(internal_ips) * rng.randint(allows_lo, allows_hi) deny_count = int(estimated_allows * sensor.deny_ratio) if deny_count <= 0: continue @@ -2319,13 +2431,22 @@ def _resolve_iface(ip: str, _ifaces: dict = sensor_interfaces) -> str: # noqa: ): continue - offset_sec = rng.uniform(0, 3600) + offset_sec = pick_firewall_deny_offset( + rng=rng, + sensor_name=sensor.hostname or sensor.name, + current_hour_epoch=int(current_hour.timestamp()), + generated_index=generated, + multiplier=avg_multiplier, + ) + if offset_sec is None: + continue ts = current_hour + timedelta(seconds=offset_sec) self.state_manager.set_current_time(ts) src_iface = _resolve_iface(src_ip) dst_iface = _resolve_iface(dst_ip) acl_name = f"{src_iface}_access_in" + deny_hash_a, deny_hash_b = firewall_deny_hash_values(rng) fw_ctx = FirewallContext( action="deny", @@ -2334,6 +2455,8 @@ def _resolve_iface(ip: str, _ifaces: dict = sensor_interfaces) -> str: # noqa: src_interface=src_iface, dst_interface=dst_iface, access_group=acl_name, + deny_hash_a=deny_hash_a, + deny_hash_b=deny_hash_b, ) self.activity_generator.generate_connection( @@ -2543,6 +2666,13 @@ def _calculate_events_for_hour( """Calculate number of events for user this hour.""" lo, hi = self._resolve_traffic_rate("user_activity") base_events = lo if lo == hi else _get_rng().randint(lo, hi) + activity_system = self._activity_system_for_user(user) + base_events = int( + round( + base_events + * self._activity_multiplier(activity_system, "user_activity", user.persona) + ) + ) if persona and persona.risk_profile: risk_mult = {"low": 0.7, "medium": 1.0, "high": 1.3} @@ -3365,7 +3495,10 @@ def _burst_offset() -> float: if role_conns: weights = [c.get("weight", 1) for c in role_conns] # Scale connection count by time-of-day (fewer at night) - base_count = rng.randint(8, 20) if is_business else rng.randint(2, 6) + if is_business: + base_count = 
self._scaled_randint(rng, system, "role_network", 8, 20) + else: + base_count = self._scaled_randint(rng, system, "role_network", 2, 6) for _ in range(base_count): conn = rng.choices(role_conns, weights=weights, k=1)[0] @@ -3492,7 +3625,10 @@ def _fw_is_on_path(fw_sensor, src_ip: str, dst_ip: str) -> bool: from evidenceforge.events.contexts import FirewallContext as _InboundFwCtx inbound_weights = [c.get("weight", 1) for c in inbound_conns] - num_inbound = rng.randint(4, 15) if is_business else rng.randint(1, 4) + if is_business: + num_inbound = self._scaled_randint(rng, system, "inbound_network", 4, 15) + else: + num_inbound = self._scaled_randint(rng, system, "inbound_network", 1, 4) for _ in range(num_inbound): conn = rng.choices(inbound_conns, weights=inbound_weights, k=1)[0] is_external_src = conn["role"] == "_external" @@ -3566,6 +3702,7 @@ def _fw_is_on_path(fw_sensor, src_ip: str, dst_ip: str) -> bool: dst_hostname = self.world_model.fqdn_for_system(system) if fw_denied and denying_sensor: + deny_hash_a, deny_hash_b = firewall_deny_hash_values(rng) # Emit as a deny record from the actual in-path firewall deny_state = "REJ" if denying_sensor.drop_mode == "reject" else "S0" self.activity_generator.generate_connection( @@ -3583,6 +3720,8 @@ def _fw_is_on_path(fw_sensor, src_ip: str, dst_ip: str) -> bool: src_interface=_fw_iface_for(src_ip, denying_sensor), dst_interface=_fw_iface_for(system.ip, denying_sensor), access_group=f"{_fw_iface_for(src_ip, denying_sensor)}_access_in", + deny_hash_a=deny_hash_a, + deny_hash_b=deny_hash_b, ), emit_dns=False, ) @@ -3655,6 +3794,13 @@ def _fw_is_on_path(fw_sensor, src_ip: str, dst_ip: str) -> bool: p_weights = [c.get("weight", 1) for c in persona_conns] # Fewer persona connections than role connections; scaled by intensity _pc_lo, _pc_hi = self._resolve_traffic_rate("persona_connections") + _pc_lo, _pc_hi = self._scaled_count_range( + system, + "persona_connections", + _pc_lo, + _pc_hi, + persona=persona, + ) num_persona 
= rng.randint(_pc_lo, _pc_hi) if is_business else 0 # Clamp timestamps to session lifetime within this hour session_start_sec = max(0.0, (session.start_time - current_hour).total_seconds()) @@ -3948,6 +4094,9 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 # DNS lookups: truly periodic with small jitter, using global schedule if "dns-client" in services: _dns_lo, _dns_hi = self._resolve_traffic_rate("dns_interval") + _dns_lo, _dns_hi = self._scaled_interval_range( + system, "dns_interval", _dns_lo, _dns_hi + ) _dns_range = max(1, _dns_hi - _dns_lo) dns_interval = _dns_lo + (_stable_seed(f"dns_iv_{system.hostname}") % _dns_range) dns_phase = _stable_seed(f"dns_ph_{system.hostname}") % dns_interval @@ -4064,6 +4213,9 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 smb_targets, fs_targets = self._build_smb_targets(system, dc_ips) if smb_targets: _smb_lo, _smb_hi = self._resolve_traffic_rate("smb_interval") + _smb_lo, _smb_hi = self._scaled_interval_range( + system, "smb_interval", _smb_lo, _smb_hi + ) _smb_range = max(1, _smb_hi - _smb_lo) smb_interval = _smb_lo + ( _stable_seed(f"smb_iv_{system.hostname}") % _smb_range @@ -4143,6 +4295,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 # Kerberos if "kerberos-client" in services and os_cat == "windows" and dc_targets: _krb_lo, _krb_hi = self._resolve_traffic_rate("kerberos") + _krb_lo, _krb_hi = self._scaled_count_range(system, "kerberos", _krb_lo, _krb_hi) num_krb = rng.randint(_krb_lo, _krb_hi) base_interval = 3600 / (num_krb + 1) for i in range(num_krb): @@ -4168,6 +4321,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 # LDAP if "ldap-client" in services and os_cat == "windows" and dc_targets: _ldap_lo, _ldap_hi = self._resolve_traffic_rate("ldap") + _ldap_lo, _ldap_hi = self._scaled_count_range(system, "ldap", _ldap_lo, _ldap_hi) num_ldap = rng.randint(_ldap_lo, _ldap_hi) base_interval = 3600 / (num_ldap + 1) for i 
in range(num_ldap): @@ -4210,7 +4364,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 ) sys_type_str = (system.type or "workstation").lower() - num_svc = rng.randint(3, 8) + num_svc = self._scaled_randint(rng, system, "windows_service_process", 3, 8) for _si in range(num_svc): svc_offset = rng.uniform(0, 3599) svc_ts = current_hour + timedelta(seconds=svc_offset) @@ -4247,7 +4401,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 _REG_KEYS_HKCU = get_registry_keys_hkcu() _REG_KEYS_HKLM = get_registry_keys_hklm() - _reg_count = rng.randint(18, 42) + _reg_count = self._scaled_randint(rng, system, "windows_registry", 18, 42) _svc_pid = sys_pids.get("svchost_netsvcs", sys_pids.get("services", 4)) _host_ctx = self.activity_generator._build_host_context(system) _registry_cfg = registry_noise_config() @@ -4388,7 +4542,15 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 pick_scheduled_task, ) - for offset in _windows_scheduled_task_offsets(current_hour, system, rng): + for offset in _windows_scheduled_task_offsets( + current_hour, + system, + rng, + count_multiplier=self._activity_multiplier( + system, + "windows_scheduled_task", + ), + ): ts = current_hour + timedelta(seconds=offset) self.state_manager.set_current_time(ts) task_image, task_cmd, task_parent_key = pick_scheduled_task(rng) @@ -4474,7 +4636,8 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 noise_cfg = load_create_remote_thread_noise_config() probability = float(noise_cfg.get("probability_per_host_hour", 0.08)) max_events = int(noise_cfg.get("max_events_per_hour", 1)) - if valid_crt and max_events > 0 and rng.random() < probability: + probability *= self._activity_multiplier(system, "windows_remote_thread") + if valid_crt and max_events > 0 and rng.random() < min(0.95, probability): num_crt = rng.randint(1, max_events) for _ in range(num_crt): pattern = pick_create_remote_thread_pattern(valid_crt, rng) @@ -4507,7 
+4670,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 if p.get("source_pid_key") in sys_pids and p.get("target_pid_key") in sys_pids ] if valid_pa: - num_pa = rng.randint(3, 8) + num_pa = self._scaled_randint(rng, system, "windows_process_access", 3, 8) for _ in range(num_pa): pattern = rng.choice(valid_pa) src_key = pattern["source_pid_key"] @@ -4546,7 +4709,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 running = self.state_manager.get_processes_on_system(system.hostname) if running: generic_dll_pool = get_dll_pool() - num_dll = rng.randint(20, 45) + num_dll = self._scaled_randint(rng, system, "windows_module_load", 20, 45) for _ in range(num_dll): offset = rng.uniform(0, 3599) ts = current_hour + timedelta(seconds=offset) @@ -4607,7 +4770,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 pick_bash_command_entry, ) - num_ssh = rng.randint(1, 3) + num_ssh = self._scaled_randint(rng, system, "linux_remote_admin", 1, 3) for _ in range(num_ssh): ssh_user = rng.choice(roster) offset = rng.uniform(0, 3599) @@ -4624,11 +4787,32 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 persona_lower = (ssh_user.persona or "").lower() if persona_lower == "sysadmin": - n_cmds = rng.randint(3, 8) + n_cmds = self._scaled_randint( + rng, + system, + "linux_shell", + 3, + 8, + persona=ssh_user.persona, + ) elif persona_lower == "developer": - n_cmds = rng.randint(2, 6) + n_cmds = self._scaled_randint( + rng, + system, + "linux_shell", + 2, + 6, + persona=ssh_user.persona, + ) else: - n_cmds = rng.randint(1, 4) + n_cmds = self._scaled_randint( + rng, + system, + "linux_shell", + 1, + 4, + persona=ssh_user.persona, + ) hour_end = current_hour + timedelta(hours=1) cumulative_gap = 0 _SLOW_CMD_KEYWORDS = frozenset( @@ -4701,7 +4885,14 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 pick_bash_command_entry, ) - n_cmds = rng.randint(1, 4) + n_cmds = 
self._scaled_randint( + rng, + system, + "linux_shell", + 1, + 4, + persona=ws_user.persona, + ) ts0 = current_hour + timedelta(seconds=rng.uniform(0, 3599)) hour_end = current_hour + timedelta(hours=1) cumulative = 0 @@ -4735,8 +4926,9 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 if os_cat_rdp != "windows" or sys_type_rdp not in ("server", "domain_controller"): continue - # 1-3 RDP admin sessions per hour to servers, ~60% probability - if rng.random() > 0.60: + # 1-3 RDP admin sessions per hour to servers, shaped by host role/profile. + rdp_multiplier = self._activity_multiplier(system, "windows_remote_admin") + if rng.random() > min(0.95, 0.60 * rdp_multiplier): continue if not any( @@ -4745,7 +4937,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 ): continue - num_rdp = rng.randint(1, 3) + num_rdp = self._scaled_randint(rng, system, "windows_remote_admin", 1, 3) roster = self._get_server_ssh_users(system) if not roster: continue @@ -4773,7 +4965,10 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 continue sys_type_svc = (system.type or "workstation").lower() - num_svc = rng.randint(2, 5) if sys_type_svc != "workstation" else rng.randint(1, 2) + if sys_type_svc != "workstation": + num_svc = self._scaled_randint(rng, system, "windows_service_logon", 2, 5) + else: + num_svc = self._scaled_randint(rng, system, "windows_service_logon", 1, 2) for _ in range(num_svc): offset = rng.uniform(0, 3599) ts = current_hour + timedelta(seconds=offset) @@ -4786,7 +4981,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 ) if sys_type_svc in ("server", "domain_controller"): - num_anon = rng.randint(1, 3) + num_anon = self._scaled_randint(rng, system, "windows_service_logon", 1, 3) for _ in range(num_anon): offset = rng.uniform(0, 3599) ts = current_hour + timedelta(seconds=offset) @@ -4807,7 +5002,7 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 if os_cat 
!= "windows" or system.ip in dc_ips: continue - num_auth = rng.randint(2, 6) + num_auth = self._scaled_randint(rng, system, "windows_machine_auth", 2, 6) base_interval = 3600 / (num_auth + 1) for i in range(num_auth): offset = base_interval * (i + 1) + rng.gauss(0, base_interval * 0.1) @@ -4832,8 +5027,12 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 if _get_os_category(s.os) == "windows" and s.ip not in dc_ips ] for _dc_idx, dc_hostname in enumerate(dc_hostnames): + dc_system = next( + (s for s in self.scenario.environment.systems if s.hostname == dc_hostname), + None, + ) for client in windows_clients: - num_cycles = rng.randint(3, 8) + num_cycles = self._scaled_randint(rng, dc_system, "dc_kerberos", 3, 8) base_interval = 3600 / (num_cycles + 1) for i in range(num_cycles): offset = base_interval * (i + 1) + rng.gauss(0, base_interval * 0.15) @@ -4848,7 +5047,16 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 dc_hostname=dc_hostname, time=ts, ) - num_tgs = 0 if rng.random() < 0.22 else rng.randint(1, 5) + if rng.random() < 0.22: + num_tgs = 0 + else: + num_tgs = self._scaled_randint( + rng, + dc_system, + "dc_kerberos", + 1, + 5, + ) member_servers = [ s.hostname for s in self.scenario.environment.systems @@ -4935,7 +5143,10 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 or "web" in system.hostname.lower() ) has_ntp_client = "ntp-client" in self._system_service_defaults.get(system.hostname, []) - num_events = rng.randint(100, 300) if is_dmz else rng.randint(50, 120) + if is_dmz: + num_events = self._scaled_randint(rng, system, "linux_syslog", 100, 300) + else: + num_events = self._scaled_randint(rng, system, "linux_syslog", 50, 120) scenario_start = self.scenario.time_window.start boot_uptime = self._kernel_boot_uptimes.get(system.hostname, 500000.0) @@ -5332,7 +5543,11 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 # ICMP ping between systems on same subnet 
systems = self.scenario.environment.systems if len(systems) >= 2: - num_pings = rng.randint(1, 3) + avg_multiplier = sum( + self._activity_multiplier(system, "icmp_monitoring") for system in systems + ) / len(systems) + ping_lo, ping_hi = scale_count_range(1, 3, avg_multiplier) + num_pings = rng.randint(ping_lo, ping_hi) base_interval = 3600 / (num_pings + 1) for i in range(num_pings): src_sys = rng.choice(systems) @@ -5388,7 +5603,11 @@ def _svc_pid(*keys: str, _pids: dict = sys_pids) -> int: # noqa: B006 monitored_systems.extend(segment_systems.get(seg_name, [])) if not monitored_systems: continue - num_alerts = rng.randint(5, 15) + avg_multiplier = sum( + self._activity_multiplier(system, "ids_alert") for system in monitored_systems + ) / len(monitored_systems) + alerts_lo, alerts_hi = scale_count_range(5, 15, avg_multiplier) + num_alerts = rng.randint(alerts_lo, alerts_hi) # For IDS sensors (typically perimeter), generate alerts with # external source IPs targeting monitored systems. 
_EXTERNAL_SCAN_IPS = getattr( @@ -5535,6 +5754,17 @@ def _emit_web_server_access( ) web_lo, web_hi = self._resolve_traffic_rate("web") + scale_method = getattr(self, "_scaled_count_range", None) + if callable(scale_method): + scaled_range: tuple[int, int] | None = None + try: + candidate = scale_method(sys_obj, "web", web_lo, web_hi) + except (AttributeError, TypeError, ValueError): + candidate = None + if isinstance(candidate, (tuple, list)) and len(candidate) == 2: + scaled_range = (int(candidate[0]), int(candidate[1])) + if scaled_range is not None: + web_lo, web_hi = scaled_range top_level_budget = rng.randint(web_lo, web_hi) if top_level_budget <= 0: return diff --git a/tests/unit/test_baseline_canonical.py b/tests/unit/test_baseline_canonical.py index 2cdbe72e..146f0c0e 100644 --- a/tests/unit/test_baseline_canonical.py +++ b/tests/unit/test_baseline_canonical.py @@ -875,7 +875,9 @@ def test_registry_noise_prefers_dynamic_pools_and_filters_repeated_tells(self): from evidenceforge.generation.engine.baseline import BaselineMixin source = inspect.getsource(BaselineMixin) - assert "_reg_count = rng.randint(18, 42)" in source + assert ( + '_reg_count = self._scaled_randint(rng, system, "windows_registry", 18, 42)' in source + ) assert "Office\\\\16.0\\\\Word\\\\Reading Locations\\\\Document 1" in source assert "Windows NT\\\\CurrentVersion\\\\Winlogon" in source assert "Services\\\\EventLog\\\\Application" in source diff --git a/tests/unit/test_cisco_asa_emitter.py b/tests/unit/test_cisco_asa_emitter.py index 6af47a95..43cac3d0 100644 --- a/tests/unit/test_cisco_asa_emitter.py +++ b/tests/unit/test_cisco_asa_emitter.py @@ -467,6 +467,8 @@ def test_deny_produces_single_record(self, asa_emitter, tmp_path): src_interface="outside", dst_interface="inside", access_group="outside_access_in", + deny_hash_a="0x2a1b", + deny_hash_b="0x031f", ), ) asa_emitter.emit(event) @@ -479,6 +481,7 @@ def test_deny_produces_single_record(self, asa_emitter, tmp_path): assert "Deny tcp 
src outside:198.51.100.1/54321" in lines[0] assert "dst inside:10.0.10.50/445" in lines[0] assert 'by access-group "outside_access_in"' in lines[0] + assert "[0x2a1b, 0x031f]" in lines[0] def test_icmp_deny_includes_type_code(self, asa_emitter, tmp_path): """ICMP deny should include (type N, code N) in the message.""" diff --git a/tests/unit/test_host_activity_profiles.py b/tests/unit/test_host_activity_profiles.py new file mode 100644 index 00000000..1ab7fe9f --- /dev/null +++ b/tests/unit/test_host_activity_profiles.py @@ -0,0 +1,141 @@ +# Copyright (c) 2026 Cisco Systems, Inc. and its affiliates +# SPDX-License-Identifier: MIT + +"""Tests for host/persona/role activity profile configuration.""" + +import base64 +import random +from types import SimpleNamespace + +import pytest + +from evidenceforge.generation.activity.host_activity_profiles import ( + RATE_FAMILIES, + firewall_deny_hash_values, + generate_encoded_powershell_command, + load_host_activity_profiles, + reset_cache, + resolve_host_activity_profile, + scale_count_range, + scale_interval_range, +) +from evidenceforge.generation.engine.baseline import BaselineMixin + + +@pytest.fixture(autouse=True) +def _reset_host_activity_profiles_cache(): + reset_cache() + yield + reset_cache() + + +def _system( + hostname: str, + system_type: str, + roles: list[str] | None = None, +) -> SimpleNamespace: + return SimpleNamespace(hostname=hostname, type=system_type, roles=roles or []) + + +def test_host_activity_profiles_cover_core_families(): + data = load_host_activity_profiles() + + assert {"workstation", "server", "domain_controller"} <= set(data["host_types"]) + assert set(data["rate_families"]["bounds"]) <= RATE_FAMILIES + assert set(data["host_types"]["domain_controller"]["families"]) <= RATE_FAMILIES + + +def test_resolved_profiles_shape_infrastructure_hosts_differently(): + workstation = resolve_host_activity_profile( + scenario_name="profile-test", + system=_system("wkstn01", "workstation"), + ) + server = 
resolve_host_activity_profile( + scenario_name="profile-test", + system=_system("files01", "server", ["file_server"]), + ) + dc = resolve_host_activity_profile( + scenario_name="profile-test", + system=_system("dc01", "domain_controller", ["domain_controller"]), + ) + + assert dc.multiplier("dc_kerberos") > workstation.multiplier("dc_kerberos") + assert dc.multiplier("windows_machine_auth") > workstation.multiplier("windows_machine_auth") + assert server.multiplier("inbound_network") > workstation.multiplier("inbound_network") + + +def test_count_and_interval_scaling_preserve_sensible_bounds(): + assert scale_count_range(2, 6, 2.0) == (4, 12) + assert scale_count_range(0, 3, 0.25) == (0, 1) + assert scale_interval_range(300, 900, 2.0) == (150, 450) + assert scale_interval_range(300, 900, 0.5) == (600, 1800) + + +def test_host_activity_profiles_overlay_merges(tmp_path, monkeypatch): + overlay_dir = tmp_path / ".eforge" / "config" / "activity" + overlay_dir.mkdir(parents=True) + (overlay_dir / "host_activity_profiles.yaml").write_text( + """ +role_profiles: + web_server: + families: + firewall_deny: 2.0 +firewall_deny: + metadata_hash_nonzero_probability: 1.0 +""", + encoding="utf-8", + ) + + monkeypatch.chdir(tmp_path) + reset_cache() + + data = load_host_activity_profiles() + assert data["host_types"]["workstation"] + assert data["role_profiles"]["web_server"]["families"]["firewall_deny"] == 2.0 + assert firewall_deny_hash_values(random.Random(4)) != ("0x0", "0x0") + + +def test_encoded_powershell_variants_are_data_driven_and_decodable(): + encoded = generate_encoded_powershell_command( + rng=random.Random(7), + hostname="wkstn01", + username="alice", + ) + + decoded = base64.b64decode(encoded).decode("utf-16-le") + assert "{" not in decoded + assert any( + decoded.startswith(prefix) + for prefix in ( + "Get-Service", + "Get-EventLog", + "Test-NetConnection", + "Get-Process", + "Get-ChildItem", + "Get-WmiObject", + "Get-HotFix", + "Get-CimInstance", + 
"Get-ScheduledTask", + ) + ) + + +def test_baseline_mixin_resolves_primary_host_activity_profile(): + class Harness(BaselineMixin): + pass + + workstation = _system("wkstn01", "workstation") + server = _system("files01", "server", ["file_server"]) + harness = Harness() + harness.scenario = SimpleNamespace( + name="baseline-profile-test", + environment=SimpleNamespace(systems=[workstation, server]), + ) + + user = SimpleNamespace(username="alice", primary_system="wkstn01", persona="developer") + + assert harness._activity_system_for_user(user) is workstation + assert harness._activity_multiplier(server, "inbound_network") > harness._activity_multiplier( + workstation, + "inbound_network", + ) diff --git a/tests/unit/test_validate_config.py b/tests/unit/test_validate_config.py index 6728f400..fe7b0794 100644 --- a/tests/unit/test_validate_config.py +++ b/tests/unit/test_validate_config.py @@ -113,6 +113,34 @@ def load_invalid_observation_profiles(): for issue in result.issues ) + def test_validate_config_rejects_unknown_host_activity_family(self, monkeypatch): + from evidenceforge.generation.activity import host_activity_profiles + + real_loader = host_activity_profiles.load_host_activity_profiles + + def load_invalid_host_activity_profiles(): + data = real_loader() + host_types = dict(data["host_types"]) + workstation = dict(host_types["workstation"]) + workstation["families"] = {**workstation.get("families", {}), "zeek_magic": 1.5} + host_types["workstation"] = workstation + return {**data, "host_types": host_types} + + monkeypatch.setattr( + host_activity_profiles, + "load_host_activity_profiles", + load_invalid_host_activity_profiles, + ) + + result = validate_config() + + assert any( + issue.severity == "ERROR" + and issue.file == "host_activity_profiles.yaml" + and "unknown activity families" in issue.message + for issue in result.issues + ) + def test_validate_config_rejects_third_party_module_with_microsoft_identity(self, monkeypatch): from 
evidenceforge.generation.activity import application_catalog