From 599a40eef38606cceb101e006ba889cafca944a4 Mon Sep 17 00:00:00 2001 From: "David J. Bianco" Date: Fri, 15 May 2026 11:52:24 -0400 Subject: [PATCH] feat: add observation-aware eval manifest --- TODO.md | 8 +- commands/eforge/config.md | 2 +- commands/eforge/evaluate.md | 15 +- .../references/config-dependency-graph.md | 2 +- .../eforge/references/config-evaluation.md | 9 + .../eforge/references/config-host-activity.md | 5 + .../eforge/references/scenario-reference.md | 3 +- docs/design/data-quality-prd.md | 6 + docs/reference/CUSTOMIZING_CONFIG.md | 8 + docs/reference/scenario-reference.md | 3 +- scenarios/ITERATION-TEST-PROMPT.md | 24 +-- src/evidenceforge/cli/commands.py | 22 +- src/evidenceforge/config/activity/README.md | 2 +- src/evidenceforge/evaluation/context.py | 17 ++ .../evaluation/dimensions/__init__.py | 3 + src/evidenceforge/evaluation/engine.py | 23 +- src/evidenceforge/evaluation/models.py | 4 + .../evaluation/pillars/causality.py | 198 ++++++++++++++++-- .../evaluation/pillars/parseability.py | 2 + .../evaluation/pillars/plausibility.py | 2 + .../evaluation/pillars/timing.py | 2 + src/evidenceforge/evaluation/report.py | 7 + src/evidenceforge/evaluation/storyline.py | 2 + .../events/observation_manifest.py | 177 ++++++++++++++++ src/evidenceforge/generation/engine/core.py | 13 +- tests/unit/test_eval_cross_source.py | 101 +++++++++ tests/unit/test_observation_manifest.py | 94 +++++++++ 27 files changed, 707 insertions(+), 47 deletions(-) create mode 100644 src/evidenceforge/evaluation/context.py create mode 100644 src/evidenceforge/events/observation_manifest.py create mode 100644 tests/unit/test_observation_manifest.py diff --git a/TODO.md b/TODO.md index b97146cf..dc5796e6 100644 --- a/TODO.md +++ b/TODO.md @@ -2,7 +2,7 @@ **Status:** Phase 8.5 (Dual src/dst HostContext) COMPLETE; Pre-MVP quality fixes ongoing **Started:** 2026-03-11 -**Last Updated:** 2026-05-14 +**Last Updated:** 2026-05-15 See [CHANGELOG.md](CHANGELOG.md) for detailed development history of completed phases. @@ -243,6 +243,8 @@ Replaced manual per-emitter field coordination with SecurityEvent intermediate r - [x] **P1** Source identity and endpoint baseline realism sprint — completed TLS/X.509 issuer-compatible chain signatures, Sysmon Event 7 native third-party module identity, config-driven Windows scheduled-process timing, and DHCP registry emission policy tied to lease activity. Verified with `uv run eforge validate-config`, focused regressions, Ruff, normal pytest, and slow-inclusive pytest. - [x] **P2** Endpoint/eCAR baseline variance follow-up — addressed through the host/activity profile realism layer. Host family, role, persona, and stable per-host multipliers now shape endpoint, process, registry, scheduled-task, syslog, bash, eCAR, Windows, Zeek, firewall, IDS, web, and proxy rates; config-driven encoded PowerShell variants and benign endpoint texture reduce repeated per-host artifacts. Verification passed with focused host-activity/config/ASA/baseline tests, `uv run eforge validate-config`, Ruff checks/format checks, full normal `uv run pytest -v`, and slow-inclusive `uv run pytest -v --include-slow --no-cov` (`3057 passed, 1 skipped`). - [x] **Later architectural sprint: imperfect observation and source coverage** — implemented a training-friendly `complete` default plus overlay-compatible named observation profiles that apply deterministic source-level drop/delay/coverage semantics without modeling contradictions. 
The policy covers endpoint, network, proxy/web, firewall, IDS, Windows, Sysmon, Zeek, syslog, bash history, and eCAR source families, while ground truth preserves canonical truth and records source evidence status. Verification passed: focused observation/config/ground-truth tests, `uv run eforge validate-config`, Ruff checks/format checks, full normal `uv run pytest -v` (`3036 passed, 15 skipped`), and slow-inclusive `uv run pytest -v --include-slow` (`3050 passed, 1 skipped`). +- [x] Observation-aware automated eval and manifest — generation now writes `OBSERVATION_MANIFEST.json` beside ground truth, `eforge eval` loads it when present, coverage-style causality metrics report raw and observation-adjusted scores for expected non-visible evidence, and correctness/contradiction checks remain strict. Verification passed with config validation, Ruff checks/format checks, focused eval/manifest tests, and full normal `uv run pytest -v` (`3047 passed, 15 skipped`). +- [x] Post-host-activity score check — synced `dev`, cleaned up stale TODOs, regenerated/evaluated `scenarios/iteration-test` from the current iteration-test prompt with `enterprise_standard` observation, and ran one blind expert-panel review without entering another fix loop. Automated eval passed at `92.39` over `108,858` records; blind synthetic-confidence averaged `82.75`. Highest-leverage follow-ups are Linux SSH/syslog lifecycle ordering, Zeek observation-tree consistency, X.509 metadata coherence, Windows OS-build/local-SID identity, and static web asset manifests. - [x] Full slow-suite regression cleanup after loop-65 merge — explicit-proxy storyline beacons now preserve authored hostname+destination IP pairs only when the storyline marks that pair as intentional, normal proxy-origin DNS resolution remains intact, and the parallel-generation LogonID assertion treats Type 7 unlock reuse as valid slice-of-time Windows behavior. Verified with targeted proxy/parallel tests, `uv run ruff check .`, `uv run ruff format --check .`, and `uv run pytest -v --include-slow` (`2875 passed, 23 skipped`). Detection Engineer blind review completed for the regenerated Loop 61 dataset at `scenarios/iteration-test/data`; reviewer verdict: Synthetic, 63/100 confidence. Main findings: one PROXY-01 sshd accepted-login lifecycle gap/self-source artifact and Windows 4648 explicit-credential caller PID/image provenance ambiguity around `WS-MCHEN-01`. @@ -279,7 +281,7 @@ Verification is complete: dedicated `tests/unit/test_world_model.py` coverage wa - [x] **SUPERSEDED** Canonical emitter field provenance blind-review remaining findings from 78% synthetic review — superseded by later full-path storyline normalization, bash typo/path cleanup, proxy domain-class path/content profiles, and Sysmon follow-on ordering fixes. The still-current related work is now represented by web/session realism, imperfect observation/source coverage, and process lifecycle modeling TODOs. -- [ ] Source-specific process lifecycle completeness modeling — deferred design item. Add a configurable telemetry coverage/profile layer that can model realistic Security/Sysmon/eCAR missingness, ingestion delay, audit-policy gaps, and endpoint coverage variance without ad hoc omissions in individual emitters. This should be part of the broader cross-source distribution realism layer, not a Windows-only workaround. 
+- [x] **SUPERSEDED** Source-specific process lifecycle completeness modeling — the broad requirement is now covered by named observation profiles plus the host/activity profile layer. Observation profiles model deterministic source-family missingness/delay/coverage semantics for Security/Sysmon/eCAR and other sources, while host activity profiles add endpoint/source volume variance; the remaining narrower deployment-topology gap is tracked as configurable per-host/source log deployment coverage. - [x] Open PR consolidation into `dev` — re-applied the storyline typing-cadence monotonicity fix from PR #81, folded Dependabot pytest/Pygments updates into the dev workflow, and added Dependabot configuration so future dependency PRs target `dev`. @@ -601,7 +603,7 @@ Data works but experienced analysts spot tells. Grouped by format for efficient - [x] **P2** Per-host-type event rate multiplier — implemented as implicit host/activity profile defaults rather than scenario YAML fields. Domain controllers, file servers, web servers, proxies, Linux servers, and workstations now receive role/family/persona-specific multipliers across baseline activity, auth, endpoint, network, and source-specific noise. - [x] Configurable per-entity artifact variation — implemented in the host/activity profile layer for baseline artifact texture, including stable per-host encoded PowerShell variants and profile-owned endpoint activity scaling. - [x] Configurable per-host volume variance — implemented via stable host/persona/role multipliers applied across major activity families so hosts no longer share narrow uniform volume bands by construction. -- [ ] Configurable per-host/source log deployment coverage — observation profiles now support source-family gaps and host-scoped missingness multipliers, but explicit per-host source enablement/disablement remains future work. A later setting should model named host groups, disabled sensors, partial deployments, and collection windows when users need topology-level telemetry coverage differences rather than event-level missingness. +- [ ] Configurable per-host/source log deployment coverage — observation profiles now support source-family gaps and host-scoped missingness multipliers, but explicit per-host source enablement/disablement remains future work. A later setting should model named host groups, disabled sensors, partial deployments, and collection windows when users need topology-level telemetry coverage differences rather than event-level missingness or host/activity volume variance. - [ ] **P2** Generation speed and efficiency follow-up — Sprint 4 host/activity realism is functionally verified, but the slow-inclusive suite exposed that `pytest-cov` plus `tracemalloc` can make the medium dataset memory test pathological. A future sprint should profile generation without instrumentation noise, identify hot paths introduced by richer host activity/web fanout/firewall texture, and decide whether to optimize generation, mark the memory test `--no-cov`, or relax/update stale performance assertions. - [x] DNS IP pool reuse causes cross-provider resolution (CloudFront→Microsoft IPs, etc.) — domain-first selection ensures consistent domain→IP mapping via FORWARD_DNS - [x] AWS region mismatch between DNS PTR and SSL SNI for same IP — AWS hostname/PTR generation now derives a stable per-IP region/edge identity and PTR generation respects known forward hostname context. 
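For reviewers who want to poke at the new sidecar directly, here is a minimal sketch of loading a generated manifest and listing the storyline steps that eval will treat as expected gaps. The dataset path is an assumption for illustration; the imports and properties are the APIs this patch adds.

```python
from pathlib import Path

from evidenceforge.events.observation_manifest import load_observation_manifest

# The dataset path is illustrative; the loader checks both
# <output_dir>/OBSERVATION_MANIFEST.json and the parent scenario directory.
manifest = load_observation_manifest(Path("scenarios/iteration-test/data"))
if manifest is not None and manifest.observation_profile != "complete":
    for event in manifest.storyline_events:
        # Eval treats a step as an expected gap only when no source row is
        # visible or delayed and at least one is dropped/filtered/out_of_window,
        # matching the _event_observation_exempt check added below.
        if event.visible_or_delayed_count == 0 and event.non_visible_count > 0:
            print(f"{event.storyline_id}: expected non-visible on {event.system}")
```
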
diff --git a/commands/eforge/config.md b/commands/eforge/config.md
index 17a026e3..59ecdb12 100644
--- a/commands/eforge/config.md
+++ b/commands/eforge/config.md
@@ -71,7 +71,7 @@ When writing to the overlay, files are partial — they contain ONLY the user's
 | Modify baseline auth noise | `auth_noise.yaml` | (standalone — stale scheduled-credential accounts and irregular recurrence timing) |
 | Modify endpoint background noise | `endpoint_noise.yaml` | (standalone — scheduled-process timing and DHCP registry emission policy) |
 | Modify host activity distribution | `host_activity_profiles.yaml` | (standalone — host/persona/role rate-family multipliers, firewall deny bursts, and artifact variants) |
-| Modify source observation coverage | `observation_profiles.yaml` | Scenario `observation_profile` selects the named profile; keep `complete` as the default training profile |
+| Modify source observation coverage | `observation_profiles.yaml` | Scenario `observation_profile` selects the named profile; generated `OBSERVATION_MANIFEST.json` lets eval account for expected gaps; keep `complete` as the default training profile |
 | Modify causal/source timing | `timing_profiles.yaml` | (standalone — causal prerequisite, source latency, teardown, and Windows/Sysmon collision-spacing knobs) |
 | ~~Format definitions~~ | Not user-customizable | Engine internals — requires code changes |
 | ~~Evaluation rules~~ | Not user-customizable | Must match format definitions — requires code changes |
diff --git a/commands/eforge/evaluate.md b/commands/eforge/evaluate.md
index e9c5ed26..7a2c7765 100644
--- a/commands/eforge/evaluate.md
+++ b/commands/eforge/evaluate.md
@@ -36,6 +36,7 @@ scenarios/<name>/
   scenario.yaml
   ENVIRONMENT.md
   GROUND_TRUTH.md
+  OBSERVATION_MANIFEST.json   ← optional, generated for source-observation-aware eval
   data/                       ← this is the output_dir for eforge eval
 ```
@@ -65,6 +66,12 @@ Present a clear summary of the evaluation results. The report shows two tiers fo
 - **Minimum** (hard gate): must pass or the dataset fails overall
 - **Aspirational** (informational): a stretch target; failure here is noted but does not fail the dataset
 
+If the scenario uses an `observation_profile` other than `complete`, check whether the report says
+the observation manifest was loaded. With a manifest, coverage-style causality sub-scores may be
+adjusted for expected source gaps and will show a `raw` score whenever the adjustment changes the result.
+Do not describe this as a lowered threshold: visible contradictions, parseability failures,
+source-native field mismatches, and missing evidence that the manifest marks `visible` or `delayed`
+remain real failures.
+
 For each pillar, explain what the score means in practical terms:
 
 **Pillar 1: Parseability (weight 0.30)**
@@ -81,11 +88,11 @@ For each pillar, explain what the score means in practical terms:
 
 **Pillar 3: Causality (weight 0.25)**
 - Causal Ordering: Are logon→process→logoff sequences correctly ordered? DNS before TCP? Kerberos TGT/TGS before domain logons?
-- Storyline Event Presence: Are all storyline events visible in at least one log source?
+- Storyline Event Presence: Are all expected-visible storyline events visible in at least one log source? For non-`complete` observation profiles with a manifest, source rows marked `dropped`, `filtered`, or `out_of_window` are excluded from this coverage denominator.
 - Indicator Accuracy: Do traces carry the correct IPs, usernames, hostnames from the scenario?
-- Pivot Linkability: Can a hunter pivot between consecutive attack steps using shared field values?
-- Storyline Temporal Integrity: Are attack events in the right relative order at the right times? -- Storyline Trace Coverage: For each expected log format on each involved host, does the storyline leave a trace? +- Pivot Linkability: Can a hunter pivot between consecutive expected-visible attack steps using shared field values? +- Storyline Temporal Integrity: Are expected-visible attack events in the right relative order at the right times? +- Storyline Trace Coverage: For each expected-visible log format group on each involved host, does the storyline leave a trace? **Pillar 4: Timing (weight 0.20)** - Attack-Chain Timing: Do elapsed times between consecutive storyline steps fall within plausible bounds? Bounds come from `timing_bounds.yaml` — default 5s–2h, with per-action-type overrides (e.g., lateral movement: 30s–1h, exfiltration: 60s–24h). First matching keyword in the step activity wins. diff --git a/commands/eforge/references/config-dependency-graph.md b/commands/eforge/references/config-dependency-graph.md index c3ee6dd8..38010c95 100644 --- a/commands/eforge/references/config-dependency-graph.md +++ b/commands/eforge/references/config-dependency-graph.md @@ -170,7 +170,7 @@ Each row is a file; columns show what it depends on and what depends on it. | Direction | File | Relationship | |-----------|------|-------------| | depends on | scenario `observation_profile` | The scenario selects a named profile; the profile file owns source-level missingness/delay values | -| **depended on by** | Event dispatcher, GROUND_TRUTH.md | Applies deterministic source-observation drops/delays after canonical state updates and reports source evidence status | +| **depended on by** | Event dispatcher, GROUND_TRUTH.md, OBSERVATION_MANIFEST.json, `eforge eval` | Applies deterministic source-observation drops/delays after canonical state updates, reports source evidence status, and lets eval distinguish expected gaps from missing visible evidence | | validated by | `eforge validate-config` and `eforge validate` | Config validation checks source-family names/ranges; scenario validation checks that the named profile exists | ### network_params.yaml diff --git a/commands/eforge/references/config-evaluation.md b/commands/eforge/references/config-evaluation.md index d84a09fc..5e0d3e68 100644 --- a/commands/eforge/references/config-evaluation.md +++ b/commands/eforge/references/config-evaluation.md @@ -21,6 +21,15 @@ Schema documentation for data quality evaluation rule files in `src/evidenceforg Controls the two-tier acceptance model for `eforge eval`. Each sub-score has a **minimum** (hard gate: dataset fails if below) and an **aspirational** target (informational stretch goal). Pillar weights must sum to 1.0. +When a generated dataset includes `OBSERVATION_MANIFEST.json` beside `GROUND_TRUTH.md`, +`eforge eval` automatically applies observation-aware coverage scoring. Non-`complete` +profiles can adjust only coverage-style causality sub-scores (`event_presence`, +`pivot_linkability`, `temporal_integrity`, and `storyline_trace_coverage`) by excluding +evidence that the manifest marks `dropped`, `filtered`, or `out_of_window`. Source-native +correctness gates such as parseability, value plausibility, field agreement, and visible causal +ordering remain strict. Adjusted sub-scores expose `raw_score` in JSON and show `raw:` in +the text report. 
+ ### Structure ```yaml diff --git a/commands/eforge/references/config-host-activity.md b/commands/eforge/references/config-host-activity.md index 33634892..e4314509 100644 --- a/commands/eforge/references/config-host-activity.md +++ b/commands/eforge/references/config-host-activity.md @@ -430,6 +430,11 @@ profiles: Profiles are intentionally source-level, not event-type matrices. Scenario authors select a named profile; code owns safe source-native application semantics so new event types inherit their source-family default. Non-complete profiles may make evidence `visible`, `delayed`, `dropped`, `filtered`, or `out_of_window`, but must not create contradictory identifiers or field values across sources. +Generation writes `OBSERVATION_MANIFEST.json` beside `GROUND_TRUTH.md`. `eforge eval` uses this +sidecar to adjust only coverage-style causality scoring for expected missing evidence under +non-`complete` profiles. The raw score remains visible in the report, and source-native +correctness checks are not relaxed. + Valid source families are `windows_security`, `sysmon`, `ecar`, `syslog`, `bash_history`, `zeek`, `proxy`, `web`, `asa`, and `ids`. Run `eforge validate-config` after overlay changes; it rejects unknown source-family names, invalid probabilities, and inverted ranges. Run `eforge validate` on scenarios that use a non-default profile so unknown profile names are caught before generation. --- diff --git a/commands/eforge/references/scenario-reference.md b/commands/eforge/references/scenario-reference.md index 0820e334..bccfbefc 100644 --- a/commands/eforge/references/scenario-reference.md +++ b/commands/eforge/references/scenario-reference.md @@ -405,7 +405,8 @@ training-friendly perfect source coverage and correlation. Non-default profiles deterministic source-level missingness and source-native delays while preserving canonical truth: they can make evidence `visible`, `delayed`, `dropped`, `filtered`, or `out_of_window`, but they must not create contradictory users, PIDs, ports, hashes, UIDs, or session identifiers across -sources. `GROUND_TRUTH.md` records source evidence status when a non-complete profile is used. +sources. `GROUND_TRUTH.md` records source evidence status for instructors, and +`OBSERVATION_MANIFEST.json` records the same source-observation contract for automated eval. ## Storyline diff --git a/docs/design/data-quality-prd.md b/docs/design/data-quality-prd.md index 49d7d0a5..15a6c4c7 100644 --- a/docs/design/data-quality-prd.md +++ b/docs/design/data-quality-prd.md @@ -339,6 +339,12 @@ Every sub-score now has: Thresholds are stored in `src/evidenceforge/config/evaluation/thresholds.yaml` for tuning without code changes. Calibration against purpose-built scenarios is deferred to a separate pass. +Datasets generated with non-`complete` observation profiles include `OBSERVATION_MANIFEST.json`. +When present, eval uses it to adjust coverage-style causality sub-scores for evidence that was +intentionally `dropped`, `filtered`, or `out_of_window`. Hard correctness gates remain strict: +observation profiles do not excuse parse failures, impossible values, source-native contradictions, +or evidence marked `visible`/`delayed` but missing from logs. + ### Calibration Plan Thresholds are currently judgment-based. After the restructure is stable, the plan is to design purpose-built calibration scenarios (known-good and known-bad), run `eforge eval` against them, and use the results to propose empirically grounded threshold values. Out of scope for v0.5.1. 
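To make the adjustment concrete, here is a small sketch of the adjusted-versus-raw arithmetic the coverage sub-scores use. Helper and parameter names are illustrative, not the evaluator's API; the split matches the `raw_score`/`score` bookkeeping in the causality scorer below.

```python
def coverage_scores(found: int, kept: int, excluded: int) -> tuple[float, float]:
    """Mirror the adjusted/raw split used by the coverage sub-scores.

    `excluded` counts storyline items whose manifest rows are all
    dropped/filtered/out_of_window; they leave the adjusted denominator
    but remain in the raw one.
    """
    adjusted = 100.0 * found / kept if kept else 100.0
    raw_total = kept + excluded
    raw = 100.0 * found / raw_total if raw_total else 100.0
    return adjusted, raw


# 8 of 10 storyline steps leave traces and the other 2 are expected gaps:
# adjusted is 100.0 (8/8 expected-visible), raw is 80.0 (8/10 overall).
print(coverage_scores(found=8, kept=8, excluded=2))
```
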
diff --git a/docs/reference/CUSTOMIZING_CONFIG.md b/docs/reference/CUSTOMIZING_CONFIG.md index 286baf38..d46590bd 100644 --- a/docs/reference/CUSTOMIZING_CONFIG.md +++ b/docs/reference/CUSTOMIZING_CONFIG.md @@ -193,6 +193,14 @@ The `eforge eval` scoring rules are also YAML-based and can be tuned per-project All eval config files live in `src/evidenceforge/config/evaluation/`. They are **not** overlaid from `.eforge/config/` — edit them in-place if you want project-specific tuning, or copy the package files into your project and set the `EFORGE_EVAL_CONFIG_DIR` environment variable to point to your copies. +Generated scenario directories may also include `OBSERVATION_MANIFEST.json` beside +`GROUND_TRUTH.md`. `eforge eval` loads this sidecar automatically when present. For +non-`complete` observation profiles, causality coverage metrics use the manifest to exclude +source evidence that was intentionally `dropped`, `filtered`, or `out_of_window`, while still +failing visible contradictions, parse errors, value mismatches, and missing evidence that the +manifest marks `visible` or `delayed`. Text and JSON reports keep the adjusted score and expose +the raw score for affected sub-scores. + For full schema documentation for each file, see the skill reference: `/eforge:references:config-evaluation`. ## Reference Documentation diff --git a/docs/reference/scenario-reference.md b/docs/reference/scenario-reference.md index 118fa2bd..c5ada98f 100644 --- a/docs/reference/scenario-reference.md +++ b/docs/reference/scenario-reference.md @@ -405,7 +405,8 @@ training-friendly perfect source coverage and correlation. Non-default profiles deterministic source-level missingness and source-native delays while preserving canonical truth: they can make evidence `visible`, `delayed`, `dropped`, `filtered`, or `out_of_window`, but they must not create contradictory users, PIDs, ports, hashes, UIDs, or session identifiers across -sources. `GROUND_TRUTH.md` records source evidence status when a non-complete profile is used. +sources. `GROUND_TRUTH.md` records source evidence status for instructors, and +`OBSERVATION_MANIFEST.json` records the same source-observation contract for automated eval. ## Storyline diff --git a/scenarios/ITERATION-TEST-PROMPT.md b/scenarios/ITERATION-TEST-PROMPT.md index 199cf680..554c1455 100644 --- a/scenarios/ITERATION-TEST-PROMPT.md +++ b/scenarios/ITERATION-TEST-PROMPT.md @@ -39,12 +39,12 @@ default_action: deny, deny_ratio: 2.0, drop_mode: drop, threat_detection_rate: 10, nat_rules: - type: dynamic_pat - src: [corporate_lan, server_vlan] - mapped_ip: 45.33.32.1 + src: [corporate_lan, server_vlan, dmz] + mapped_ip: 203.14.220.1 - type: static src: dmz real_ip: 10.10.3.10 (WEB-EXT-01) - mapped_ip: 45.33.32.10 + mapped_ip: 203.14.220.10 policy: - {src: external, dst: dmz, ports: [80, 443]} - {src: corporate_lan, dst: any} @@ -161,12 +161,12 @@ service_file_name: "%SystemRoot%\PSEXESVC.exe") + process events for commands run under the service. Do NOT use "cmd.exe /c PSEXESVC.exe" — that produces the wrong parent chain. - 15. Privilege Escalation (+4h15m): Create backdoor account svc_sqlreader (account_created event), + 15. Privilege Escalation (+4h15m): Create backdoor account svc_mhsync (account_created event), add to Domain Admins (group_member_added event). Actor: SYSTEM on DC-01. - 16. Persistence (+4h20m): Install service "HealthMonitorSvc" (service_installed event with + 16. 
Persistence (+4h20m): Install service "DeviceSyncSvc" (service_installed event with service_name, service_file_name, service_account) and create scheduled task - "\Microsoft\Windows\Maintenance\SystemHealthCheck" (scheduled_task_created event) on DC-01. + "\Microsoft\Windows\Maintenance\DeviceSync" (scheduled_task_created event) on DC-01. 17. C2 Beaconing (+4h30m): HTTPS beacon from DC-01 to 45.33.32.30:443 (beacon event with interval: "10m", duration: "1h30m", jitter: 0.3, hostname, user_agent, method: GET, @@ -178,14 +178,14 @@ internal sensors only. 19. DNS Tunneling (+4h45m): Exfiltrate data via DNS tunnel from APP-INT-01 (dns_tunnel event - with base_domain: "ns1.cdn-health-updates.net", encoding: hex, qtype: TXT, interval: "2s", + with base_domain: "ns1.westbridge-services.net", encoding: hex, qtype: TXT, interval: "2s", duration: "15m", payload_size: 512). 20. DGA Activity (+5h): DGA queries from WEB-EXT-01 (dga_queries event with tld: ".net", length_range: [10, 18], interval: "30s", duration: "45m", rcode_distribution for mostly NXDOMAIN). - 21. Collection (+5h): Authenticate to FILE-SRV-01 with backdoor account svc_sqlreader + 21. Collection (+5h): Authenticate to FILE-SRV-01 with backdoor account svc_mhsync (logon event, type 3), enumerate shares, stage financial and patient data, compress with PowerShell Compress-Archive. @@ -195,9 +195,9 @@ 23. Workstation Lock (+5h20m): Attacker locks the compromised workstation before stepping away (workstation_lock event) — exercises EventID 4800. - 24. Exfiltration (+5h25m): Upload archive to cdn-assets-update.com (45.33.32.30) over HTTPS + 24. Exfiltration (+5h25m): Upload archive to api.westbridge-services.net (45.33.32.30) over HTTPS (connection event with HTTP fields, method: POST, large orig_bytes — use a physically - plausible value in the 100-500 MB range, NOT multi-GB). + plausible non-round value in the 100-500 MB range, NOT multi-GB or a power-of-two anchor). 25. Workstation Unlock (+5h35m): Attacker returns, unlocks workstation (workstation_unlock event) — exercises EventID 4801. @@ -212,8 +212,8 @@ 28. Ongoing C2 (+5h, +5h30m): Periodic beacons from WEB-EXT-01 to 45.33.32.30:443 (separate beacon events). - 29. Account Cleanup (+5h50m): Delete the backdoor account svc_sqlreader (account_deleted event - with target_username: svc_sqlreader). + 29. Account Cleanup (+5h50m): Delete the backdoor account svc_mhsync (account_deleted event + with target_username: svc_mhsync). 30. Logoff (+5h55m): Attacker logs off from compromised systems (logoff events). 
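Step 17's beacon contract (10m interval, 0.3 jitter, 1h30m duration) is the kind of spec the generator turns into concrete timestamps. A rough sketch of that scheduling follows, with illustrative names and no claim about the generator's actual implementation:

```python
import random
from datetime import datetime, timedelta


def beacon_times(start: datetime, interval_s: int = 600, jitter: float = 0.3,
                 duration_s: int = 5400, seed: int = 7):
    """Yield jittered beacon timestamps: a 10m base interval with +/-30%
    jitter over 1h30m, per step 17. Names and signature are illustrative."""
    rng = random.Random(seed)
    current = start
    end = start + timedelta(seconds=duration_s)
    while current <= end:
        yield current
        # Each gap is the base interval scaled by a uniform jitter factor.
        current += timedelta(seconds=interval_s * (1 + rng.uniform(-jitter, jitter)))


for ts in beacon_times(datetime(2024, 1, 15, 14, 30)):
    print(ts.isoformat())
```
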
diff --git a/src/evidenceforge/cli/commands.py b/src/evidenceforge/cli/commands.py index 04793d25..632dca4a 100644 --- a/src/evidenceforge/cli/commands.py +++ b/src/evidenceforge/cli/commands.py @@ -250,6 +250,8 @@ def generate( data_dir = scenario_dir / "data" ground_truth_dir = scenario_dir + from evidenceforge.events.observation_manifest import OBSERVATION_MANIFEST_FILENAME + # Apply --formats filter (intersection with scenario output.logs) if formats: from evidenceforge.events.dispatcher import expand_formats @@ -284,6 +286,9 @@ def generate( gt_path = ground_truth_dir / "GROUND_TRUTH.md" if gt_path.exists(): existing.append(f" GROUND_TRUTH.md ({gt_path})") + manifest_path = ground_truth_dir / OBSERVATION_MANIFEST_FILENAME + if manifest_path.exists(): + existing.append(f" {OBSERVATION_MANIFEST_FILENAME} ({manifest_path})") has_existing = bool(existing) if has_existing: @@ -386,6 +391,7 @@ def progress_callback(event_type: str, data: dict) -> None: # as a matched pair — partial preservation is never valid. if staging_dir: staged_gt = gen_gt_dir / "GROUND_TRUTH.md" + staged_manifest = gen_gt_dir / OBSERVATION_MANIFEST_FILENAME if not gen_data_dir.exists(): raise RuntimeError("Staged data/ directory missing after generation") if not staged_gt.exists(): @@ -404,10 +410,14 @@ def progress_callback(event_type: str, data: dict) -> None: data_dir.rename(rollback_dir / "data") if gt_path.exists(): gt_path.rename(rollback_dir / "GROUND_TRUTH.md") + if manifest_path.exists(): + manifest_path.rename(rollback_dir / OBSERVATION_MANIFEST_FILENAME) # Step 2: Install new output gen_data_dir.rename(data_dir) staged_gt.rename(gt_path) + if staged_manifest.exists(): + staged_manifest.rename(manifest_path) swap_succeeded = True except BaseException: @@ -417,10 +427,15 @@ def progress_callback(event_type: str, data: dict) -> None: shutil.rmtree(data_dir) if gt_path.exists() and (rollback_dir / "GROUND_TRUTH.md").exists(): gt_path.unlink() + if manifest_path.exists(): + manifest_path.unlink() if (rollback_dir / "data").exists(): (rollback_dir / "data").rename(data_dir) if (rollback_dir / "GROUND_TRUTH.md").exists(): (rollback_dir / "GROUND_TRUTH.md").rename(gt_path) + rollback_manifest = rollback_dir / OBSERVATION_MANIFEST_FILENAME + if rollback_manifest.exists(): + rollback_manifest.rename(manifest_path) except Exception: logger.error("Rollback failed — old output may be in: %s", rollback_dir) raise @@ -435,10 +450,13 @@ def progress_callback(event_type: str, data: dict) -> None: console.print("\nGenerated files:") console.print(f" Scenario directory: {ground_truth_dir}") - # List files in scenario root (GROUND_TRUTH.md) + # List files in scenario root (GROUND_TRUTH.md + machine-readable sidecars) if ground_truth_dir.exists(): for file in sorted(ground_truth_dir.iterdir()): - if file.is_file() and file.name == "GROUND_TRUTH.md": + if file.is_file() and file.name in { + "GROUND_TRUTH.md", + OBSERVATION_MANIFEST_FILENAME, + }: size = file.stat().st_size size_str = f"{size:,} bytes" if size < 1024 else f"{size / 1024:.1f} KB" console.print(f" • {file.name} ({size_str})") diff --git a/src/evidenceforge/config/activity/README.md b/src/evidenceforge/config/activity/README.md index 684bbb1a..6c3d3762 100644 --- a/src/evidenceforge/config/activity/README.md +++ b/src/evidenceforge/config/activity/README.md @@ -24,7 +24,7 @@ caches data after first load. 
Two files (`network_params.yaml`, | `auth_noise.yaml` | `auth_noise.py` | Baseline authentication-noise profiles such as stale scheduled-credential account pools and irregular recurrence timing. | | `endpoint_noise.yaml` | `endpoint_noise.py` | Endpoint background timing and registry-emission policies for Windows scheduled processes and DHCP interface registry writes. | | `host_activity_profiles.yaml` | `host_activity_profiles.py` | Coarse host/persona/role rate multipliers for baseline volume, endpoint noise, firewall deny bursts, and data-driven artifact variation. | -| `observation_profiles.yaml` | `config/observation_profiles.py` | Named source-observation profiles for optional source-level missingness and delays. Scenario `observation_profile` defaults to `complete`. | +| `observation_profiles.yaml` | `config/observation_profiles.py` | Named source-observation profiles for optional source-level missingness and delays. Scenario `observation_profile` defaults to `complete`; generation records status in `OBSERVATION_MANIFEST.json` for eval. | | `proxy_uri_templates.yaml` | `proxy_uri.py` | Per-domain URI path templates for proxy logs (Windows Update, CRL, OCSP, Azure AD, etc.). | | `network_params.yaml` | `network_params.py`, `engine/emitter_setup.py` | MAC address OUI prefixes, public NTP fallback servers, and DNS tunnel RTT bounds. | | `systemd_schedules.yaml` | `engine/baseline.py` | Systemd timer and cron job schedules (logrotate, fstrim, apt-daily, etc.). | diff --git a/src/evidenceforge/evaluation/context.py b/src/evidenceforge/evaluation/context.py new file mode 100644 index 00000000..e8c8406c --- /dev/null +++ b/src/evidenceforge/evaluation/context.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026 Cisco Systems, Inc. and its affiliates +# SPDX-License-Identifier: MIT + +"""Shared context passed to evaluation pillar scorers.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from evidenceforge.events.observation_manifest import ObservationManifest + + +@dataclass(frozen=True, slots=True) +class EvaluationContext: + """Additional dataset metadata available to scorers.""" + + observation_manifest: ObservationManifest | None = None diff --git a/src/evidenceforge/evaluation/dimensions/__init__.py b/src/evidenceforge/evaluation/dimensions/__init__.py index 4cadbd69..7bc0c8b8 100644 --- a/src/evidenceforge/evaluation/dimensions/__init__.py +++ b/src/evidenceforge/evaluation/dimensions/__init__.py @@ -26,6 +26,7 @@ from collections.abc import Callable, Iterable from typing import Any +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.models import PillarScore, SubScore from evidenceforge.evaluation.parsers import ParsedRecord from evidenceforge.models.scenario import Scenario @@ -71,6 +72,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: """Score a dataset on this pillar. @@ -78,6 +80,7 @@ def score( Args: records: Parsed records grouped by format name. scenario: The scenario used to generate the dataset. + context: Optional metadata sidecars discovered for the dataset. progress: Optional callback for reporting sub-score progress. 
Returns: diff --git a/src/evidenceforge/evaluation/engine.py b/src/evidenceforge/evaluation/engine.py index 1c4c1257..1255d9bb 100644 --- a/src/evidenceforge/evaluation/engine.py +++ b/src/evidenceforge/evaluation/engine.py @@ -30,6 +30,7 @@ from datetime import UTC, datetime from pathlib import Path +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import DimensionScorer, ProgressCallback, _noop_callback from evidenceforge.evaluation.models import ( AcceptanceCriterion, @@ -44,6 +45,7 @@ TimingScorer, ) from evidenceforge.evaluation.thresholds import EvalThresholds, load_thresholds +from evidenceforge.events.observation_manifest import load_observation_manifest from evidenceforge.models.scenario import Scenario logger = logging.getLogger(__name__) @@ -168,6 +170,8 @@ def run(self) -> QualityReport: ) logger.info(f"Parsed {total_records} records across {len(source_counts)} sources") + observation_manifest = load_observation_manifest(self.output_dir) + context = EvaluationContext(observation_manifest=observation_manifest) # 2. Run each available pillar scorer total_pillars = len(DIMENSION_SCORERS) @@ -186,7 +190,12 @@ def run(self) -> QualityReport: logger.info(f"Scoring Pillar {scorer.number}: {scorer.name}") pillar_score: PillarScore try: - pillar_score = scorer.score(records, self.scenario, progress=self._progress) + pillar_score = scorer.score( + records, + self.scenario, + context=context, + progress=self._progress, + ) pillars.append(pillar_score) except Exception: logger.exception(f"Pillar {scorer.number} scoring failed") @@ -225,6 +234,18 @@ def run(self) -> QualityReport: supplementary: dict = {} for pillar in pillars: supplementary.update(pillar.supplementary) + if observation_manifest is not None: + supplementary["observation_profile"] = { + "profile": observation_manifest.observation_profile, + "manifest_present": True, + "source_summary": observation_manifest.source_summary, + } + elif self.scenario.observation_profile != "complete": + supplementary["observation_profile"] = { + "profile": self.scenario.observation_profile, + "manifest_present": False, + "source_summary": {}, + } return QualityReport( scenario_name=self.scenario.name, diff --git a/src/evidenceforge/evaluation/models.py b/src/evidenceforge/evaluation/models.py index 1db1c346..2361f5c3 100644 --- a/src/evidenceforge/evaluation/models.py +++ b/src/evidenceforge/evaluation/models.py @@ -19,6 +19,10 @@ class SubScore(BaseModel): key: str weight: float = Field(ge=0.0, le=1.0) score: float | None = Field(None, ge=0.0, le=100.0) + raw_score: float | None = Field(None, ge=0.0, le=100.0) + """Unadjusted score when profile-aware scoring changes the displayed score.""" + adjusted: bool = False + """True when the score excludes expected observation-profile gaps.""" details: str = "" sample_failures: list[str] = Field(default_factory=list) failure_summary: dict[str, dict[str, int]] = Field(default_factory=dict) diff --git a/src/evidenceforge/evaluation/pillars/causality.py b/src/evidenceforge/evaluation/pillars/causality.py index 5de77d37..c07a4244 100644 --- a/src/evidenceforge/evaluation/pillars/causality.py +++ b/src/evidenceforge/evaluation/pillars/causality.py @@ -39,6 +39,7 @@ from urllib.parse import urlsplit from evidenceforge.evaluation._shared import _condition_matches, _extract_hostname, _normalize_ts +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -55,6 
+56,8 @@ resolve_storyline, ) from evidenceforge.evaluation.visibility import VisibilityModel +from evidenceforge.events.observation import source_family_for_format +from evidenceforge.events.observation_manifest import ObservationManifestEvent from evidenceforge.models.scenario import Scenario from evidenceforge.utils.time import parse_duration @@ -70,8 +73,10 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: + context = context or EvaluationContext() storyline = scenario.storyline or [] resolved: list[ResolvedEvent] = [] @@ -99,7 +104,7 @@ def score( progress("sub_score_done", {"name": "Causal Ordering", "score": s1.score}) progress("sub_score_start", {"name": "Event Presence", "step": 2, "total": 6}) - s2 = self._score_event_presence(resolved) + s2 = self._score_event_presence(resolved, context) progress("sub_score_done", {"name": "Event Presence", "score": s2.score}) progress("sub_score_start", {"name": "Indicator Accuracy", "step": 3, "total": 6}) @@ -107,15 +112,15 @@ def score( progress("sub_score_done", {"name": "Indicator Accuracy", "score": s3.score}) progress("sub_score_start", {"name": "Pivot Linkability", "step": 4, "total": 6}) - s4 = self._score_pivot_linkability(resolved) + s4 = self._score_pivot_linkability(resolved, context) progress("sub_score_done", {"name": "Pivot Linkability", "score": s4.score}) progress("sub_score_start", {"name": "Temporal Integrity", "step": 5, "total": 6}) - s5 = self._score_temporal_integrity(resolved) + s5 = self._score_temporal_integrity(resolved, context) progress("sub_score_done", {"name": "Temporal Integrity", "score": s5.score}) progress("sub_score_start", {"name": "Storyline Trace Coverage", "step": 6, "total": 6}) - s6 = self._score_storyline_trace_coverage(resolved, vis, host_time_index) + s6 = self._score_storyline_trace_coverage(resolved, vis, host_time_index, context) progress("sub_score_done", {"name": "Storyline Trace Coverage", "score": s6.score}) sub_scores = [s1, s2, s3, s4, s5, s6] @@ -188,6 +193,71 @@ def _find_traces( traces = self._search_for_event_indexed(event, event_type, host_time_index) event.traces.extend(traces) + # --- Observation-profile adjustment helpers --- + + @staticmethod + def _manifest_event( + event: ResolvedEvent, + context: EvaluationContext, + ) -> ObservationManifestEvent | None: + manifest = context.observation_manifest + if manifest is None or manifest.observation_profile == "complete": + return None + return manifest.storyline_by_id().get(event.storyline_id) + + @classmethod + def _event_observation_exempt( + cls, + event: ResolvedEvent, + context: EvaluationContext, + ) -> bool: + manifest_event = cls._manifest_event(event, context) + if manifest_event is None: + return False + return manifest_event.visible_or_delayed_count == 0 and manifest_event.non_visible_count > 0 + + @classmethod + def _format_group_observation_exempt( + cls, + event: ResolvedEvent, + group_formats: set[str], + context: EvaluationContext, + ) -> bool: + manifest_event = cls._manifest_event(event, context) + if manifest_event is None: + return False + source_families = {source_family_for_format(fmt) for fmt in group_formats} + relevant = { + source: counts + for source, counts in manifest_event.source_status.items() + if source in source_families + } + if not relevant: + return False + visible_or_delayed = sum( + counts.get("visible", 0) + counts.get("delayed", 0) for counts in 
relevant.values() + ) + non_visible = sum( + counts.get("dropped", 0) + counts.get("filtered", 0) + counts.get("out_of_window", 0) + for counts in relevant.values() + ) + return visible_or_delayed == 0 and non_visible > 0 + + @staticmethod + def _adjusted_details( + adjusted_details: str, + raw_found: int, + raw_total: int, + excluded: int, + ) -> str: + if excluded <= 0: + return adjusted_details + raw_score = (100.0 * raw_found / raw_total) if raw_total > 0 else 100.0 + return ( + f"{adjusted_details}; raw {raw_found}/{raw_total} ({raw_score:.1f}/100), " + f"{excluded} excluded by observation profile" + ) + def _search_for_event_indexed( self, event: ResolvedEvent, @@ -830,7 +900,11 @@ def _score_causal_ordering( # --- Sub-score 2: Event Presence --- - def _score_event_presence(self, resolved: list[ResolvedEvent]) -> SubScore: + def _score_event_presence( + self, + resolved: list[ResolvedEvent], + context: EvaluationContext, + ) -> SubScore: if not resolved: return SubScore( name="Event Presence", @@ -839,20 +913,39 @@ def _score_event_presence(self, resolved: list[ResolvedEvent]) -> SubScore: score=100.0, details="No storyline events", ) - total = len(resolved) - found = sum(1 for e in resolved if e.traces) + raw_total = len(resolved) + raw_found = sum(1 for e in resolved if e.traces) + total = 0 + found = 0 + excluded = 0 + for event in resolved: + if event.traces: + total += 1 + found += 1 + elif self._event_observation_exempt(event, context): + excluded += 1 + else: + total += 1 failures = [ f"Event {e.index}: {e.actor}@{e.system} '{e.activity[:60]}' — no traces" for e in resolved - if not e.traces + if not e.traces and not self._event_observation_exempt(e, context) ] score = (100.0 * found / total) if total > 0 else 100.0 + raw_score = (100.0 * raw_found / raw_total) if raw_total > 0 else 100.0 return SubScore( name="Event Presence", key="event_presence", weight=0.20, score=score, - details=f"{found}/{total} storyline events have traces in logs", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{found}/{total} expected-visible storyline events have traces in logs", + raw_found, + raw_total, + excluded, + ), sample_failures=failures[:10], ) @@ -966,7 +1059,11 @@ def _best_sub_detail(event: ResolvedEvent, fields: dict) -> dict[str, Any]: # --- Sub-score 4: Pivot Linkability --- - def _score_pivot_linkability(self, resolved: list[ResolvedEvent]) -> SubScore: + def _score_pivot_linkability( + self, + resolved: list[ResolvedEvent], + context: EvaluationContext, + ) -> SubScore: if len(resolved) < 2: return SubScore( name="Pivot Linkability", @@ -975,12 +1072,26 @@ def _score_pivot_linkability(self, resolved: list[ResolvedEvent]) -> SubScore: score=100.0, details="Fewer than 2 events — nothing to link", ) - total_pairs = len(resolved) - 1 + raw_total_pairs = len(resolved) - 1 + raw_linkable = 0 + total_pairs = 0 linkable = 0 + excluded = 0 failures: list[str] = [] - for i in range(total_pairs): + for i in range(raw_total_pairs): a, b = resolved[i], resolved[i + 1] - if self._extract_indicator_values(a) & self._extract_indicator_values(b): + pair_linkable = bool( + self._extract_indicator_values(a) & self._extract_indicator_values(b) + ) + if pair_linkable: + raw_linkable += 1 + if (not a.traces and self._event_observation_exempt(a, context)) or ( + not b.traces and self._event_observation_exempt(b, context) + ): + excluded += 1 + continue + total_pairs += 1 + if pair_linkable: linkable += 1 elif len(failures) < 10: 
failures.append( @@ -988,12 +1099,21 @@ def _score_pivot_linkability(self, resolved: list[ResolvedEvent]) -> SubScore: f"({a.actor}@{a.system} → {b.actor}@{b.system})" ) score = (100.0 * linkable / total_pairs) if total_pairs > 0 else 100.0 + raw_score = (100.0 * raw_linkable / raw_total_pairs) if raw_total_pairs > 0 else 100.0 return SubScore( name="Pivot Linkability", key="pivot_linkability", weight=0.15, score=score, - details=f"{linkable}/{total_pairs} consecutive pairs share a pivotable indicator", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{linkable}/{total_pairs} expected-visible consecutive pairs share a " + "pivotable indicator", + raw_linkable, + raw_total_pairs, + excluded, + ), sample_failures=failures, ) @@ -1025,7 +1145,11 @@ def _extract_indicator_values(self, event: ResolvedEvent) -> set[str]: # --- Sub-score 5: Temporal Integrity --- - def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: + def _score_temporal_integrity( + self, + resolved: list[ResolvedEvent], + context: EvaluationContext, + ) -> SubScore: if not resolved: return SubScore( name="Temporal Integrity", @@ -1034,13 +1158,20 @@ def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: score=100.0, details="No storyline events", ) - total = len(resolved) + raw_total = len(resolved) + raw_correct = 0 + total = 0 correct = 0 + excluded = 0 failures: list[str] = [] prev_earliest: datetime | None = None for event in resolved: if not event.traces: + if self._event_observation_exempt(event, context): + excluded += 1 + continue + total += 1 if len(failures) < 10: failures.append(f"Event {event.index}: no traces to verify timing") continue @@ -1056,12 +1187,14 @@ def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: if not trace_times: continue + total += 1 earliest = min(trace_times) time_ok = abs((earliest - event.time).total_seconds()) <= TIME_TOLERANCE.total_seconds() order_ok = prev_earliest is None or earliest >= prev_earliest - timedelta(seconds=5) if time_ok and order_ok: correct += 1 + raw_correct += 1 elif len(failures) < 10: if not time_ok: delta = (earliest - event.time).total_seconds() @@ -1075,12 +1208,20 @@ def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: prev_earliest = earliest score = (100.0 * correct / total) if total > 0 else 100.0 + raw_score = (100.0 * raw_correct / raw_total) if raw_total > 0 else 100.0 return SubScore( name="Temporal Integrity", key="temporal_integrity", weight=0.15, score=score, - details=f"{correct}/{total} events correctly timed and ordered", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{correct}/{total} expected-visible events correctly timed and ordered", + raw_correct, + raw_total, + excluded, + ), sample_failures=failures, ) @@ -1091,6 +1232,7 @@ def _score_storyline_trace_coverage( resolved: list[ResolvedEvent], vis: VisibilityModel, host_time_index: dict[str, dict[str, list[ParsedRecord]]], + context: EvaluationContext, ) -> SubScore: if not resolved: return SubScore( @@ -1101,8 +1243,11 @@ def _score_storyline_trace_coverage( details="No storyline events", ) + raw_total_expected = 0 + raw_found = 0 total_expected = 0 found = 0 + excluded = 0 failures: list[str] = [] for event in resolved: @@ -1120,7 +1265,7 @@ def _score_storyline_trace_coverage( lookup_keys.append(val) for group_name, group_formats in groups: - total_expected += 1 + 
raw_total_expected += 1 group_found = False for fmt in group_formats: if fmt not in host_time_index.get("__formats__", {fmt: True}): @@ -1145,20 +1290,35 @@ def _score_storyline_trace_coverage( break if group_found: + raw_found += 1 + total_expected += 1 found += 1 + elif self._format_group_observation_exempt(event, group_formats, context): + excluded += 1 elif len(failures) < 10: + total_expected += 1 failures.append( f"Event {event.index}: no trace in {group_name} group " f"for {event.actor}@{event.system}" ) + else: + total_expected += 1 score = (100.0 * found / total_expected) if total_expected > 0 else 100.0 + raw_score = (100.0 * raw_found / raw_total_expected) if raw_total_expected > 0 else 100.0 return SubScore( name="Storyline Trace Coverage", key="storyline_trace_coverage", weight=0.10, score=score, - details=f"{found}/{total_expected} expected format-traces found", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{found}/{total_expected} expected-visible format-traces found", + raw_found, + raw_total_expected, + excluded, + ), sample_failures=failures, ) diff --git a/src/evidenceforge/evaluation/pillars/parseability.py b/src/evidenceforge/evaluation/pillars/parseability.py index fcc8545c..4717db20 100644 --- a/src/evidenceforge/evaluation/pillars/parseability.py +++ b/src/evidenceforge/evaluation/pillars/parseability.py @@ -30,6 +30,7 @@ import logging from typing import Any +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -92,6 +93,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: progress("sub_score_start", {"name": "Spec Conformance", "step": 1, "total": 2}) diff --git a/src/evidenceforge/evaluation/pillars/plausibility.py b/src/evidenceforge/evaluation/pillars/plausibility.py index c43212ae..3643f162 100644 --- a/src/evidenceforge/evaluation/pillars/plausibility.py +++ b/src/evidenceforge/evaluation/pillars/plausibility.py @@ -45,6 +45,7 @@ _jensen_shannon_divergence, ) from evidenceforge.evaluation.anomaly import detect_anomalies +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -81,6 +82,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: enabled = {log_spec["format"] for log_spec in scenario.output.logs if "format" in log_spec} diff --git a/src/evidenceforge/evaluation/pillars/timing.py b/src/evidenceforge/evaluation/pillars/timing.py index 66978674..95e6989f 100644 --- a/src/evidenceforge/evaluation/pillars/timing.py +++ b/src/evidenceforge/evaluation/pillars/timing.py @@ -43,6 +43,7 @@ _extract_username, _jensen_shannon_2d, ) +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -70,6 +71,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: user_events = _group_by_user(records) diff --git a/src/evidenceforge/evaluation/report.py b/src/evidenceforge/evaluation/report.py index 66d61912..2fcafb17 100644 --- 
a/src/evidenceforge/evaluation/report.py +++ b/src/evidenceforge/evaluation/report.py @@ -46,6 +46,11 @@ def format_text_report(report: QualityReport, console: Console, verbose: bool = ) if verbose and source_parts: console.print(f" ({source_parts})") + observation = report.supplementary.get("observation_profile") + if observation: + profile = observation.get("profile", "complete") + manifest_note = "manifest loaded" if observation.get("manifest_present") else "no manifest" + console.print(f"Observation profile: {profile} ({manifest_note})") console.print() @@ -183,6 +188,8 @@ def _print_sub_score( if ac.aspirational is not None and ac.meets_aspirational is not None: asp_tag = "[green]met[/green]" if ac.meets_aspirational else "[dim]below[/dim]" line += f" [asp:{ac.aspirational:.0f} {asp_tag}]" + if sub.adjusted and sub.raw_score is not None: + line += f" [dim]raw:{sub.raw_score:.0f}[/dim]" console.print(line) diff --git a/src/evidenceforge/evaluation/storyline.py b/src/evidenceforge/evaluation/storyline.py index 1629b919..25307185 100644 --- a/src/evidenceforge/evaluation/storyline.py +++ b/src/evidenceforge/evaluation/storyline.py @@ -105,6 +105,7 @@ class ResolvedEvent: event_types: list[str] sub_details: list[dict[str, Any]] = field(default_factory=list) traces: list[ParsedRecord] = field(default_factory=list) + storyline_id: str = "" def _parse_event_time(time_str: str, start_time: datetime) -> datetime: @@ -179,6 +180,7 @@ def resolve_storyline( details=details, event_types=event_types, sub_details=sub_details, + storyline_id=event.id, ) ) diff --git a/src/evidenceforge/events/observation_manifest.py b/src/evidenceforge/events/observation_manifest.py new file mode 100644 index 00000000..2f8344e6 --- /dev/null +++ b/src/evidenceforge/events/observation_manifest.py @@ -0,0 +1,177 @@ +# Copyright (c) 2026 Cisco Systems, Inc. 
and its affiliates +# SPDX-License-Identifier: MIT + +"""Machine-readable source-observation manifest for generated datasets.""" + +from __future__ import annotations + +import logging +from datetime import UTC, datetime +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, ValidationError + +from evidenceforge.models.scenario import Scenario +from evidenceforge.utils.time import parse_duration + +logger = logging.getLogger(__name__) + +OBSERVATION_MANIFEST_FILENAME = "OBSERVATION_MANIFEST.json" + +ObservationManifestKind = Literal["storyline", "red_herring"] +ObservationStatusCounts = dict[str, dict[str, int]] +SourceEvidenceStatus = dict[str, ObservationStatusCounts] + + +class ObservationManifestEvent(BaseModel): + """Observation status for one storyline or red-herring cluster.""" + + kind: ObservationManifestKind + storyline_id: str + index: int = Field(ge=0) + actor: str + system: str + activity: str + event_types: list[str] = Field(default_factory=list) + source_status: ObservationStatusCounts = Field(default_factory=dict) + + model_config = ConfigDict(extra="forbid") + + @property + def visible_or_delayed_count(self) -> int: + """Return visible/delayed source-attempt count for this cluster.""" + return sum( + statuses.get("visible", 0) + statuses.get("delayed", 0) + for statuses in self.source_status.values() + ) + + @property + def non_visible_count(self) -> int: + """Return dropped/filtered/out-of-window source-attempt count for this cluster.""" + return sum( + statuses.get("dropped", 0) + + statuses.get("filtered", 0) + + statuses.get("out_of_window", 0) + for statuses in self.source_status.values() + ) + + +class ObservationManifest(BaseModel): + """Sidecar manifest describing source observation decisions for eval.""" + + schema_version: int = 1 + scenario_name: str + observation_profile: str + collection_window: dict[str, str | None] + source_summary: ObservationStatusCounts = Field(default_factory=dict) + storyline_events: list[ObservationManifestEvent] = Field(default_factory=list) + red_herring_events: list[ObservationManifestEvent] = Field(default_factory=list) + + model_config = ConfigDict(extra="forbid") + + def storyline_by_id(self) -> dict[str, ObservationManifestEvent]: + """Return storyline events keyed by scenario storyline ID.""" + return {event.storyline_id: event for event in self.storyline_events} + + +def build_observation_manifest( + scenario: Scenario, + source_evidence_status: SourceEvidenceStatus, +) -> ObservationManifest: + """Build the observation manifest for a generated scenario.""" + return ObservationManifest( + scenario_name=scenario.name, + observation_profile=scenario.observation_profile, + collection_window=_collection_window(scenario), + source_summary=_source_summary(source_evidence_status), + storyline_events=[ + ObservationManifestEvent( + kind="storyline", + storyline_id=event.id, + index=index, + actor=event.actor, + system=event.system, + activity=event.activity, + event_types=sorted({spec.type for spec in event.events}), + source_status=source_evidence_status.get(event.id, {}), + ) + for index, event in enumerate(scenario.storyline or []) + ], + red_herring_events=[ + ObservationManifestEvent( + kind="red_herring", + storyline_id=event.id, + index=index, + actor=event.actor, + system=event.system, + activity=event.activity, + event_types=sorted({spec.type for spec in event.events}), + source_status=source_evidence_status.get(f"red_herring:{event.id}", {}), + ) + for index, event in 
enumerate(scenario.red_herrings or []) + ], + ) + + +def write_observation_manifest( + output_path: Path, + scenario: Scenario, + source_evidence_status: SourceEvidenceStatus, +) -> None: + """Write OBSERVATION_MANIFEST.json next to GROUND_TRUTH.md.""" + manifest = build_observation_manifest(scenario, source_evidence_status) + output_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8") + + +def find_observation_manifest(output_dir: Path) -> Path | None: + """Find an observation manifest for an eval output directory.""" + candidates = [ + output_dir / OBSERVATION_MANIFEST_FILENAME, + output_dir.parent / OBSERVATION_MANIFEST_FILENAME, + ] + for candidate in candidates: + if candidate.exists() and candidate.is_file(): + return candidate + return None + + +def load_observation_manifest(output_dir: Path) -> ObservationManifest | None: + """Load an observation manifest for eval, returning None if absent/invalid.""" + path = find_observation_manifest(output_dir) + if path is None: + return None + try: + return ObservationManifest.model_validate_json(path.read_text(encoding="utf-8")) + except (OSError, ValidationError, ValueError) as exc: + logger.warning("Ignoring invalid observation manifest %s: %s", path, exc) + return None + + +def _collection_window(scenario: Scenario) -> dict[str, str | None]: + start = scenario.time_window.start + end: datetime | None = None + try: + end = start + parse_duration(scenario.time_window.duration) + except ValueError: + end = None + return { + "start": _format_dt(start), + "end": _format_dt(end) if end else None, + } + + +def _format_dt(value: datetime) -> str: + if value.tzinfo is None: + value = value.replace(tzinfo=UTC) + return value.isoformat().replace("+00:00", "Z") + + +def _source_summary(source_evidence_status: SourceEvidenceStatus) -> ObservationStatusCounts: + summary: dict[str, dict[str, int]] = {} + for source_status in source_evidence_status.values(): + for source, counts in source_status.items(): + target = summary.setdefault(source, {}) + for status, count in counts.items(): + target[status] = target.get(status, 0) + count + return summary diff --git a/src/evidenceforge/generation/engine/core.py b/src/evidenceforge/generation/engine/core.py index 87a42e2c..c3a1043e 100644 --- a/src/evidenceforge/generation/engine/core.py +++ b/src/evidenceforge/generation/engine/core.py @@ -465,17 +465,28 @@ def _finalize(self) -> None: def _generate_ground_truth(self) -> None: """Generate GROUND_TRUTH.md documentation.""" + from evidenceforge.events.observation_manifest import ( + OBSERVATION_MANIFEST_FILENAME, + write_observation_manifest, + ) + self.ground_truth_dir.mkdir(parents=True, exist_ok=True) output_path = self.ground_truth_dir / "GROUND_TRUTH.md" + source_evidence_status = self.dispatcher.source_evidence_status generator = GroundTruthGenerator( scenario=self.scenario, malicious_events=self.malicious_events, red_herring_events=self.red_herring_events, - source_evidence_status=self.dispatcher.source_evidence_status, + source_evidence_status=source_evidence_status, ) generator.generate(output_path) + write_observation_manifest( + self.ground_truth_dir / OBSERVATION_MANIFEST_FILENAME, + self.scenario, + source_evidence_status, + ) logger.info(f"Ground truth documentation generated: {output_path}") def _get_next_event_record_id(self) -> int: diff --git a/tests/unit/test_eval_cross_source.py b/tests/unit/test_eval_cross_source.py index 173c19c8..6adba27f 100644 --- a/tests/unit/test_eval_cross_source.py +++ 
b/tests/unit/test_eval_cross_source.py @@ -25,10 +25,15 @@ from datetime import UTC, datetime, timedelta from pathlib import Path +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.parsers import ParsedRecord from evidenceforge.evaluation.pillars.causality import CausalityScorer from evidenceforge.evaluation.pillars.plausibility import PlausibilityScorer from evidenceforge.evaluation.visibility import VisibilityModel +from evidenceforge.events.observation_manifest import ( + ObservationManifest, + ObservationManifestEvent, +) # Alias for tests that use the old CrossSourceScorer name CrossSourceScorer = CausalityScorer @@ -560,6 +565,102 @@ def test_causality_sub_scores_present(self): assert "storyline_trace_coverage" in keys +class TestObservationAwareCausality: + """Causality coverage scoring should honor observation-profile manifests.""" + + def test_dropped_storyline_evidence_is_excluded_from_presence_gate(self): + """Expected dropped evidence should not fail event_presence.""" + scenario = _make_scenario( + storyline=[ + { + "id": "step-001", + "time": "+10m", + "actor": "jsmith", + "system": "WS-01", + "activity": "Run PowerShell", + "events": [{"type": "process", "process_name": "powershell.exe"}], + } + ] + ) + scenario.observation_profile = "enterprise_standard" + manifest = ObservationManifest( + scenario_name=scenario.name, + observation_profile="enterprise_standard", + collection_window={"start": "2024-01-15T10:00:00Z", "end": "2024-01-15T18:00:00Z"}, + source_summary={"windows_security": {"dropped": 1}, "ecar": {"dropped": 1}}, + storyline_events=[ + ObservationManifestEvent( + kind="storyline", + storyline_id="step-001", + index=0, + actor="jsmith", + system="WS-01", + activity="Run PowerShell", + event_types=["process"], + source_status={"windows_security": {"dropped": 1}, "ecar": {"dropped": 1}}, + ) + ], + ) + + result = CausalityScorer().score( + {}, + scenario, + context=EvaluationContext(observation_manifest=manifest), + ) + event_presence = next(s for s in result.sub_scores if s.key == "event_presence") + trace_coverage = next(s for s in result.sub_scores if s.key == "storyline_trace_coverage") + + assert event_presence.score == 100.0 + assert event_presence.raw_score == 0.0 + assert event_presence.adjusted is True + assert trace_coverage.score == 100.0 + assert trace_coverage.raw_score == 0.0 + + def test_visible_manifest_evidence_still_fails_when_trace_is_absent(self): + """Observation profiles should not excuse missing evidence marked visible.""" + scenario = _make_scenario( + storyline=[ + { + "id": "step-001", + "time": "+10m", + "actor": "jsmith", + "system": "WS-01", + "activity": "Run PowerShell", + "events": [{"type": "process", "process_name": "powershell.exe"}], + } + ] + ) + scenario.observation_profile = "enterprise_standard" + manifest = ObservationManifest( + scenario_name=scenario.name, + observation_profile="enterprise_standard", + collection_window={"start": "2024-01-15T10:00:00Z", "end": "2024-01-15T18:00:00Z"}, + source_summary={"windows_security": {"visible": 1}}, + storyline_events=[ + ObservationManifestEvent( + kind="storyline", + storyline_id="step-001", + index=0, + actor="jsmith", + system="WS-01", + activity="Run PowerShell", + event_types=["process"], + source_status={"windows_security": {"visible": 1}}, + ) + ], + ) + + result = CausalityScorer().score( + {}, + scenario, + context=EvaluationContext(observation_manifest=manifest), + ) + event_presence = next(s for s in result.sub_scores if s.key == 
"event_presence") + + assert event_presence.score == 0.0 + assert event_presence.adjusted is False + + class TestZeekDhcpIndexing: """zeek_dhcp records must be indexed by client_addr and host_name.""" diff --git a/tests/unit/test_observation_manifest.py b/tests/unit/test_observation_manifest.py new file mode 100644 index 00000000..8b9ad341 --- /dev/null +++ b/tests/unit/test_observation_manifest.py @@ -0,0 +1,94 @@ +# Copyright (c) 2026 Cisco Systems, Inc. and its affiliates +# SPDX-License-Identifier: MIT + +"""Tests for the machine-readable observation manifest sidecar.""" + +from evidenceforge.events.observation_manifest import ( + OBSERVATION_MANIFEST_FILENAME, + build_observation_manifest, + load_observation_manifest, + write_observation_manifest, +) +from evidenceforge.models import ( + BaselineActivity, + Environment, + OutputSpec, + Scenario, + StorylineEvent, + System, + TimeWindow, + User, +) + + +def _scenario() -> Scenario: + return Scenario( + version="1.0", + name="manifest-test", + description="Manifest test", + environment=Environment( + description="Test", + users=[ + User( + username="alice", + full_name="Alice Example", + email="alice@example.com", + enabled=True, + ), + ], + systems=[System(hostname="WS-01", ip="10.0.0.10", os="Windows 11", type="workstation")], + ), + time_window=TimeWindow(start="2026-02-03T13:00:00Z", duration="2h"), + baseline_activity=BaselineActivity(description="Low", intensity="low", variation="low"), + observation_profile="enterprise_standard", + output=OutputSpec(logs=[{"format": "windows_event_security"}], destination="./out"), + storyline=[ + StorylineEvent( + id="step-001", + time="+10m", + actor="alice", + system="WS-01", + activity="Run command", + events=[{"type": "process", "process_name": "powershell.exe"}], + ) + ], + ) + + +def test_build_manifest_summarizes_storyline_source_status() -> None: + """Manifest should preserve per-storyline status and aggregate source counts.""" + manifest = build_observation_manifest( + _scenario(), + { + "step-001": { + "windows_security": {"visible": 1}, + "sysmon": {"dropped": 2}, + } + }, + ) + + assert manifest.observation_profile == "enterprise_standard" + assert manifest.collection_window["start"] == "2026-02-03T13:00:00Z" + assert manifest.collection_window["end"] == "2026-02-03T15:00:00Z" + assert manifest.source_summary == { + "windows_security": {"visible": 1}, + "sysmon": {"dropped": 2}, + } + assert manifest.storyline_events[0].storyline_id == "step-001" + assert manifest.storyline_events[0].source_status["sysmon"] == {"dropped": 2} + + +def test_load_manifest_finds_scenario_root_from_data_dir(tmp_path) -> None: + """Eval should find the manifest beside GROUND_TRUTH.md when pointed at data/.""" + data_dir = tmp_path / "data" + data_dir.mkdir() + write_observation_manifest( + tmp_path / OBSERVATION_MANIFEST_FILENAME, + _scenario(), + {"step-001": {"windows_security": {"dropped": 1}}}, + ) + + loaded = load_observation_manifest(data_dir) + + assert loaded is not None + assert loaded.storyline_events[0].source_status == {"windows_security": {"dropped": 1}}