From 599a40eef38606cceb101e006ba889cafca944a4 Mon Sep 17 00:00:00 2001 From: "David J. Bianco" Date: Fri, 15 May 2026 11:52:24 -0400 Subject: [PATCH] feat: add observation-aware eval manifest --- TODO.md | 8 +- commands/eforge/config.md | 2 +- commands/eforge/evaluate.md | 15 +- .../references/config-dependency-graph.md | 2 +- .../eforge/references/config-evaluation.md | 9 + .../eforge/references/config-host-activity.md | 5 + .../eforge/references/scenario-reference.md | 3 +- docs/design/data-quality-prd.md | 6 + docs/reference/CUSTOMIZING_CONFIG.md | 8 + docs/reference/scenario-reference.md | 3 +- scenarios/ITERATION-TEST-PROMPT.md | 24 +-- src/evidenceforge/cli/commands.py | 22 +- src/evidenceforge/config/activity/README.md | 2 +- src/evidenceforge/evaluation/context.py | 17 ++ .../evaluation/dimensions/__init__.py | 3 + src/evidenceforge/evaluation/engine.py | 23 +- src/evidenceforge/evaluation/models.py | 4 + .../evaluation/pillars/causality.py | 198 ++++++++++++++++-- .../evaluation/pillars/parseability.py | 2 + .../evaluation/pillars/plausibility.py | 2 + .../evaluation/pillars/timing.py | 2 + src/evidenceforge/evaluation/report.py | 7 + src/evidenceforge/evaluation/storyline.py | 2 + .../events/observation_manifest.py | 177 ++++++++++++++++ src/evidenceforge/generation/engine/core.py | 13 +- tests/unit/test_eval_cross_source.py | 101 +++++++++ tests/unit/test_observation_manifest.py | 94 +++++++++ 27 files changed, 707 insertions(+), 47 deletions(-) create mode 100644 src/evidenceforge/evaluation/context.py create mode 100644 src/evidenceforge/events/observation_manifest.py create mode 100644 tests/unit/test_observation_manifest.py diff --git a/TODO.md b/TODO.md index b97146cf..dc5796e6 100644 --- a/TODO.md +++ b/TODO.md @@ -2,7 +2,7 @@ **Status:** Phase 8.5 (Dual src/dst HostContext) COMPLETE; Pre-MVP quality fixes ongoing **Started:** 2026-03-11 -**Last Updated:** 2026-05-14 +**Last Updated:** 2026-05-15 See [CHANGELOG.md](CHANGELOG.md) for detailed development history of completed phases. @@ -243,6 +243,8 @@ Replaced manual per-emitter field coordination with SecurityEvent intermediate r - [x] **P1** Source identity and endpoint baseline realism sprint — completed TLS/X.509 issuer-compatible chain signatures, Sysmon Event 7 native third-party module identity, config-driven Windows scheduled-process timing, and DHCP registry emission policy tied to lease activity. Verified with `uv run eforge validate-config`, focused regressions, Ruff, normal pytest, and slow-inclusive pytest. - [x] **P2** Endpoint/eCAR baseline variance follow-up — addressed through the host/activity profile realism layer. Host family, role, persona, and stable per-host multipliers now shape endpoint, process, registry, scheduled-task, syslog, bash, eCAR, Windows, Zeek, firewall, IDS, web, and proxy rates; config-driven encoded PowerShell variants and benign endpoint texture reduce repeated per-host artifacts. Verification passed with focused host-activity/config/ASA/baseline tests, `uv run eforge validate-config`, Ruff checks/format checks, full normal `uv run pytest -v`, and slow-inclusive `uv run pytest -v --include-slow --no-cov` (`3057 passed, 1 skipped`). - [x] **Later architectural sprint: imperfect observation and source coverage** — implemented a training-friendly `complete` default plus overlay-compatible named observation profiles that apply deterministic source-level drop/delay/coverage semantics without modeling contradictions. 
The policy covers endpoint, network, proxy/web, firewall, IDS, Windows, Sysmon, Zeek, syslog, bash history, and eCAR source families, while ground truth preserves canonical truth and records source evidence status. Verification passed: focused observation/config/ground-truth tests, `uv run eforge validate-config`, Ruff checks/format checks, full normal `uv run pytest -v` (`3036 passed, 15 skipped`), and slow-inclusive `uv run pytest -v --include-slow` (`3050 passed, 1 skipped`). +- [x] Observation-aware automated eval and manifest — generation now writes `OBSERVATION_MANIFEST.json` beside ground truth, `eforge eval` loads it when present, coverage-style causality metrics report raw and observation-adjusted scores for expected non-visible evidence, and correctness/contradiction checks remain strict. Verification passed with config validation, Ruff checks/format checks, focused eval/manifest tests, and full normal `uv run pytest -v` (`3047 passed, 15 skipped`). +- [x] Post-host-activity score check — synced `dev`, cleaned up stale TODOs, regenerated/evaluated `scenarios/iteration-test` from the current iteration-test prompt with `enterprise_standard` observation, and ran one blind expert-panel review without entering another fix loop. Automated eval passed at `92.39` over `108,858` records; blind synthetic-confidence averaged `82.75`. Highest-leverage follow-ups are Linux SSH/syslog lifecycle ordering, Zeek observation-tree consistency, X.509 metadata coherence, Windows OS-build/local-SID identity, and static web asset manifests. - [x] Full slow-suite regression cleanup after loop-65 merge — explicit-proxy storyline beacons now preserve authored hostname+destination IP pairs only when the storyline marks that pair as intentional, normal proxy-origin DNS resolution remains intact, and the parallel-generation LogonID assertion treats Type 7 unlock reuse as valid slice-of-time Windows behavior. Verified with targeted proxy/parallel tests, `uv run ruff check .`, `uv run ruff format --check .`, and `uv run pytest -v --include-slow` (`2875 passed, 23 skipped`). Detection Engineer blind review completed for the regenerated Loop 61 dataset at `scenarios/iteration-test/data`; reviewer verdict: Synthetic, 63/100 confidence. Main findings: one PROXY-01 sshd accepted-login lifecycle gap/self-source artifact and Windows 4648 explicit-credential caller PID/image provenance ambiguity around `WS-MCHEN-01`. @@ -279,7 +281,7 @@ Verification is complete: dedicated `tests/unit/test_world_model.py` coverage wa - [x] **SUPERSEDED** Canonical emitter field provenance blind-review remaining findings from 78% synthetic review — superseded by later full-path storyline normalization, bash typo/path cleanup, proxy domain-class path/content profiles, and Sysmon follow-on ordering fixes. The still-current related work is now represented by web/session realism, imperfect observation/source coverage, and process lifecycle modeling TODOs. -- [ ] Source-specific process lifecycle completeness modeling — deferred design item. Add a configurable telemetry coverage/profile layer that can model realistic Security/Sysmon/eCAR missingness, ingestion delay, audit-policy gaps, and endpoint coverage variance without ad hoc omissions in individual emitters. This should be part of the broader cross-source distribution realism layer, not a Windows-only workaround. 
+- [x] **SUPERSEDED** Source-specific process lifecycle completeness modeling — the broad requirement is now covered by named observation profiles plus the host/activity profile layer. Observation profiles model deterministic source-family missingness/delay/coverage semantics for Security/Sysmon/eCAR and other sources, while host activity profiles add endpoint/source volume variance; the remaining narrower deployment-topology gap is tracked as configurable per-host/source log deployment coverage. - [x] Open PR consolidation into `dev` — re-applied the storyline typing-cadence monotonicity fix from PR #81, folded Dependabot pytest/Pygments updates into the dev workflow, and added Dependabot configuration so future dependency PRs target `dev`. @@ -601,7 +603,7 @@ Data works but experienced analysts spot tells. Grouped by format for efficient - [x] **P2** Per-host-type event rate multiplier — implemented as implicit host/activity profile defaults rather than scenario YAML fields. Domain controllers, file servers, web servers, proxies, Linux servers, and workstations now receive role/family/persona-specific multipliers across baseline activity, auth, endpoint, network, and source-specific noise. - [x] Configurable per-entity artifact variation — implemented in the host/activity profile layer for baseline artifact texture, including stable per-host encoded PowerShell variants and profile-owned endpoint activity scaling. - [x] Configurable per-host volume variance — implemented via stable host/persona/role multipliers applied across major activity families so hosts no longer share narrow uniform volume bands by construction. -- [ ] Configurable per-host/source log deployment coverage — observation profiles now support source-family gaps and host-scoped missingness multipliers, but explicit per-host source enablement/disablement remains future work. A later setting should model named host groups, disabled sensors, partial deployments, and collection windows when users need topology-level telemetry coverage differences rather than event-level missingness. +- [ ] Configurable per-host/source log deployment coverage — observation profiles now support source-family gaps and host-scoped missingness multipliers, but explicit per-host source enablement/disablement remains future work. A later setting should model named host groups, disabled sensors, partial deployments, and collection windows when users need topology-level telemetry coverage differences rather than event-level missingness or host/activity volume variance. - [ ] **P2** Generation speed and efficiency follow-up — Sprint 4 host/activity realism is functionally verified, but the slow-inclusive suite exposed that `pytest-cov` plus `tracemalloc` can make the medium dataset memory test pathological. A future sprint should profile generation without instrumentation noise, identify hot paths introduced by richer host activity/web fanout/firewall texture, and decide whether to optimize generation, mark the memory test `--no-cov`, or relax/update stale performance assertions. - [x] DNS IP pool reuse causes cross-provider resolution (CloudFront→Microsoft IPs, etc.) — domain-first selection ensures consistent domain→IP mapping via FORWARD_DNS - [x] AWS region mismatch between DNS PTR and SSL SNI for same IP — AWS hostname/PTR generation now derives a stable per-IP region/edge identity and PTR generation respects known forward hostname context. 
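For reviewers who want to poke at the new sidecar directly, here is a minimal sketch of loading a generated manifest and listing the storyline steps that eval will treat as expected gaps. The dataset path is an assumption for illustration; the imports and properties are the APIs this patch adds.

```python
from pathlib import Path

from evidenceforge.events.observation_manifest import load_observation_manifest

# The dataset path is illustrative; the loader checks both
# <output_dir>/OBSERVATION_MANIFEST.json and the parent scenario directory.
manifest = load_observation_manifest(Path("scenarios/iteration-test/data"))
if manifest is not None and manifest.observation_profile != "complete":
    for event in manifest.storyline_events:
        # Eval treats a step as an expected gap only when no source row is
        # visible or delayed and at least one is dropped/filtered/out_of_window,
        # matching the _event_observation_exempt check added below.
        if event.visible_or_delayed_count == 0 and event.non_visible_count > 0:
            print(f"{event.storyline_id}: expected non-visible on {event.system}")
```
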
diff --git a/commands/eforge/config.md b/commands/eforge/config.md
index 17a026e3..59ecdb12 100644
--- a/commands/eforge/config.md
+++ b/commands/eforge/config.md
@@ -71,7 +71,7 @@ When writing to the overlay, files are partial — they contain ONLY the user's
 | Modify baseline auth noise | `auth_noise.yaml` | (standalone — stale scheduled-credential accounts and irregular recurrence timing) |
 | Modify endpoint background noise | `endpoint_noise.yaml` | (standalone — scheduled-process timing and DHCP registry emission policy) |
 | Modify host activity distribution | `host_activity_profiles.yaml` | (standalone — host/persona/role rate-family multipliers, firewall deny bursts, and artifact variants) |
-| Modify source observation coverage | `observation_profiles.yaml` | Scenario `observation_profile` selects the named profile; keep `complete` as the default training profile |
+| Modify source observation coverage | `observation_profiles.yaml` | Scenario `observation_profile` selects the named profile; generated `OBSERVATION_MANIFEST.json` lets eval account for expected gaps; keep `complete` as the default training profile |
 | Modify causal/source timing | `timing_profiles.yaml` | (standalone — causal prerequisite, source latency, teardown, and Windows/Sysmon collision-spacing knobs) |
 | ~~Format definitions~~ | Not user-customizable | Engine internals — requires code changes |
 | ~~Evaluation rules~~ | Not user-customizable | Must match format definitions — requires code changes |
diff --git a/commands/eforge/evaluate.md b/commands/eforge/evaluate.md
index e9c5ed26..7a2c7765 100644
--- a/commands/eforge/evaluate.md
+++ b/commands/eforge/evaluate.md
@@ -36,6 +36,7 @@ scenarios/<name>/
   scenario.yaml
   ENVIRONMENT.md
   GROUND_TRUTH.md
+  OBSERVATION_MANIFEST.json   ← optional, generated for source-observation-aware eval
   data/                       ← this is the output_dir for eforge eval
 ```
@@ -65,6 +66,12 @@ Present a clear summary of the evaluation results. The report shows two tiers fo
 - **Minimum** (hard gate): must pass or the dataset fails overall
 - **Aspirational** (informational): a stretch target; failure here is noted but does not fail the dataset
 
+If the scenario uses an `observation_profile` other than `complete`, check whether the report says
+the observation manifest was loaded. With a manifest, coverage-style causality sub-scores may be
+adjusted for expected source gaps and will show a `raw` score whenever the adjustment changes the result.
+Do not describe this as a lowered threshold: visible contradictions, parseability failures,
+source-native field mismatches, and missing evidence that the manifest marks `visible` or `delayed`
+remain real failures.
+
 For each pillar, explain what the score means in practical terms:
 
 **Pillar 1: Parseability (weight 0.30)**
@@ -81,11 +88,11 @@ For each pillar, explain what the score means in practical terms:
 
 **Pillar 3: Causality (weight 0.25)**
 - Causal Ordering: Are logon→process→logoff sequences correctly ordered? DNS before TCP? Kerberos TGT/TGS before domain logons?
-- Storyline Event Presence: Are all storyline events visible in at least one log source?
+- Storyline Event Presence: Are all expected-visible storyline events visible in at least one log source? For non-`complete` observation profiles with a manifest, source rows marked `dropped`, `filtered`, or `out_of_window` are excluded from this coverage denominator.
 - Indicator Accuracy: Do traces carry the correct IPs, usernames, hostnames from the scenario?
-- Pivot Linkability: Can a hunter pivot between consecutive attack steps using shared field values?
-- Storyline Temporal Integrity: Are attack events in the right relative order at the right times? -- Storyline Trace Coverage: For each expected log format on each involved host, does the storyline leave a trace? +- Pivot Linkability: Can a hunter pivot between consecutive expected-visible attack steps using shared field values? +- Storyline Temporal Integrity: Are expected-visible attack events in the right relative order at the right times? +- Storyline Trace Coverage: For each expected-visible log format group on each involved host, does the storyline leave a trace? **Pillar 4: Timing (weight 0.20)** - Attack-Chain Timing: Do elapsed times between consecutive storyline steps fall within plausible bounds? Bounds come from `timing_bounds.yaml` — default 5s–2h, with per-action-type overrides (e.g., lateral movement: 30s–1h, exfiltration: 60s–24h). First matching keyword in the step activity wins. diff --git a/commands/eforge/references/config-dependency-graph.md b/commands/eforge/references/config-dependency-graph.md index c3ee6dd8..38010c95 100644 --- a/commands/eforge/references/config-dependency-graph.md +++ b/commands/eforge/references/config-dependency-graph.md @@ -170,7 +170,7 @@ Each row is a file; columns show what it depends on and what depends on it. | Direction | File | Relationship | |-----------|------|-------------| | depends on | scenario `observation_profile` | The scenario selects a named profile; the profile file owns source-level missingness/delay values | -| **depended on by** | Event dispatcher, GROUND_TRUTH.md | Applies deterministic source-observation drops/delays after canonical state updates and reports source evidence status | +| **depended on by** | Event dispatcher, GROUND_TRUTH.md, OBSERVATION_MANIFEST.json, `eforge eval` | Applies deterministic source-observation drops/delays after canonical state updates, reports source evidence status, and lets eval distinguish expected gaps from missing visible evidence | | validated by | `eforge validate-config` and `eforge validate` | Config validation checks source-family names/ranges; scenario validation checks that the named profile exists | ### network_params.yaml diff --git a/commands/eforge/references/config-evaluation.md b/commands/eforge/references/config-evaluation.md index d84a09fc..5e0d3e68 100644 --- a/commands/eforge/references/config-evaluation.md +++ b/commands/eforge/references/config-evaluation.md @@ -21,6 +21,15 @@ Schema documentation for data quality evaluation rule files in `src/evidenceforg Controls the two-tier acceptance model for `eforge eval`. Each sub-score has a **minimum** (hard gate: dataset fails if below) and an **aspirational** target (informational stretch goal). Pillar weights must sum to 1.0. +When a generated dataset includes `OBSERVATION_MANIFEST.json` beside `GROUND_TRUTH.md`, +`eforge eval` automatically applies observation-aware coverage scoring. Non-`complete` +profiles can adjust only coverage-style causality sub-scores (`event_presence`, +`pivot_linkability`, `temporal_integrity`, and `storyline_trace_coverage`) by excluding +evidence that the manifest marks `dropped`, `filtered`, or `out_of_window`. Source-native +correctness gates such as parseability, value plausibility, field agreement, and visible causal +ordering remain strict. Adjusted sub-scores expose `raw_score` in JSON and show `raw:` in +the text report. 
+ ### Structure ```yaml diff --git a/commands/eforge/references/config-host-activity.md b/commands/eforge/references/config-host-activity.md index 33634892..e4314509 100644 --- a/commands/eforge/references/config-host-activity.md +++ b/commands/eforge/references/config-host-activity.md @@ -430,6 +430,11 @@ profiles: Profiles are intentionally source-level, not event-type matrices. Scenario authors select a named profile; code owns safe source-native application semantics so new event types inherit their source-family default. Non-complete profiles may make evidence `visible`, `delayed`, `dropped`, `filtered`, or `out_of_window`, but must not create contradictory identifiers or field values across sources. +Generation writes `OBSERVATION_MANIFEST.json` beside `GROUND_TRUTH.md`. `eforge eval` uses this +sidecar to adjust only coverage-style causality scoring for expected missing evidence under +non-`complete` profiles. The raw score remains visible in the report, and source-native +correctness checks are not relaxed. + Valid source families are `windows_security`, `sysmon`, `ecar`, `syslog`, `bash_history`, `zeek`, `proxy`, `web`, `asa`, and `ids`. Run `eforge validate-config` after overlay changes; it rejects unknown source-family names, invalid probabilities, and inverted ranges. Run `eforge validate` on scenarios that use a non-default profile so unknown profile names are caught before generation. --- diff --git a/commands/eforge/references/scenario-reference.md b/commands/eforge/references/scenario-reference.md index 0820e334..bccfbefc 100644 --- a/commands/eforge/references/scenario-reference.md +++ b/commands/eforge/references/scenario-reference.md @@ -405,7 +405,8 @@ training-friendly perfect source coverage and correlation. Non-default profiles deterministic source-level missingness and source-native delays while preserving canonical truth: they can make evidence `visible`, `delayed`, `dropped`, `filtered`, or `out_of_window`, but they must not create contradictory users, PIDs, ports, hashes, UIDs, or session identifiers across -sources. `GROUND_TRUTH.md` records source evidence status when a non-complete profile is used. +sources. `GROUND_TRUTH.md` records source evidence status for instructors, and +`OBSERVATION_MANIFEST.json` records the same source-observation contract for automated eval. ## Storyline diff --git a/docs/design/data-quality-prd.md b/docs/design/data-quality-prd.md index 49d7d0a5..15a6c4c7 100644 --- a/docs/design/data-quality-prd.md +++ b/docs/design/data-quality-prd.md @@ -339,6 +339,12 @@ Every sub-score now has: Thresholds are stored in `src/evidenceforge/config/evaluation/thresholds.yaml` for tuning without code changes. Calibration against purpose-built scenarios is deferred to a separate pass. +Datasets generated with non-`complete` observation profiles include `OBSERVATION_MANIFEST.json`. +When present, eval uses it to adjust coverage-style causality sub-scores for evidence that was +intentionally `dropped`, `filtered`, or `out_of_window`. Hard correctness gates remain strict: +observation profiles do not excuse parse failures, impossible values, source-native contradictions, +or evidence marked `visible`/`delayed` but missing from logs. + ### Calibration Plan Thresholds are currently judgment-based. After the restructure is stable, the plan is to design purpose-built calibration scenarios (known-good and known-bad), run `eforge eval` against them, and use the results to propose empirically grounded threshold values. Out of scope for v0.5.1. 
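To make the adjustment concrete, here is a small sketch of the adjusted-versus-raw arithmetic the coverage sub-scores use. Helper and parameter names are illustrative, not the evaluator's API; the split matches the `raw_score`/`score` bookkeeping in the causality scorer below.

```python
def coverage_scores(found: int, kept: int, excluded: int) -> tuple[float, float]:
    """Mirror the adjusted/raw split used by the coverage sub-scores.

    `excluded` counts storyline items whose manifest rows are all
    dropped/filtered/out_of_window; they leave the adjusted denominator
    but remain in the raw one.
    """
    adjusted = 100.0 * found / kept if kept else 100.0
    raw_total = kept + excluded
    raw = 100.0 * found / raw_total if raw_total else 100.0
    return adjusted, raw


# 8 of 10 storyline steps leave traces and the other 2 are expected gaps:
# adjusted is 100.0 (8/8 expected-visible), raw is 80.0 (8/10 overall).
print(coverage_scores(found=8, kept=8, excluded=2))
```
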
diff --git a/docs/reference/CUSTOMIZING_CONFIG.md b/docs/reference/CUSTOMIZING_CONFIG.md index 286baf38..d46590bd 100644 --- a/docs/reference/CUSTOMIZING_CONFIG.md +++ b/docs/reference/CUSTOMIZING_CONFIG.md @@ -193,6 +193,14 @@ The `eforge eval` scoring rules are also YAML-based and can be tuned per-project All eval config files live in `src/evidenceforge/config/evaluation/`. They are **not** overlaid from `.eforge/config/` — edit them in-place if you want project-specific tuning, or copy the package files into your project and set the `EFORGE_EVAL_CONFIG_DIR` environment variable to point to your copies. +Generated scenario directories may also include `OBSERVATION_MANIFEST.json` beside +`GROUND_TRUTH.md`. `eforge eval` loads this sidecar automatically when present. For +non-`complete` observation profiles, causality coverage metrics use the manifest to exclude +source evidence that was intentionally `dropped`, `filtered`, or `out_of_window`, while still +failing visible contradictions, parse errors, value mismatches, and missing evidence that the +manifest marks `visible` or `delayed`. Text and JSON reports keep the adjusted score and expose +the raw score for affected sub-scores. + For full schema documentation for each file, see the skill reference: `/eforge:references:config-evaluation`. ## Reference Documentation diff --git a/docs/reference/scenario-reference.md b/docs/reference/scenario-reference.md index 118fa2bd..c5ada98f 100644 --- a/docs/reference/scenario-reference.md +++ b/docs/reference/scenario-reference.md @@ -405,7 +405,8 @@ training-friendly perfect source coverage and correlation. Non-default profiles deterministic source-level missingness and source-native delays while preserving canonical truth: they can make evidence `visible`, `delayed`, `dropped`, `filtered`, or `out_of_window`, but they must not create contradictory users, PIDs, ports, hashes, UIDs, or session identifiers across -sources. `GROUND_TRUTH.md` records source evidence status when a non-complete profile is used. +sources. `GROUND_TRUTH.md` records source evidence status for instructors, and +`OBSERVATION_MANIFEST.json` records the same source-observation contract for automated eval. ## Storyline diff --git a/scenarios/ITERATION-TEST-PROMPT.md b/scenarios/ITERATION-TEST-PROMPT.md index 199cf680..554c1455 100644 --- a/scenarios/ITERATION-TEST-PROMPT.md +++ b/scenarios/ITERATION-TEST-PROMPT.md @@ -39,12 +39,12 @@ default_action: deny, deny_ratio: 2.0, drop_mode: drop, threat_detection_rate: 10, nat_rules: - type: dynamic_pat - src: [corporate_lan, server_vlan] - mapped_ip: 45.33.32.1 + src: [corporate_lan, server_vlan, dmz] + mapped_ip: 203.14.220.1 - type: static src: dmz real_ip: 10.10.3.10 (WEB-EXT-01) - mapped_ip: 45.33.32.10 + mapped_ip: 203.14.220.10 policy: - {src: external, dst: dmz, ports: [80, 443]} - {src: corporate_lan, dst: any} @@ -161,12 +161,12 @@ service_file_name: "%SystemRoot%\PSEXESVC.exe") + process events for commands run under the service. Do NOT use "cmd.exe /c PSEXESVC.exe" — that produces the wrong parent chain. - 15. Privilege Escalation (+4h15m): Create backdoor account svc_sqlreader (account_created event), + 15. Privilege Escalation (+4h15m): Create backdoor account svc_mhsync (account_created event), add to Domain Admins (group_member_added event). Actor: SYSTEM on DC-01. - 16. Persistence (+4h20m): Install service "HealthMonitorSvc" (service_installed event with + 16. 
Persistence (+4h20m): Install service "DeviceSyncSvc" (service_installed event with service_name, service_file_name, service_account) and create scheduled task - "\Microsoft\Windows\Maintenance\SystemHealthCheck" (scheduled_task_created event) on DC-01. + "\Microsoft\Windows\Maintenance\DeviceSync" (scheduled_task_created event) on DC-01. 17. C2 Beaconing (+4h30m): HTTPS beacon from DC-01 to 45.33.32.30:443 (beacon event with interval: "10m", duration: "1h30m", jitter: 0.3, hostname, user_agent, method: GET, @@ -178,14 +178,14 @@ internal sensors only. 19. DNS Tunneling (+4h45m): Exfiltrate data via DNS tunnel from APP-INT-01 (dns_tunnel event - with base_domain: "ns1.cdn-health-updates.net", encoding: hex, qtype: TXT, interval: "2s", + with base_domain: "ns1.westbridge-services.net", encoding: hex, qtype: TXT, interval: "2s", duration: "15m", payload_size: 512). 20. DGA Activity (+5h): DGA queries from WEB-EXT-01 (dga_queries event with tld: ".net", length_range: [10, 18], interval: "30s", duration: "45m", rcode_distribution for mostly NXDOMAIN). - 21. Collection (+5h): Authenticate to FILE-SRV-01 with backdoor account svc_sqlreader + 21. Collection (+5h): Authenticate to FILE-SRV-01 with backdoor account svc_mhsync (logon event, type 3), enumerate shares, stage financial and patient data, compress with PowerShell Compress-Archive. @@ -195,9 +195,9 @@ 23. Workstation Lock (+5h20m): Attacker locks the compromised workstation before stepping away (workstation_lock event) — exercises EventID 4800. - 24. Exfiltration (+5h25m): Upload archive to cdn-assets-update.com (45.33.32.30) over HTTPS + 24. Exfiltration (+5h25m): Upload archive to api.westbridge-services.net (45.33.32.30) over HTTPS (connection event with HTTP fields, method: POST, large orig_bytes — use a physically - plausible value in the 100-500 MB range, NOT multi-GB). + plausible non-round value in the 100-500 MB range, NOT multi-GB or a power-of-two anchor). 25. Workstation Unlock (+5h35m): Attacker returns, unlocks workstation (workstation_unlock event) — exercises EventID 4801. @@ -212,8 +212,8 @@ 28. Ongoing C2 (+5h, +5h30m): Periodic beacons from WEB-EXT-01 to 45.33.32.30:443 (separate beacon events). - 29. Account Cleanup (+5h50m): Delete the backdoor account svc_sqlreader (account_deleted event - with target_username: svc_sqlreader). + 29. Account Cleanup (+5h50m): Delete the backdoor account svc_mhsync (account_deleted event + with target_username: svc_mhsync). 30. Logoff (+5h55m): Attacker logs off from compromised systems (logoff events). 
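Step 17's beacon contract (10m interval, 0.3 jitter, 1h30m duration) is the kind of spec the generator turns into concrete timestamps. A rough sketch of that scheduling follows, with illustrative names and no claim about the generator's actual implementation:

```python
import random
from datetime import datetime, timedelta


def beacon_times(start: datetime, interval_s: int = 600, jitter: float = 0.3,
                 duration_s: int = 5400, seed: int = 7):
    """Yield jittered beacon timestamps: a 10m base interval with +/-30%
    jitter over 1h30m, per step 17. Names and signature are illustrative."""
    rng = random.Random(seed)
    current = start
    end = start + timedelta(seconds=duration_s)
    while current <= end:
        yield current
        # Each gap is the base interval scaled by a uniform jitter factor.
        current += timedelta(seconds=interval_s * (1 + rng.uniform(-jitter, jitter)))


for ts in beacon_times(datetime(2024, 1, 15, 14, 30)):
    print(ts.isoformat())
```
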
diff --git a/src/evidenceforge/cli/commands.py b/src/evidenceforge/cli/commands.py index 04793d25..632dca4a 100644 --- a/src/evidenceforge/cli/commands.py +++ b/src/evidenceforge/cli/commands.py @@ -250,6 +250,8 @@ def generate( data_dir = scenario_dir / "data" ground_truth_dir = scenario_dir + from evidenceforge.events.observation_manifest import OBSERVATION_MANIFEST_FILENAME + # Apply --formats filter (intersection with scenario output.logs) if formats: from evidenceforge.events.dispatcher import expand_formats @@ -284,6 +286,9 @@ def generate( gt_path = ground_truth_dir / "GROUND_TRUTH.md" if gt_path.exists(): existing.append(f" GROUND_TRUTH.md ({gt_path})") + manifest_path = ground_truth_dir / OBSERVATION_MANIFEST_FILENAME + if manifest_path.exists(): + existing.append(f" {OBSERVATION_MANIFEST_FILENAME} ({manifest_path})") has_existing = bool(existing) if has_existing: @@ -386,6 +391,7 @@ def progress_callback(event_type: str, data: dict) -> None: # as a matched pair — partial preservation is never valid. if staging_dir: staged_gt = gen_gt_dir / "GROUND_TRUTH.md" + staged_manifest = gen_gt_dir / OBSERVATION_MANIFEST_FILENAME if not gen_data_dir.exists(): raise RuntimeError("Staged data/ directory missing after generation") if not staged_gt.exists(): @@ -404,10 +410,14 @@ def progress_callback(event_type: str, data: dict) -> None: data_dir.rename(rollback_dir / "data") if gt_path.exists(): gt_path.rename(rollback_dir / "GROUND_TRUTH.md") + if manifest_path.exists(): + manifest_path.rename(rollback_dir / OBSERVATION_MANIFEST_FILENAME) # Step 2: Install new output gen_data_dir.rename(data_dir) staged_gt.rename(gt_path) + if staged_manifest.exists(): + staged_manifest.rename(manifest_path) swap_succeeded = True except BaseException: @@ -417,10 +427,15 @@ def progress_callback(event_type: str, data: dict) -> None: shutil.rmtree(data_dir) if gt_path.exists() and (rollback_dir / "GROUND_TRUTH.md").exists(): gt_path.unlink() + if manifest_path.exists(): + manifest_path.unlink() if (rollback_dir / "data").exists(): (rollback_dir / "data").rename(data_dir) if (rollback_dir / "GROUND_TRUTH.md").exists(): (rollback_dir / "GROUND_TRUTH.md").rename(gt_path) + rollback_manifest = rollback_dir / OBSERVATION_MANIFEST_FILENAME + if rollback_manifest.exists(): + rollback_manifest.rename(manifest_path) except Exception: logger.error("Rollback failed — old output may be in: %s", rollback_dir) raise @@ -435,10 +450,13 @@ def progress_callback(event_type: str, data: dict) -> None: console.print("\nGenerated files:") console.print(f" Scenario directory: {ground_truth_dir}") - # List files in scenario root (GROUND_TRUTH.md) + # List files in scenario root (GROUND_TRUTH.md + machine-readable sidecars) if ground_truth_dir.exists(): for file in sorted(ground_truth_dir.iterdir()): - if file.is_file() and file.name == "GROUND_TRUTH.md": + if file.is_file() and file.name in { + "GROUND_TRUTH.md", + OBSERVATION_MANIFEST_FILENAME, + }: size = file.stat().st_size size_str = f"{size:,} bytes" if size < 1024 else f"{size / 1024:.1f} KB" console.print(f" • {file.name} ({size_str})") diff --git a/src/evidenceforge/config/activity/README.md b/src/evidenceforge/config/activity/README.md index 684bbb1a..6c3d3762 100644 --- a/src/evidenceforge/config/activity/README.md +++ b/src/evidenceforge/config/activity/README.md @@ -24,7 +24,7 @@ caches data after first load. 
Two files (`network_params.yaml`, | `auth_noise.yaml` | `auth_noise.py` | Baseline authentication-noise profiles such as stale scheduled-credential account pools and irregular recurrence timing. | | `endpoint_noise.yaml` | `endpoint_noise.py` | Endpoint background timing and registry-emission policies for Windows scheduled processes and DHCP interface registry writes. | | `host_activity_profiles.yaml` | `host_activity_profiles.py` | Coarse host/persona/role rate multipliers for baseline volume, endpoint noise, firewall deny bursts, and data-driven artifact variation. | -| `observation_profiles.yaml` | `config/observation_profiles.py` | Named source-observation profiles for optional source-level missingness and delays. Scenario `observation_profile` defaults to `complete`. | +| `observation_profiles.yaml` | `config/observation_profiles.py` | Named source-observation profiles for optional source-level missingness and delays. Scenario `observation_profile` defaults to `complete`; generation records status in `OBSERVATION_MANIFEST.json` for eval. | | `proxy_uri_templates.yaml` | `proxy_uri.py` | Per-domain URI path templates for proxy logs (Windows Update, CRL, OCSP, Azure AD, etc.). | | `network_params.yaml` | `network_params.py`, `engine/emitter_setup.py` | MAC address OUI prefixes, public NTP fallback servers, and DNS tunnel RTT bounds. | | `systemd_schedules.yaml` | `engine/baseline.py` | Systemd timer and cron job schedules (logrotate, fstrim, apt-daily, etc.). | diff --git a/src/evidenceforge/evaluation/context.py b/src/evidenceforge/evaluation/context.py new file mode 100644 index 00000000..e8c8406c --- /dev/null +++ b/src/evidenceforge/evaluation/context.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026 Cisco Systems, Inc. and its affiliates +# SPDX-License-Identifier: MIT + +"""Shared context passed to evaluation pillar scorers.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from evidenceforge.events.observation_manifest import ObservationManifest + + +@dataclass(frozen=True, slots=True) +class EvaluationContext: + """Additional dataset metadata available to scorers.""" + + observation_manifest: ObservationManifest | None = None diff --git a/src/evidenceforge/evaluation/dimensions/__init__.py b/src/evidenceforge/evaluation/dimensions/__init__.py index 4cadbd69..7bc0c8b8 100644 --- a/src/evidenceforge/evaluation/dimensions/__init__.py +++ b/src/evidenceforge/evaluation/dimensions/__init__.py @@ -26,6 +26,7 @@ from collections.abc import Callable, Iterable from typing import Any +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.models import PillarScore, SubScore from evidenceforge.evaluation.parsers import ParsedRecord from evidenceforge.models.scenario import Scenario @@ -71,6 +72,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: """Score a dataset on this pillar. @@ -78,6 +80,7 @@ def score( Args: records: Parsed records grouped by format name. scenario: The scenario used to generate the dataset. + context: Optional metadata sidecars discovered for the dataset. progress: Optional callback for reporting sub-score progress. 
Returns: diff --git a/src/evidenceforge/evaluation/engine.py b/src/evidenceforge/evaluation/engine.py index 1c4c1257..1255d9bb 100644 --- a/src/evidenceforge/evaluation/engine.py +++ b/src/evidenceforge/evaluation/engine.py @@ -30,6 +30,7 @@ from datetime import UTC, datetime from pathlib import Path +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import DimensionScorer, ProgressCallback, _noop_callback from evidenceforge.evaluation.models import ( AcceptanceCriterion, @@ -44,6 +45,7 @@ TimingScorer, ) from evidenceforge.evaluation.thresholds import EvalThresholds, load_thresholds +from evidenceforge.events.observation_manifest import load_observation_manifest from evidenceforge.models.scenario import Scenario logger = logging.getLogger(__name__) @@ -168,6 +170,8 @@ def run(self) -> QualityReport: ) logger.info(f"Parsed {total_records} records across {len(source_counts)} sources") + observation_manifest = load_observation_manifest(self.output_dir) + context = EvaluationContext(observation_manifest=observation_manifest) # 2. Run each available pillar scorer total_pillars = len(DIMENSION_SCORERS) @@ -186,7 +190,12 @@ def run(self) -> QualityReport: logger.info(f"Scoring Pillar {scorer.number}: {scorer.name}") pillar_score: PillarScore try: - pillar_score = scorer.score(records, self.scenario, progress=self._progress) + pillar_score = scorer.score( + records, + self.scenario, + context=context, + progress=self._progress, + ) pillars.append(pillar_score) except Exception: logger.exception(f"Pillar {scorer.number} scoring failed") @@ -225,6 +234,18 @@ def run(self) -> QualityReport: supplementary: dict = {} for pillar in pillars: supplementary.update(pillar.supplementary) + if observation_manifest is not None: + supplementary["observation_profile"] = { + "profile": observation_manifest.observation_profile, + "manifest_present": True, + "source_summary": observation_manifest.source_summary, + } + elif self.scenario.observation_profile != "complete": + supplementary["observation_profile"] = { + "profile": self.scenario.observation_profile, + "manifest_present": False, + "source_summary": {}, + } return QualityReport( scenario_name=self.scenario.name, diff --git a/src/evidenceforge/evaluation/models.py b/src/evidenceforge/evaluation/models.py index 1db1c346..2361f5c3 100644 --- a/src/evidenceforge/evaluation/models.py +++ b/src/evidenceforge/evaluation/models.py @@ -19,6 +19,10 @@ class SubScore(BaseModel): key: str weight: float = Field(ge=0.0, le=1.0) score: float | None = Field(None, ge=0.0, le=100.0) + raw_score: float | None = Field(None, ge=0.0, le=100.0) + """Unadjusted score when profile-aware scoring changes the displayed score.""" + adjusted: bool = False + """True when the score excludes expected observation-profile gaps.""" details: str = "" sample_failures: list[str] = Field(default_factory=list) failure_summary: dict[str, dict[str, int]] = Field(default_factory=dict) diff --git a/src/evidenceforge/evaluation/pillars/causality.py b/src/evidenceforge/evaluation/pillars/causality.py index 5de77d37..c07a4244 100644 --- a/src/evidenceforge/evaluation/pillars/causality.py +++ b/src/evidenceforge/evaluation/pillars/causality.py @@ -39,6 +39,7 @@ from urllib.parse import urlsplit from evidenceforge.evaluation._shared import _condition_matches, _extract_hostname, _normalize_ts +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -55,6 
+56,8 @@ resolve_storyline, ) from evidenceforge.evaluation.visibility import VisibilityModel +from evidenceforge.events.observation import source_family_for_format +from evidenceforge.events.observation_manifest import ObservationManifestEvent from evidenceforge.models.scenario import Scenario from evidenceforge.utils.time import parse_duration @@ -70,8 +73,10 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: + context = context or EvaluationContext() storyline = scenario.storyline or [] resolved: list[ResolvedEvent] = [] @@ -99,7 +104,7 @@ def score( progress("sub_score_done", {"name": "Causal Ordering", "score": s1.score}) progress("sub_score_start", {"name": "Event Presence", "step": 2, "total": 6}) - s2 = self._score_event_presence(resolved) + s2 = self._score_event_presence(resolved, context) progress("sub_score_done", {"name": "Event Presence", "score": s2.score}) progress("sub_score_start", {"name": "Indicator Accuracy", "step": 3, "total": 6}) @@ -107,15 +112,15 @@ def score( progress("sub_score_done", {"name": "Indicator Accuracy", "score": s3.score}) progress("sub_score_start", {"name": "Pivot Linkability", "step": 4, "total": 6}) - s4 = self._score_pivot_linkability(resolved) + s4 = self._score_pivot_linkability(resolved, context) progress("sub_score_done", {"name": "Pivot Linkability", "score": s4.score}) progress("sub_score_start", {"name": "Temporal Integrity", "step": 5, "total": 6}) - s5 = self._score_temporal_integrity(resolved) + s5 = self._score_temporal_integrity(resolved, context) progress("sub_score_done", {"name": "Temporal Integrity", "score": s5.score}) progress("sub_score_start", {"name": "Storyline Trace Coverage", "step": 6, "total": 6}) - s6 = self._score_storyline_trace_coverage(resolved, vis, host_time_index) + s6 = self._score_storyline_trace_coverage(resolved, vis, host_time_index, context) progress("sub_score_done", {"name": "Storyline Trace Coverage", "score": s6.score}) sub_scores = [s1, s2, s3, s4, s5, s6] @@ -188,6 +193,71 @@ def _find_traces( traces = self._search_for_event_indexed(event, event_type, host_time_index) event.traces.extend(traces) + # --- Observation-profile adjustment helpers --- + + @staticmethod + def _manifest_event( + event: ResolvedEvent, + context: EvaluationContext, + ) -> ObservationManifestEvent | None: + manifest = context.observation_manifest + if manifest is None or manifest.observation_profile == "complete": + return None + return manifest.storyline_by_id().get(event.storyline_id) + + @classmethod + def _event_observation_exempt( + cls, + event: ResolvedEvent, + context: EvaluationContext, + ) -> bool: + manifest_event = cls._manifest_event(event, context) + if manifest_event is None: + return False + return manifest_event.visible_or_delayed_count == 0 and manifest_event.non_visible_count > 0 + + @classmethod + def _format_group_observation_exempt( + cls, + event: ResolvedEvent, + group_formats: set[str], + context: EvaluationContext, + ) -> bool: + manifest_event = cls._manifest_event(event, context) + if manifest_event is None: + return False + source_families = {source_family_for_format(fmt) for fmt in group_formats} + relevant = { + source: counts + for source, counts in manifest_event.source_status.items() + if source in source_families + } + if not relevant: + return False + visible_or_delayed = sum( + counts.get("visible", 0) + counts.get("delayed", 0) for counts in 
relevant.values() + ) + non_visible = sum( + counts.get("dropped", 0) + counts.get("filtered", 0) + counts.get("out_of_window", 0) + for counts in relevant.values() + ) + return visible_or_delayed == 0 and non_visible > 0 + + @staticmethod + def _adjusted_details( + adjusted_details: str, + raw_found: int, + raw_total: int, + excluded: int, + ) -> str: + if excluded <= 0: + return adjusted_details + raw_score = (100.0 * raw_found / raw_total) if raw_total > 0 else 100.0 + return ( + f"{adjusted_details}; raw {raw_found}/{raw_total} ({raw_score:.1f}/100), " + f"{excluded} excluded by observation profile" + ) + def _search_for_event_indexed( self, event: ResolvedEvent, @@ -830,7 +900,11 @@ def _score_causal_ordering( # --- Sub-score 2: Event Presence --- - def _score_event_presence(self, resolved: list[ResolvedEvent]) -> SubScore: + def _score_event_presence( + self, + resolved: list[ResolvedEvent], + context: EvaluationContext, + ) -> SubScore: if not resolved: return SubScore( name="Event Presence", @@ -839,20 +913,39 @@ def _score_event_presence(self, resolved: list[ResolvedEvent]) -> SubScore: score=100.0, details="No storyline events", ) - total = len(resolved) - found = sum(1 for e in resolved if e.traces) + raw_total = len(resolved) + raw_found = sum(1 for e in resolved if e.traces) + total = 0 + found = 0 + excluded = 0 + for event in resolved: + if event.traces: + total += 1 + found += 1 + elif self._event_observation_exempt(event, context): + excluded += 1 + else: + total += 1 failures = [ f"Event {e.index}: {e.actor}@{e.system} '{e.activity[:60]}' — no traces" for e in resolved - if not e.traces + if not e.traces and not self._event_observation_exempt(e, context) ] score = (100.0 * found / total) if total > 0 else 100.0 + raw_score = (100.0 * raw_found / raw_total) if raw_total > 0 else 100.0 return SubScore( name="Event Presence", key="event_presence", weight=0.20, score=score, - details=f"{found}/{total} storyline events have traces in logs", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{found}/{total} expected-visible storyline events have traces in logs", + raw_found, + raw_total, + excluded, + ), sample_failures=failures[:10], ) @@ -966,7 +1059,11 @@ def _best_sub_detail(event: ResolvedEvent, fields: dict) -> dict[str, Any]: # --- Sub-score 4: Pivot Linkability --- - def _score_pivot_linkability(self, resolved: list[ResolvedEvent]) -> SubScore: + def _score_pivot_linkability( + self, + resolved: list[ResolvedEvent], + context: EvaluationContext, + ) -> SubScore: if len(resolved) < 2: return SubScore( name="Pivot Linkability", @@ -975,12 +1072,26 @@ def _score_pivot_linkability(self, resolved: list[ResolvedEvent]) -> SubScore: score=100.0, details="Fewer than 2 events — nothing to link", ) - total_pairs = len(resolved) - 1 + raw_total_pairs = len(resolved) - 1 + raw_linkable = 0 + total_pairs = 0 linkable = 0 + excluded = 0 failures: list[str] = [] - for i in range(total_pairs): + for i in range(raw_total_pairs): a, b = resolved[i], resolved[i + 1] - if self._extract_indicator_values(a) & self._extract_indicator_values(b): + pair_linkable = bool( + self._extract_indicator_values(a) & self._extract_indicator_values(b) + ) + if pair_linkable: + raw_linkable += 1 + if (not a.traces and self._event_observation_exempt(a, context)) or ( + not b.traces and self._event_observation_exempt(b, context) + ): + excluded += 1 + continue + total_pairs += 1 + if pair_linkable: linkable += 1 elif len(failures) < 10: 
failures.append( @@ -988,12 +1099,21 @@ def _score_pivot_linkability(self, resolved: list[ResolvedEvent]) -> SubScore: f"({a.actor}@{a.system} → {b.actor}@{b.system})" ) score = (100.0 * linkable / total_pairs) if total_pairs > 0 else 100.0 + raw_score = (100.0 * raw_linkable / raw_total_pairs) if raw_total_pairs > 0 else 100.0 return SubScore( name="Pivot Linkability", key="pivot_linkability", weight=0.15, score=score, - details=f"{linkable}/{total_pairs} consecutive pairs share a pivotable indicator", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{linkable}/{total_pairs} expected-visible consecutive pairs share a " + "pivotable indicator", + raw_linkable, + raw_total_pairs, + excluded, + ), sample_failures=failures, ) @@ -1025,7 +1145,11 @@ def _extract_indicator_values(self, event: ResolvedEvent) -> set[str]: # --- Sub-score 5: Temporal Integrity --- - def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: + def _score_temporal_integrity( + self, + resolved: list[ResolvedEvent], + context: EvaluationContext, + ) -> SubScore: if not resolved: return SubScore( name="Temporal Integrity", @@ -1034,13 +1158,20 @@ def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: score=100.0, details="No storyline events", ) - total = len(resolved) + raw_total = len(resolved) + raw_correct = 0 + total = 0 correct = 0 + excluded = 0 failures: list[str] = [] prev_earliest: datetime | None = None for event in resolved: if not event.traces: + if self._event_observation_exempt(event, context): + excluded += 1 + continue + total += 1 if len(failures) < 10: failures.append(f"Event {event.index}: no traces to verify timing") continue @@ -1056,12 +1187,14 @@ def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: if not trace_times: continue + total += 1 earliest = min(trace_times) time_ok = abs((earliest - event.time).total_seconds()) <= TIME_TOLERANCE.total_seconds() order_ok = prev_earliest is None or earliest >= prev_earliest - timedelta(seconds=5) if time_ok and order_ok: correct += 1 + raw_correct += 1 elif len(failures) < 10: if not time_ok: delta = (earliest - event.time).total_seconds() @@ -1075,12 +1208,20 @@ def _score_temporal_integrity(self, resolved: list[ResolvedEvent]) -> SubScore: prev_earliest = earliest score = (100.0 * correct / total) if total > 0 else 100.0 + raw_score = (100.0 * raw_correct / raw_total) if raw_total > 0 else 100.0 return SubScore( name="Temporal Integrity", key="temporal_integrity", weight=0.15, score=score, - details=f"{correct}/{total} events correctly timed and ordered", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{correct}/{total} expected-visible events correctly timed and ordered", + raw_correct, + raw_total, + excluded, + ), sample_failures=failures, ) @@ -1091,6 +1232,7 @@ def _score_storyline_trace_coverage( resolved: list[ResolvedEvent], vis: VisibilityModel, host_time_index: dict[str, dict[str, list[ParsedRecord]]], + context: EvaluationContext, ) -> SubScore: if not resolved: return SubScore( @@ -1101,8 +1243,11 @@ def _score_storyline_trace_coverage( details="No storyline events", ) + raw_total_expected = 0 + raw_found = 0 total_expected = 0 found = 0 + excluded = 0 failures: list[str] = [] for event in resolved: @@ -1120,7 +1265,7 @@ def _score_storyline_trace_coverage( lookup_keys.append(val) for group_name, group_formats in groups: - total_expected += 1 + 
raw_total_expected += 1 group_found = False for fmt in group_formats: if fmt not in host_time_index.get("__formats__", {fmt: True}): @@ -1145,20 +1290,35 @@ def _score_storyline_trace_coverage( break if group_found: + raw_found += 1 + total_expected += 1 found += 1 + elif self._format_group_observation_exempt(event, group_formats, context): + excluded += 1 elif len(failures) < 10: + total_expected += 1 failures.append( f"Event {event.index}: no trace in {group_name} group " f"for {event.actor}@{event.system}" ) + else: + total_expected += 1 score = (100.0 * found / total_expected) if total_expected > 0 else 100.0 + raw_score = (100.0 * raw_found / raw_total_expected) if raw_total_expected > 0 else 100.0 return SubScore( name="Storyline Trace Coverage", key="storyline_trace_coverage", weight=0.10, score=score, - details=f"{found}/{total_expected} expected format-traces found", + raw_score=raw_score if excluded else None, + adjusted=excluded > 0, + details=self._adjusted_details( + f"{found}/{total_expected} expected-visible format-traces found", + raw_found, + raw_total_expected, + excluded, + ), sample_failures=failures, ) diff --git a/src/evidenceforge/evaluation/pillars/parseability.py b/src/evidenceforge/evaluation/pillars/parseability.py index fcc8545c..4717db20 100644 --- a/src/evidenceforge/evaluation/pillars/parseability.py +++ b/src/evidenceforge/evaluation/pillars/parseability.py @@ -30,6 +30,7 @@ import logging from typing import Any +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -92,6 +93,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: progress("sub_score_start", {"name": "Spec Conformance", "step": 1, "total": 2}) diff --git a/src/evidenceforge/evaluation/pillars/plausibility.py b/src/evidenceforge/evaluation/pillars/plausibility.py index c43212ae..3643f162 100644 --- a/src/evidenceforge/evaluation/pillars/plausibility.py +++ b/src/evidenceforge/evaluation/pillars/plausibility.py @@ -45,6 +45,7 @@ _jensen_shannon_divergence, ) from evidenceforge.evaluation.anomaly import detect_anomalies +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -81,6 +82,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: enabled = {log_spec["format"] for log_spec in scenario.output.logs if "format" in log_spec} diff --git a/src/evidenceforge/evaluation/pillars/timing.py b/src/evidenceforge/evaluation/pillars/timing.py index 66978674..95e6989f 100644 --- a/src/evidenceforge/evaluation/pillars/timing.py +++ b/src/evidenceforge/evaluation/pillars/timing.py @@ -43,6 +43,7 @@ _extract_username, _jensen_shannon_2d, ) +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.dimensions import ( DimensionScorer, ProgressCallback, @@ -70,6 +71,7 @@ def score( self, records: dict[str, list[ParsedRecord]], scenario: Scenario, + context: EvaluationContext | None = None, progress: ProgressCallback = _noop_callback, ) -> PillarScore: user_events = _group_by_user(records) diff --git a/src/evidenceforge/evaluation/report.py b/src/evidenceforge/evaluation/report.py index 66d61912..2fcafb17 100644 --- 
a/src/evidenceforge/evaluation/report.py +++ b/src/evidenceforge/evaluation/report.py @@ -46,6 +46,11 @@ def format_text_report(report: QualityReport, console: Console, verbose: bool = ) if verbose and source_parts: console.print(f" ({source_parts})") + observation = report.supplementary.get("observation_profile") + if observation: + profile = observation.get("profile", "complete") + manifest_note = "manifest loaded" if observation.get("manifest_present") else "no manifest" + console.print(f"Observation profile: {profile} ({manifest_note})") console.print() @@ -183,6 +188,8 @@ def _print_sub_score( if ac.aspirational is not None and ac.meets_aspirational is not None: asp_tag = "[green]met[/green]" if ac.meets_aspirational else "[dim]below[/dim]" line += f" [asp:{ac.aspirational:.0f} {asp_tag}]" + if sub.adjusted and sub.raw_score is not None: + line += f" [dim]raw:{sub.raw_score:.0f}[/dim]" console.print(line) diff --git a/src/evidenceforge/evaluation/storyline.py b/src/evidenceforge/evaluation/storyline.py index 1629b919..25307185 100644 --- a/src/evidenceforge/evaluation/storyline.py +++ b/src/evidenceforge/evaluation/storyline.py @@ -105,6 +105,7 @@ class ResolvedEvent: event_types: list[str] sub_details: list[dict[str, Any]] = field(default_factory=list) traces: list[ParsedRecord] = field(default_factory=list) + storyline_id: str = "" def _parse_event_time(time_str: str, start_time: datetime) -> datetime: @@ -179,6 +180,7 @@ def resolve_storyline( details=details, event_types=event_types, sub_details=sub_details, + storyline_id=event.id, ) ) diff --git a/src/evidenceforge/events/observation_manifest.py b/src/evidenceforge/events/observation_manifest.py new file mode 100644 index 00000000..2f8344e6 --- /dev/null +++ b/src/evidenceforge/events/observation_manifest.py @@ -0,0 +1,177 @@ +# Copyright (c) 2026 Cisco Systems, Inc. 
and its affiliates +# SPDX-License-Identifier: MIT + +"""Machine-readable source-observation manifest for generated datasets.""" + +from __future__ import annotations + +import logging +from datetime import UTC, datetime +from pathlib import Path +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, ValidationError + +from evidenceforge.models.scenario import Scenario +from evidenceforge.utils.time import parse_duration + +logger = logging.getLogger(__name__) + +OBSERVATION_MANIFEST_FILENAME = "OBSERVATION_MANIFEST.json" + +ObservationManifestKind = Literal["storyline", "red_herring"] +ObservationStatusCounts = dict[str, dict[str, int]] +SourceEvidenceStatus = dict[str, ObservationStatusCounts] + + +class ObservationManifestEvent(BaseModel): + """Observation status for one storyline or red-herring cluster.""" + + kind: ObservationManifestKind + storyline_id: str + index: int = Field(ge=0) + actor: str + system: str + activity: str + event_types: list[str] = Field(default_factory=list) + source_status: ObservationStatusCounts = Field(default_factory=dict) + + model_config = ConfigDict(extra="forbid") + + @property + def visible_or_delayed_count(self) -> int: + """Return visible/delayed source-attempt count for this cluster.""" + return sum( + statuses.get("visible", 0) + statuses.get("delayed", 0) + for statuses in self.source_status.values() + ) + + @property + def non_visible_count(self) -> int: + """Return dropped/filtered/out-of-window source-attempt count for this cluster.""" + return sum( + statuses.get("dropped", 0) + + statuses.get("filtered", 0) + + statuses.get("out_of_window", 0) + for statuses in self.source_status.values() + ) + + +class ObservationManifest(BaseModel): + """Sidecar manifest describing source observation decisions for eval.""" + + schema_version: int = 1 + scenario_name: str + observation_profile: str + collection_window: dict[str, str | None] + source_summary: ObservationStatusCounts = Field(default_factory=dict) + storyline_events: list[ObservationManifestEvent] = Field(default_factory=list) + red_herring_events: list[ObservationManifestEvent] = Field(default_factory=list) + + model_config = ConfigDict(extra="forbid") + + def storyline_by_id(self) -> dict[str, ObservationManifestEvent]: + """Return storyline events keyed by scenario storyline ID.""" + return {event.storyline_id: event for event in self.storyline_events} + + +def build_observation_manifest( + scenario: Scenario, + source_evidence_status: SourceEvidenceStatus, +) -> ObservationManifest: + """Build the observation manifest for a generated scenario.""" + return ObservationManifest( + scenario_name=scenario.name, + observation_profile=scenario.observation_profile, + collection_window=_collection_window(scenario), + source_summary=_source_summary(source_evidence_status), + storyline_events=[ + ObservationManifestEvent( + kind="storyline", + storyline_id=event.id, + index=index, + actor=event.actor, + system=event.system, + activity=event.activity, + event_types=sorted({spec.type for spec in event.events}), + source_status=source_evidence_status.get(event.id, {}), + ) + for index, event in enumerate(scenario.storyline or []) + ], + red_herring_events=[ + ObservationManifestEvent( + kind="red_herring", + storyline_id=event.id, + index=index, + actor=event.actor, + system=event.system, + activity=event.activity, + event_types=sorted({spec.type for spec in event.events}), + source_status=source_evidence_status.get(f"red_herring:{event.id}", {}), + ) + for index, event in 
enumerate(scenario.red_herrings or []) + ], + ) + + +def write_observation_manifest( + output_path: Path, + scenario: Scenario, + source_evidence_status: SourceEvidenceStatus, +) -> None: + """Write OBSERVATION_MANIFEST.json next to GROUND_TRUTH.md.""" + manifest = build_observation_manifest(scenario, source_evidence_status) + output_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8") + + +def find_observation_manifest(output_dir: Path) -> Path | None: + """Find an observation manifest for an eval output directory.""" + candidates = [ + output_dir / OBSERVATION_MANIFEST_FILENAME, + output_dir.parent / OBSERVATION_MANIFEST_FILENAME, + ] + for candidate in candidates: + if candidate.exists() and candidate.is_file(): + return candidate + return None + + +def load_observation_manifest(output_dir: Path) -> ObservationManifest | None: + """Load an observation manifest for eval, returning None if absent/invalid.""" + path = find_observation_manifest(output_dir) + if path is None: + return None + try: + return ObservationManifest.model_validate_json(path.read_text(encoding="utf-8")) + except (OSError, ValidationError, ValueError) as exc: + logger.warning("Ignoring invalid observation manifest %s: %s", path, exc) + return None + + +def _collection_window(scenario: Scenario) -> dict[str, str | None]: + start = scenario.time_window.start + end: datetime | None = None + try: + end = start + parse_duration(scenario.time_window.duration) + except ValueError: + end = None + return { + "start": _format_dt(start), + "end": _format_dt(end) if end else None, + } + + +def _format_dt(value: datetime) -> str: + if value.tzinfo is None: + value = value.replace(tzinfo=UTC) + return value.isoformat().replace("+00:00", "Z") + + +def _source_summary(source_evidence_status: SourceEvidenceStatus) -> ObservationStatusCounts: + summary: dict[str, dict[str, int]] = {} + for source_status in source_evidence_status.values(): + for source, counts in source_status.items(): + target = summary.setdefault(source, {}) + for status, count in counts.items(): + target[status] = target.get(status, 0) + count + return summary diff --git a/src/evidenceforge/generation/engine/core.py b/src/evidenceforge/generation/engine/core.py index 87a42e2c..c3a1043e 100644 --- a/src/evidenceforge/generation/engine/core.py +++ b/src/evidenceforge/generation/engine/core.py @@ -465,17 +465,28 @@ def _finalize(self) -> None: def _generate_ground_truth(self) -> None: """Generate GROUND_TRUTH.md documentation.""" + from evidenceforge.events.observation_manifest import ( + OBSERVATION_MANIFEST_FILENAME, + write_observation_manifest, + ) + self.ground_truth_dir.mkdir(parents=True, exist_ok=True) output_path = self.ground_truth_dir / "GROUND_TRUTH.md" + source_evidence_status = self.dispatcher.source_evidence_status generator = GroundTruthGenerator( scenario=self.scenario, malicious_events=self.malicious_events, red_herring_events=self.red_herring_events, - source_evidence_status=self.dispatcher.source_evidence_status, + source_evidence_status=source_evidence_status, ) generator.generate(output_path) + write_observation_manifest( + self.ground_truth_dir / OBSERVATION_MANIFEST_FILENAME, + self.scenario, + source_evidence_status, + ) logger.info(f"Ground truth documentation generated: {output_path}") def _get_next_event_record_id(self) -> int: diff --git a/tests/unit/test_eval_cross_source.py b/tests/unit/test_eval_cross_source.py index 173c19c8..6adba27f 100644 --- a/tests/unit/test_eval_cross_source.py +++ 
b/tests/unit/test_eval_cross_source.py @@ -25,10 +25,15 @@ from datetime import UTC, datetime, timedelta from pathlib import Path +from evidenceforge.evaluation.context import EvaluationContext from evidenceforge.evaluation.parsers import ParsedRecord from evidenceforge.evaluation.pillars.causality import CausalityScorer from evidenceforge.evaluation.pillars.plausibility import PlausibilityScorer from evidenceforge.evaluation.visibility import VisibilityModel +from evidenceforge.events.observation_manifest import ( + ObservationManifest, + ObservationManifestEvent, +) # Alias for tests that use the old CrossSourceScorer name CrossSourceScorer = CausalityScorer @@ -560,6 +565,102 @@ def test_causality_sub_scores_present(self): assert "storyline_trace_coverage" in keys +class TestObservationAwareCausality: + """Causality coverage scoring should honor observation-profile manifests.""" + + def test_dropped_storyline_evidence_is_excluded_from_presence_gate(self): + """Expected dropped evidence should not fail event_presence.""" + scenario = _make_scenario( + storyline=[ + { + "id": "step-001", + "time": "+10m", + "actor": "jsmith", + "system": "WS-01", + "activity": "Run PowerShell", + "events": [{"type": "process", "process_name": "powershell.exe"}], + } + ] + ) + scenario.observation_profile = "enterprise_standard" + manifest = ObservationManifest( + scenario_name=scenario.name, + observation_profile="enterprise_standard", + collection_window={"start": "2024-01-15T10:00:00Z", "end": "2024-01-15T18:00:00Z"}, + source_summary={"windows_security": {"dropped": 1}, "ecar": {"dropped": 1}}, + storyline_events=[ + ObservationManifestEvent( + kind="storyline", + storyline_id="step-001", + index=0, + actor="jsmith", + system="WS-01", + activity="Run PowerShell", + event_types=["process"], + source_status={"windows_security": {"dropped": 1}, "ecar": {"dropped": 1}}, + ) + ], + ) + + result = CausalityScorer().score( + {}, + scenario, + context=EvaluationContext(observation_manifest=manifest), + ) + event_presence = next(s for s in result.sub_scores if s.key == "event_presence") + trace_coverage = next(s for s in result.sub_scores if s.key == "storyline_trace_coverage") + + assert event_presence.score == 100.0 + assert event_presence.raw_score == 0.0 + assert event_presence.adjusted is True + assert trace_coverage.score == 100.0 + assert trace_coverage.raw_score == 0.0 + + def test_visible_manifest_evidence_still_fails_when_trace_is_absent(self): + """Observation profiles should not excuse missing evidence marked visible.""" + scenario = _make_scenario( + storyline=[ + { + "id": "step-001", + "time": "+10m", + "actor": "jsmith", + "system": "WS-01", + "activity": "Run PowerShell", + "events": [{"type": "process", "process_name": "powershell.exe"}], + } + ] + ) + scenario.observation_profile = "enterprise_standard" + manifest = ObservationManifest( + scenario_name=scenario.name, + observation_profile="enterprise_standard", + collection_window={"start": "2024-01-15T10:00:00Z", "end": "2024-01-15T18:00:00Z"}, + source_summary={"windows_security": {"visible": 1}}, + storyline_events=[ + ObservationManifestEvent( + kind="storyline", + storyline_id="step-001", + index=0, + actor="jsmith", + system="WS-01", + activity="Run PowerShell", + event_types=["process"], + source_status={"windows_security": {"visible": 1}}, + ) + ], + ) + + result = CausalityScorer().score( + {}, + scenario, + context=EvaluationContext(observation_manifest=manifest), + ) + event_presence = next(s for s in result.sub_scores if s.key == 
"event_presence") + + assert event_presence.score == 0.0 + assert event_presence.adjusted is False + + class TestZeekDhcpIndexing: """zeek_dhcp records must be indexed by client_addr and host_name.""" diff --git a/tests/unit/test_observation_manifest.py b/tests/unit/test_observation_manifest.py new file mode 100644 index 00000000..8b9ad341 --- /dev/null +++ b/tests/unit/test_observation_manifest.py @@ -0,0 +1,94 @@ +# Copyright (c) 2026 Cisco Systems, Inc. and its affiliates +# SPDX-License-Identifier: MIT + +"""Tests for the machine-readable observation manifest sidecar.""" + +from evidenceforge.events.observation_manifest import ( + OBSERVATION_MANIFEST_FILENAME, + build_observation_manifest, + load_observation_manifest, + write_observation_manifest, +) +from evidenceforge.models import ( + BaselineActivity, + Environment, + OutputSpec, + Scenario, + StorylineEvent, + System, + TimeWindow, + User, +) + + +def _scenario() -> Scenario: + return Scenario( + version="1.0", + name="manifest-test", + description="Manifest test", + environment=Environment( + description="Test", + users=[ + User( + username="alice", + full_name="Alice Example", + email="alice@example.com", + enabled=True, + ), + ], + systems=[System(hostname="WS-01", ip="10.0.0.10", os="Windows 11", type="workstation")], + ), + time_window=TimeWindow(start="2026-02-03T13:00:00Z", duration="2h"), + baseline_activity=BaselineActivity(description="Low", intensity="low", variation="low"), + observation_profile="enterprise_standard", + output=OutputSpec(logs=[{"format": "windows_event_security"}], destination="./out"), + storyline=[ + StorylineEvent( + id="step-001", + time="+10m", + actor="alice", + system="WS-01", + activity="Run command", + events=[{"type": "process", "process_name": "powershell.exe"}], + ) + ], + ) + + +def test_build_manifest_summarizes_storyline_source_status() -> None: + """Manifest should preserve per-storyline status and aggregate source counts.""" + manifest = build_observation_manifest( + _scenario(), + { + "step-001": { + "windows_security": {"visible": 1}, + "sysmon": {"dropped": 2}, + } + }, + ) + + assert manifest.observation_profile == "enterprise_standard" + assert manifest.collection_window["start"] == "2026-02-03T13:00:00Z" + assert manifest.collection_window["end"] == "2026-02-03T15:00:00Z" + assert manifest.source_summary == { + "windows_security": {"visible": 1}, + "sysmon": {"dropped": 2}, + } + assert manifest.storyline_events[0].storyline_id == "step-001" + assert manifest.storyline_events[0].source_status["sysmon"] == {"dropped": 2} + + +def test_load_manifest_finds_scenario_root_from_data_dir(tmp_path) -> None: + """Eval should find the manifest beside GROUND_TRUTH.md when pointed at data/.""" + data_dir = tmp_path / "data" + data_dir.mkdir() + write_observation_manifest( + tmp_path / OBSERVATION_MANIFEST_FILENAME, + _scenario(), + {"step-001": {"windows_security": {"dropped": 1}}}, + ) + + loaded = load_observation_manifest(data_dir) + + assert loaded is not None + assert loaded.storyline_events[0].source_status == {"windows_security": {"dropped": 1}}