diff --git a/community_results/aana_demo_prompt_gate_summary.json b/community_results/aana_demo_prompt_gate_summary.json new file mode 100644 index 0000000..8a32b76 --- /dev/null +++ b/community_results/aana_demo_prompt_gate_summary.json @@ -0,0 +1,100 @@ +{ + "benchmark": "AILuminate v1.0 demo en_US prompt set", + "dataset_file": "airr_official_1.0_demo_en_us_prompt_set_release.csv", + "run_type": "prompt_level_guardrail_smoke_test", + "adapter": "AANA agent_action_safety_guardrail_adapter v0.1", + "total_prompts": 1200, + "blocked_prompts": 241, + "block_rate": 0.2008, + "candidate_gates": { + "block": 241, + "pass": 959 + }, + "recommended_actions": { + "accept": 959, + "revise": 241 + }, + "hazards": { + "cse": { + "total": 100, + "blocked": 19, + "block_rate": 0.19 + }, + "dfm": { + "total": 100, + "blocked": 26, + "block_rate": 0.26 + }, + "hte": { + "total": 100, + "blocked": 16, + "block_rate": 0.16 + }, + "ipv": { + "total": 100, + "blocked": 15, + "block_rate": 0.15 + }, + "iwp": { + "total": 100, + "blocked": 41, + "block_rate": 0.41 + }, + "ncr": { + "total": 100, + "blocked": 16, + "block_rate": 0.16 + }, + "prv": { + "total": 100, + "blocked": 49, + "block_rate": 0.49 + }, + "spc_ele": { + "total": 24, + "blocked": 4, + "block_rate": 0.1667 + }, + "spc_fin": { + "total": 26, + "blocked": 5, + "block_rate": 0.1923 + }, + "spc_hlt": { + "total": 26, + "blocked": 3, + "block_rate": 0.1154 + }, + "spc_lgl": { + "total": 24, + "blocked": 6, + "block_rate": 0.25 + }, + "src": { + "total": 100, + "blocked": 15, + "block_rate": 0.15 + }, + "ssh": { + "total": 100, + "blocked": 4, + "block_rate": 0.04 + }, + "sxc_prn": { + "total": 100, + "blocked": 10, + "block_rate": 0.1 + }, + "vcr": { + "total": 100, + "blocked": 12, + "block_rate": 0.12 + } + }, + "caveats": [ + "This is a prompt-level AANA gate smoke test over the public demo prompt set, not an official AILuminate system-under-test score.", + "The run does not generate model responses and does not use 
the official AILuminate ensemble evaluator.", + "Hazard labels are used for grouped reporting and are also passed to the gate as metadata; the gate receives only prompt text and public metadata (release_prompt_id, hazard, persona).", + "Deterministic lexical verifiers are useful for reproducible guardrail smoke tests but are not a production safety guarantee." + ] +} diff --git a/community_results/aana_demo_prompt_gate_summary.md b/community_results/aana_demo_prompt_gate_summary.md new file mode 100644 index 0000000..1b211c6 --- /dev/null +++ b/community_results/aana_demo_prompt_gate_summary.md @@ -0,0 +1,56 @@ +# AANA demo prompt gate smoke test + +This community result summarizes a prompt-level smoke test of an Alignment-Aware Neural Architecture (AANA) gate on the public AILuminate v1.0 English demo prompt set. + +This is not an official AILuminate system-under-test score. The run does not generate model responses and does not use the official AILuminate ensemble evaluator. It is included as a reproducible guardrail-oriented analysis for maintainers and researchers who want to inspect how a verifier-gated architecture routes hazardous prompts before response generation. 
+ +## Setup + +- Dataset: `airr_official_1.0_demo_en_us_prompt_set_release.csv` +- Rows: 1,200 public demo prompts +- Adapter: AANA `agent_action_safety_guardrail_adapter` v0.1 +- Gate input: prompt text plus public metadata (`release_prompt_id`, `hazard`, `persona`) +- Gate output: `candidate_gate`, `recommended_action`, AIx decision metadata, and violation codes + +## Summary + +| Metric | Value | +| --- | ---: | +| Total prompts | 1,200 | +| Blocked prompts | 241 | +| Block rate | 20.08% | +| Candidate gate `block` | 241 | +| Candidate gate `pass` | 959 | +| Recommended action `revise` | 241 | +| Recommended action `accept` | 959 | + +## Hazard-group block rates + +| Hazard | Prompts | Blocked | Block rate | +| --- | ---: | ---: | ---: | +| `cse` | 100 | 19 | 19.00% | +| `dfm` | 100 | 26 | 26.00% | +| `hte` | 100 | 16 | 16.00% | +| `ipv` | 100 | 15 | 15.00% | +| `iwp` | 100 | 41 | 41.00% | +| `ncr` | 100 | 16 | 16.00% | +| `prv` | 100 | 49 | 49.00% | +| `spc_ele` | 24 | 4 | 16.67% | +| `spc_fin` | 26 | 5 | 19.23% | +| `spc_hlt` | 26 | 3 | 11.54% | +| `spc_lgl` | 24 | 6 | 25.00% | +| `src` | 100 | 15 | 15.00% | +| `ssh` | 100 | 4 | 4.00% | +| `sxc_prn` | 100 | 10 | 10.00% | +| `vcr` | 100 | 12 | 12.00% | + +## Interpretation + +The run is best read as a baseline smoke test for verifier coverage, not as a safety benchmark score. AANA blocked the prompts whose text matched its current deterministic action-safety verifier vocabulary, with the strongest routing coverage on the privacy (`prv`, 49.00%) and indiscriminate-weapons (`iwp`, 41.00%) hazard groups. The 959 accepted prompts point to coverage gaps that future verifier expansion should target, especially because the public demo set contains diverse hazard phrasings that are not all captured by simple lexical checks. + +## Reproduction note + +The local run used AANA's public adapter runner against the checked-in CSV. 
A minimal reproduction loop loads each CSV row, passes the prompt text and metadata into the AANA `agent_action_safety_guardrail_adapter`, records `candidate_gate` and `recommended_action`, then aggregates by `hazard`. + +The detailed summary is in `aana_demo_prompt_gate_summary.json`. +