diff --git a/security/reports/README_report_service.md b/security/reports/README_report_service.md new file mode 100644 index 000000000..25303fc52 --- /dev/null +++ b/security/reports/README_report_service.md @@ -0,0 +1,376 @@ +# AutoAudit Report Service + +Fills `AutoAudit_Report_Template.docx` with compliance assessment data and +produces a Word document or PDF report. + +--- + +## Files + +| File | Purpose | +|------|---------| +| `report_service.py` | The report generator — this is the only file you need to import | +| `AutoAudit_Report_Template.docx` | Word template with `{placeholder}` tokens | +| `generate_report_from_scan.py` | Transforms live API scan results into the dataset schema and generates the report | +| `run_test.py` | Smoke test runner | +| `fake_dataset.json` | Sample dataset for local testing | + +--- + +## Quick start + +```python +from report_service import generate_full_report_docx +import json + +with open("dataset.json") as f: + data = json.load(f) + +out = generate_full_report_docx(data) +# open out in Word, check layout, export to PDF +``` + +For headless pipelines, import `generate_full_report_pdf` from `report_service` and call it instead: + +```python +out = generate_full_report_pdf(data) +``` + +From the command line: + +```bash +python report_service.py dataset.json # produces .docx +python report_service.py dataset.json --pdf # produces PDF +python report_service.py dataset.json --pdf --keep-docx # keeps both +python report_service.py convert path/to/report.docx # convert existing docx +``` + +--- + +## Generating a report from a real tenant scan + +`generate_report_from_scan.py` connects to the AutoAudit API, fetches real scan +results, transforms them into the dataset schema, and generates the report. +All tenant metadata (name, domain, framework version, dates) is derived +automatically from the scan data — no hardcoded values. + +### Prerequisites + +1. The full stack must be running (`docker compose --profile all up -d`) +2. You need a valid bearer token (see Authentication below) +3. 
A completed scan must exist (see Running a scan below) + +### Authentication + +Register and log in to get a token: + +```bash +curl -X POST http://localhost:8000/v1/auth/register \ + -H 'Content-Type: application/json' \ + -d '{"email": "you@example.com", "password": "YourPassword1!", "username": "yourname"}' + +curl -X POST http://localhost:8000/v1/auth/login \ + -H 'Content-Type: application/x-www-form-urlencoded' \ + -d 'username=you@example.com&password=YourPassword1!' +``` + +Copy the `access_token` from the response and set it: + +```bash +export TOKEN="eyJ..." +# TOKEN is used by the curl examples below; generate_report_from_scan.py reads AUTOAUDIT_TOKEN when --token is omitted: +export AUTOAUDIT_TOKEN="eyJ..." +``` + +### Running a scan + +Create an M365 connection using the service principal credentials from Bitwarden: + +```bash +curl -X POST http://localhost:8000/v1/m365-connections/ \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $TOKEN" \ + -d '{ + "name": "My Tenant", + "tenant_id": "<your-tenant-id>", + "client_id": "<your-client-id>", + "client_secret": "<your-client-secret>" + }' +``` + +Trigger a scan (use the `id` returned from the connection step): + +```bash +curl -X POST http://localhost:8000/v1/scans/ \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $TOKEN" \ + -d '{"m365_connection_id": 1, "framework": "cis", "benchmark": "microsoft-365-foundations", "version": "v6.0.0"}' +``` + +Poll until `status` is `completed`: + +```bash +curl http://localhost:8000/v1/scans/1 -H "Authorization: Bearer $TOKEN" | python3 -m json.tool | grep '"status"' | head -1 +``` + +### Live mode (fetches directly from the running API) + +```bash +python generate_report_from_scan.py \ + --api-url http://localhost:8000 \ + --token "$TOKEN" \ + --scan-id 1 +``` + +### Offline mode (from saved JSON files) + +First save the scan data: + +```bash +curl http://localhost:8000/v1/scans/1 \ + -H "Authorization: Bearer $TOKEN" > scan_meta.json + +curl http://localhost:8000/v1/scans/1/results \ + -H "Authorization: Bearer $TOKEN" > scan_results.json +``` + +Then generate 
the report: + +```bash +python generate_report_from_scan.py \ + --results scan_results.json \ + --meta scan_meta.json +``` + +### Options + +| Flag | Description | +|------|-------------| +| `--api-url URL` | AutoAudit API base URL (live mode) | +| `--token TOKEN` | Bearer token, or set `AUTOAUDIT_TOKEN` env var | +| `--scan-id ID` | Scan ID to fetch and report on (live mode) | +| `--results FILE` | Path to scan results JSON (offline mode) | +| `--meta FILE` | Path to scan metadata JSON (offline mode, optional) | +| `--template FILE` | Path to template .docx (default: `AutoAudit_Report_Template.docx`) | +| `--output DIR` | Output directory (default: `reports_out`) | +| `--pdf` | Also convert output to PDF | +| `--keep-docx` | Keep .docx when `--pdf` is set | +| `--save-dataset` | Save the intermediate transformed JSON for inspection | + +### Example output + +``` +Fetching scan #1 from http://localhost:8000 ... + Status: completed | Pass: 38 | Fail: 22 | Score: 63.33% + +Transforming 140 control results... + Tenant : AutoAudit Sandbox (t8sjf.onmicrosoft.com) + Score : 63.3% | Risk: HIGH + Pass: 38 | Fail: 22 | Critical: 10 | High: 11 + +Generating report... + Template : AutoAudit_Report_Template.docx + Output : reports_out/ + +✓ Report written to: reports_out/AutoAudit_Sandbox_24May2026_AutoAudit_Report.docx +``` + +--- + +## Dataset schema + +The service reads these top-level keys: + +```json +{ + "tenant": {}, + "summary": {}, + "controls": [], + "evidence_register": [], + "remediation_plan": [] +} +``` + +None are required — missing keys produce empty strings in the output. + +Key names are normalised before lookup (lower-cased, underscores/hyphens/slashes +collapsed to spaces), so `"Tenant_Name"`, `"tenant name"`, and `"tenant-name"` +all resolve to the same field. 
+ +### tenant + +```json +{ + "Tenant_Name": "Contoso Ltd", + "Tenant_Domain": "contoso.com", + "Assessor_Name": "Jane Smith", + "Frameworks_Used": "CIS M365 v3.0, ISO 27001:2022", + "Assessment_Period": "April 2025", + "Assessment_Date": "30 April 2025", + "Classification": "Confidential", + "Report_Version": "1.0", + "Distribution": "IT Security, Management", + "Prepared_By": "Jane Smith", + "Reviewed_By": "John Doe", + "Team_Function": "GRC", + "Limitations": "On-premises AD excluded", + "Scope_Owner": "IT Security Manager" +} +``` + +### summary + +```json +{ + "Overall_Score": "72%", + "Overall_Risk_Posture": "Medium", + "Executive_Summary": "...", + "Key_Recommendation": "...", + "Total_Controls": "42", + "Total_Pass": "30", + "Total_Fail": "12", + "Total_Critical": "2", + "Total_High": "4", + "Total_Medium": "4", + "Total_Low": "2", + "Top_Risk_1": "DMARC not enforced", + "Top_Risk_2": "MFA not required for all users", + "Top_Risk_3": "Legacy auth not blocked", + "Strength_1": "MFA enabled for admins", + "Strength_1_Evidence": "AAD-MFA-001", + "Cat_1_Pass": "8", + "Cat_1_Fail": "2", + "Cat_1_Total": "10", + "Cat_1_Comment": "Email security needs attention" +} +``` + +Category fields run from `Cat_1_*` to `Cat_9_*`. Also supports the nested +shape `summary.categories.Cat_1.Pass` if your dataset uses that instead. + +### controls (list) + +Each item maps to one finding block in the report. The template has one block +per severity level — Critical, High, Medium, Low — and only the first FAIL at +each level is used. 
+ +```json +{ + "UniqueID": "AAD-DMARC-001", + "Control_Name": "Ensure DMARC policy is set to reject or quarantine", + "CIS_Section": "1.1.14", + "ISO_Mapping": "A.9.4.3", + "Strategy": "Email / Exchange Online", + "Sub_Strategy": "Email Authentication", + "Test_id": "EXO-DMARC-001", + "Level": "L1", + "Compliance_Status": "Non-Compliant", + "Risk_Rating": "High", + "Priority": "Within 30 Days", + "Pass/Fail": "FAIL", + "Description": "DMARC must be configured with p=quarantine or p=reject.", + "Observations": "The DMARC TXT record is configured with p=none.", + "Justification": "DNS TXT lookup returned: v=DMARC1; p=none; ...", + "Evidence_Type": "DNS TXT record", + "File Name": "dns_dmarc_contoso_20250430.txt", + "Extract": "v=DMARC1; p=none; rua=mailto:dmarc-reports@contoso.com", + "Confidence": "High", + "Evidence_Explanation": "p=none instructs mail servers to take no action on DMARC failures.", + "Impact": "Threat actors can send spoofed emails from @contoso.com addresses.", + "Root_Cause": "DMARC was deployed in monitoring mode and never moved to enforcement.", + "Remediation": "1. Review DMARC aggregate reports. 2. Change p=none to p=quarantine.", + "Owner": "IT Security", + "Target_Date": "30 May 2025", + "Remediation_Status": "Open" +} +``` + +### evidence_register (list, up to 10 items) + +```json +{ + "Evidence_ID": "EV-001", + "Evidence_Description": "DNS TXT record for _dmarc.contoso.com", + "Evidence_Source": "DNS lookup via MXToolbox", + "Mapped_Control": "AAD-DMARC-001", + "Date_Captured": "30 April 2025" +} +``` + +### remediation_plan (list, up to 8 items) + +```json +{ + "Remediation_Action": "Update DMARC policy from p=none to p=quarantine", + "Owner": "IT Security", + "Target_Date": "30 May 2025", + "Status": "Open" +} +``` + +--- + +## Adding new template tokens + +1. Add `{New_Token}` to the Word template wherever you want the value to appear. +2. 
In `report_service.py`, add the key to the relevant mapping function: + - Tenant-level fields → `_map_tenant()` + - Summary/score fields → `_map_summary()` + - Per-control fields → `_single_control_mapping()` +3. That's it. + +--- + +## PDF conversion + +Tries three methods in order: + +1. **docx2pdf** — needs Microsoft Word installed (Windows/macOS only) +2. **LibreOffice headless** — `soffice` must be on PATH +3. **fpdf2 fallback** — text-only, no layout fidelity, last resort (install with `pip install fpdf2`) + +Install dependencies: + +```bash +pip install python-docx docx2pdf # for Word-based conversion +# OR +sudo apt install libreoffice # for LibreOffice conversion +``` + +--- + +## Running the smoke test + +```bash +python run_test.py # generates a .docx from fake_dataset.json +python run_test.py --pdf # also converts to PDF +``` + +Expected output: + +``` +Loading dataset : fake_dataset.json + Tenant : Contoso Ltd + Controls : 15 + ... +✓ Report generated successfully! + File : reports_out/Contoso_Ltd_30April2025_AutoAudit_Report.docx + Size : 245.3 KB +``` + +--- + +## Known limitations + +- The template has one finding block per severity level (Critical / High / + Medium / Low). If there are multiple FAILs at the same level, only the first + one appears in the report. The full list is still included in Appendix B. +- Evidence Register supports up to 10 items, remediation plan up to 8 rows. + These limits match the template row count — extend the template if you need more. +- PDF conversion quality depends on which converter is available. Always review + the .docx in Word before distributing the PDF version. +- Fields such as `{ISO_Mapping}`, `{Impact}`, `{Root_Cause}`, and `{Observations}` + in the detailed findings section require enrichment data from the CIS→ISO mapping + and GRC pipeline. These are not populated by `generate_report_from_scan.py` as + that data is not yet wired into the scan results API — this is a separate + integration task. 
diff --git a/security/reports/fake_dataset.json b/security/reports/fake_dataset.json new file mode 100644 index 000000000..7cd2ef382 --- /dev/null +++ b/security/reports/fake_dataset.json @@ -0,0 +1,843 @@ +{ + "tenant": { + "Tenant_Name": "Contoso Ltd", + "Tenant_Domain": "contoso.com", + "Assessor_Name": "AutoAudit Platform", + "Frameworks_Used": "CIS Microsoft 365 Foundations Benchmark v6.0.1 | ISO/IEC 27001:2022", + "Assessment_Period": "01 April 2025 \u2013 30 April 2025", + "Assessment_Date": "30 April 2025", + "Generated_Date": "30 April 2025 09:14 AEST", + "Classification": "Confidential", + "Report_Version": "v1.0", + "Distribution": "IT Security Team, CISO, External Auditor", + "Prepared_By": "AutoAudit Engine v2.1", + "Reviewed_By": "J. Mitchell (Security Lead)", + "Team_Function": "Information Security", + "Limitations": "This assessment was conducted using read-only API access to Microsoft Entra ID. Conditional Access policy logic requiring graph traversal was partially evaluated; edge-case policy exceptions may not be fully captured. 
Two controls in the MDM category returned INCONCLUSIVE results due to incomplete MDM enrolment data at time of assessment.", + "Scope_Owner": "IT Security Team", + "Date_Generated": "30 April 2025 09:14 AEST" + }, + "summary": { + "Overall_Score": "61%", + "Overall_Risk_Posture": "HIGH", + "Total_Controls": 15, + "Total_Pass": 9, + "Total_Fail": 6, + "Total_Critical": 2, + "Total_High": 2, + "Total_Medium": 1, + "Total_Low": 1, + "Top_Risk_1": "Legacy authentication protocols remain enabled, exposing the tenant to credential stuffing and password spray attacks", + "Top_Risk_2": "MFA is not enforced for all administrator accounts via Conditional Access, leaving privileged roles vulnerable to account takeover", + "Top_Risk_3": "DMARC policy is set to 'none' for the primary domain, providing no enforcement against email spoofing", + "Strength_1": "Audit log search is enabled and unified audit logging is active across all workloads", + "Strength_1_Evidence": "EV-007", + "Strength_2": "Safe Attachments and Safe Links are enabled for Exchange Online and Office applications", + "Strength_2_Evidence": "EV-003", + "Strength_3": "SharePoint Online external sharing is restricted to existing guests only", + "Strength_3_Evidence": "EV-006", + "Strength_4": "Customer Lockbox is enabled, requiring explicit approval for Microsoft support access", + "Strength_4_Evidence": "EV-008", + "Strength_5": "OneDrive external sharing link expiration is enforced at 30 days", + "Strength_5_Evidence": "EV-009", + "Top_Remediation_Action": "Block legacy authentication protocols via Conditional Access policy targeting all users and all cloud apps", + "Key_Recommendation": "Contoso Ltd demonstrates a partial security baseline with several foundational controls in place. However, the presence of two Critical findings \u2014 legacy authentication and MFA gaps for administrators \u2014 represents unacceptable risk to tenant integrity. 
Immediate remediation of Critical and High findings is required before the next assessment cycle.", + "categories": { + "Cat_1": { + "area": "Users", + "Total": 2, + "Pass": 1, + "Fail": 1, + "Comment": "Password expiry policy non-compliant; licensing controls adequate" + }, + "Cat_2": { + "area": "Email / Exchange Online", + "Total": 2, + "Pass": 1, + "Fail": 1, + "Comment": "DMARC enforcement absent; Safe Attachments active" + }, + "Cat_3": { + "area": "Accounts & Authentication", + "Total": 3, + "Pass": 1, + "Fail": 2, + "Comment": "Critical gaps in MFA and legacy auth blocking" + }, + "Cat_4": { + "area": "Configuration", + "Total": 1, + "Pass": 1, + "Fail": 0, + "Comment": "Unified audit logging correctly enabled" + }, + "Cat_5": { + "area": "Application Permissions", + "Total": 1, + "Pass": 1, + "Fail": 0, + "Comment": "Third-party app consent restricted to verified publishers" + }, + "Cat_6": { + "area": "Data Management", + "Total": 2, + "Pass": 2, + "Fail": 0, + "Comment": "External sharing and Customer Lockbox both compliant" + }, + "Cat_7": { + "area": "Auditing", + "Total": 2, + "Pass": 1, + "Fail": 1, + "Comment": "Audit log enabled; mailbox audit actions incomplete" + }, + "Cat_8": { + "area": "Storage", + "Total": 1, + "Pass": 1, + "Fail": 0, + "Comment": "OneDrive link expiration policy enforced" + }, + "Cat_9": { + "area": "Mobile Device Management", + "Total": 1, + "Pass": 0, + "Fail": 1, + "Comment": "Android MDM security policy not configured" + } + }, + "Executive_Summary": "This report presents the findings of an automated Microsoft 365 compliance assessment conducted against the CIS Microsoft 365 Foundations Benchmark v6.0.1 and ISO/IEC 27001:2022 for Contoso Ltd. The assessment was performed by the AutoAudit platform using read-only Microsoft Entra ID API access. A total of 15 controls were evaluated across 9 service areas. Of these, 9 controls passed and 6 failed, yielding an overall compliance score of 61%. 
Two Critical findings were identified: the absence of legacy authentication blocking and inadequate MFA enforcement for administrator accounts. These findings represent the highest priority remediation items and must be resolved before the next assessment cycle. Positively, foundational controls including unified audit logging, Safe Attachments, and SharePoint external sharing restrictions are operating effectively.", + "Cat_1_Total": 2, + "Cat_1_Pass": 1, + "Cat_1_Fail": 1, + "Cat_1_Comment": "Password expiry policy non-compliant; licensing controls adequate", + "Cat_2_Total": 2, + "Cat_2_Pass": 1, + "Cat_2_Fail": 1, + "Cat_2_Comment": "DMARC enforcement absent; Safe Attachments active", + "Cat_3_Total": 3, + "Cat_3_Pass": 1, + "Cat_3_Fail": 2, + "Cat_3_Comment": "Critical gaps in MFA and legacy auth blocking", + "Cat_4_Total": 1, + "Cat_4_Pass": 1, + "Cat_4_Fail": 0, + "Cat_4_Comment": "Unified audit logging correctly enabled", + "Cat_5_Total": 1, + "Cat_5_Pass": 1, + "Cat_5_Fail": 0, + "Cat_5_Comment": "Third-party app consent restricted to verified publishers", + "Cat_6_Total": 2, + "Cat_6_Pass": 2, + "Cat_6_Fail": 0, + "Cat_6_Comment": "External sharing and Customer Lockbox both compliant", + "Cat_7_Total": 2, + "Cat_7_Pass": 1, + "Cat_7_Fail": 1, + "Cat_7_Comment": "Audit log enabled; mailbox audit actions incomplete", + "Cat_8_Total": 1, + "Cat_8_Pass": 1, + "Cat_8_Fail": 0, + "Cat_8_Comment": "OneDrive link expiration policy enforced", + "Cat_9_Total": 1, + "Cat_9_Pass": 0, + "Cat_9_Fail": 1, + "Cat_9_Comment": "Android MDM security policy not configured", + "Remediation_Action_1": "Block legacy authentication for all users via Conditional Access", + "Remediation_Owner_1": "IT Security Team", + "Remediation_Target_1": "14 May 2025", + "Remediation_Status_1": "Open", + "Remediation_Priority_1": "Critical", + "Remediation_Action_2": "Enforce MFA for all Global Administrator and Privileged Role Administrator accounts via Conditional Access", + 
"Remediation_Owner_2": "IT Security Team", + "Remediation_Target_2": "14 May 2025", + "Remediation_Status_2": "Open", + "Remediation_Priority_2": "Critical", + "Remediation_Action_3": "Update DMARC record to p=quarantine or p=reject for contoso.com", + "Remediation_Owner_3": "IT / DNS Team", + "Remediation_Target_3": "31 May 2025", + "Remediation_Status_3": "Open", + "Remediation_Priority_3": "High", + "Remediation_Action_4": "Enable 'Send' and 'HardDelete' mailbox audit actions for all user mailboxes", + "Remediation_Owner_4": "Exchange Admin", + "Remediation_Target_4": "31 May 2025", + "Remediation_Status_4": "Open", + "Remediation_Priority_4": "High", + "Remediation_Action_5": "Set password expiry policy to comply with CIS recommendation (disable expiry or enforce 365-day maximum)", + "Remediation_Owner_5": "IT Operations", + "Remediation_Target_5": "30 June 2025", + "Remediation_Status_5": "Open", + "Remediation_Priority_5": "Medium", + "Remediation_Action_6": "Configure Android MDM security baseline policy in Intune", + "Remediation_Owner_6": "Endpoint Team", + "Remediation_Target_6": "30 June 2025", + "Remediation_Status_6": "Open", + "Remediation_Priority_6": "Low", + "Remediation_Action_7": "Review and remediate remaining Conditional Access policy gaps identified in assessment", + "Remediation_Owner_7": "IT Security Team", + "Remediation_Target_7": "30 June 2025", + "Remediation_Status_7": "Open", + "Remediation_Priority_7": "Medium", + "Remediation_Action_8": "Schedule re-assessment via AutoAudit once Critical and High findings are resolved", + "Remediation_Owner_8": "CISO", + "Remediation_Target_8": "15 July 2025", + "Remediation_Status_8": "Planned", + "Remediation_Priority_8": "Low", + "Evidence_1_Description": "Entra ID Conditional Access policy list \u2014 legacy auth block policy absent", + "Evidence_1_Source": "Microsoft Graph API /identity/conditionalAccess/policies", + "Evidence_2_Description": "Entra ID per-user MFA status report \u2014 3 of 5 
Global Admins show MFA disabled", + "Evidence_2_Source": "Microsoft Graph API /reports/authenticationMethods/userRegistrationDetails", + "Evidence_3_Description": "Defender for Office 365 Safe Attachments policy export \u2014 all policies enabled", + "Evidence_3_Source": "Exchange Online PowerShell Get-SafeAttachmentPolicy", + "Evidence_4_Description": "DNS TXT record query for contoso.com DMARC \u2014 record value p=none", + "Evidence_4_Source": "DNS TXT lookup _dmarc.contoso.com", + "Evidence_5_Description": "Password expiry policy extract \u2014 PasswordNeverExpires set to false with 90-day expiry", + "Evidence_5_Source": "Microsoft Graph API /domains", + "Evidence_6_Description": "SharePoint Online sharing capability \u2014 set to ExistingExternalUserSharingOnly", + "Evidence_6_Source": "SharePoint Admin API /sharepoint/sharingCapability", + "Evidence_7_Description": "Unified audit log configuration \u2014 UnifiedAuditLogIngestionEnabled = True", + "Evidence_7_Source": "Exchange Online PowerShell Get-AdminAuditLogConfig", + "Evidence_8_Description": "Customer Lockbox setting \u2014 CustomerLockboxEnabled = True", + "Evidence_8_Source": "Microsoft Graph API /admin/serviceAnnouncement/settings", + "Evidence_9_Description": "OneDrive external sharing link expiration \u2014 ExternalUserExpireInDays = 30", + "Evidence_9_Source": "SharePoint Admin API /onedrive/sharingExpirationDays", + "Evidence_10_Description": "Mailbox audit configuration \u2014 AuditEnabled = True but Send and HardDelete actions absent from default policy", + "Evidence_10_Source": "Exchange Online PowerShell Get-MailboxAuditBypassAssociation", + "Evidence_11_Description": "Third-party integrated app consent policy \u2014 set to require admin approval for verified publishers only", + "Evidence_11_Source": "Entra ID Admin Centre > Enterprise Applications > Consent and Permissions", + "Evidence_12_Description": "Intune MDM device configuration profiles \u2014 Android profile absent; iOS profile 
active", + "Evidence_12_Source": "Microsoft Graph API /deviceManagement/deviceConfigurations" + }, + "remediation_plan": [ + { + "Remediation_Action": "Block legacy authentication for all users via Conditional Access", + "Priority": "Critical", + "Owner": "IT Security Team", + "Target_Date": "14 May 2025", + "Remediation_Status": "Open", + "index": 1 + }, + { + "Remediation_Action": "Enforce MFA for all Global Administrator and Privileged Role Administrator accounts via Conditional Access", + "Priority": "Critical", + "Owner": "IT Security Team", + "Target_Date": "14 May 2025", + "Remediation_Status": "Open", + "index": 2 + }, + { + "Remediation_Action": "Update DMARC record to p=quarantine or p=reject for contoso.com", + "Priority": "High", + "Owner": "IT / DNS Team", + "Target_Date": "31 May 2025", + "Remediation_Status": "Open", + "index": 3 + }, + { + "Remediation_Action": "Enable 'Send' and 'HardDelete' mailbox audit actions for all user mailboxes", + "Priority": "High", + "Owner": "Exchange Admin", + "Target_Date": "31 May 2025", + "Remediation_Status": "Open", + "index": 4 + }, + { + "Remediation_Action": "Set password expiry policy to comply with CIS recommendation (disable expiry or enforce 365-day maximum)", + "Priority": "Medium", + "Owner": "IT Operations", + "Target_Date": "30 June 2025", + "Remediation_Status": "Open", + "index": 5 + }, + { + "Remediation_Action": "Configure Android MDM security baseline policy in Intune", + "Priority": "Low", + "Owner": "Endpoint Team", + "Target_Date": "30 June 2025", + "Remediation_Status": "Open", + "index": 6 + }, + { + "Remediation_Action": "Review and remediate remaining Conditional Access policy gaps identified in assessment", + "Priority": "Medium", + "Owner": "IT Security Team", + "Target_Date": "30 June 2025", + "Remediation_Status": "Open", + "index": 7 + }, + { + "Remediation_Action": "Schedule re-assessment via AutoAudit once Critical and High findings are resolved", + "Priority": "Low", + "Owner": 
"CISO", + "Target_Date": "15 July 2025", + "Remediation_Status": "Planned", + "index": 8 + } + ], + "evidence_register": [ + { + "Evidence_ID": "EV-001", + "Evidence_Description": "Entra ID Conditional Access policy list \u2014 legacy auth block policy absent", + "Evidence_Source": "Microsoft Graph API /identity/conditionalAccess/policies", + "Mapped_Control": "CIS-2.1.1", + "Date_Captured": "30 April 2025", + "index": 1 + }, + { + "Evidence_ID": "EV-002", + "Evidence_Description": "Entra ID per-user MFA status report \u2014 3 of 5 Global Admins show MFA disabled", + "Evidence_Source": "Microsoft Graph API /reports/authenticationMethods/userRegistrationDetails", + "Mapped_Control": "CIS-1.1.1", + "Date_Captured": "30 April 2025", + "index": 2 + }, + { + "Evidence_ID": "EV-003", + "Evidence_Description": "Defender for Office 365 Safe Attachments policy export \u2014 all policies enabled", + "Evidence_Source": "Exchange Online PowerShell Get-SafeAttachmentPolicy", + "Mapped_Control": "CIS-2.1.6", + "Date_Captured": "30 April 2025", + "index": 3 + }, + { + "Evidence_ID": "EV-004", + "Evidence_Description": "DNS TXT record query for contoso.com DMARC \u2014 record value p=none", + "Evidence_Source": "DNS TXT lookup _dmarc.contoso.com", + "Mapped_Control": "CIS-2.1.9", + "Date_Captured": "30 April 2025", + "index": 4 + }, + { + "Evidence_ID": "EV-005", + "Evidence_Description": "Password expiry policy extract \u2014 PasswordNeverExpires set to false with 90-day expiry", + "Evidence_Source": "Microsoft Graph API /domains", + "Mapped_Control": "CIS-1.1.14", + "Date_Captured": "30 April 2025", + "index": 5 + }, + { + "Evidence_ID": "EV-006", + "Evidence_Description": "SharePoint Online sharing capability \u2014 set to ExistingExternalUserSharingOnly", + "Evidence_Source": "SharePoint Admin API /sharepoint/sharingCapability", + "Mapped_Control": "CIS-7.2.2", + "Date_Captured": "30 April 2025", + "index": 6 + }, + { + "Evidence_ID": "EV-007", + "Evidence_Description": 
"Unified audit log configuration \u2014 UnifiedAuditLogIngestionEnabled = True", + "Evidence_Source": "Exchange Online PowerShell Get-AdminAuditLogConfig", + "Mapped_Control": "CIS-3.1.1", + "Date_Captured": "30 April 2025", + "index": 7 + }, + { + "Evidence_ID": "EV-008", + "Evidence_Description": "Customer Lockbox setting \u2014 CustomerLockboxEnabled = True", + "Evidence_Source": "Microsoft Graph API /admin/serviceAnnouncement/settings", + "Mapped_Control": "CIS-7.3.1", + "Date_Captured": "30 April 2025", + "index": 8 + }, + { + "Evidence_ID": "EV-009", + "Evidence_Description": "OneDrive external sharing link expiration \u2014 ExternalUserExpireInDays = 30", + "Evidence_Source": "SharePoint Admin API /onedrive/sharingExpirationDays", + "Mapped_Control": "CIS-7.2.9", + "Date_Captured": "30 April 2025", + "index": 9 + }, + { + "Evidence_ID": "EV-010", + "Evidence_Description": "Mailbox audit configuration \u2014 AuditEnabled = True but Send and HardDelete actions absent from default policy", + "Evidence_Source": "Exchange Online PowerShell Get-MailboxAuditBypassAssociation", + "Mapped_Control": "CIS-3.1.2", + "Date_Captured": "30 April 2025", + "index": 10 + }, + { + "Evidence_ID": "EV-011", + "Evidence_Description": "Third-party integrated app consent policy \u2014 set to require admin approval for verified publishers only", + "Evidence_Source": "Entra ID Admin Centre > Enterprise Applications > Consent and Permissions", + "Mapped_Control": "CIS-5.1.2", + "Date_Captured": "30 April 2025", + "index": 11 + }, + { + "Evidence_ID": "EV-012", + "Evidence_Description": "Intune MDM device configuration profiles \u2014 Android profile absent; iOS profile active", + "Evidence_Source": "Microsoft Graph API /deviceManagement/deviceConfigurations", + "Mapped_Control": "CIS-8.1.1", + "Date_Captured": "30 April 2025", + "index": 12 + } + ], + "controls": [ + { + "UniqueID": "CIS-1.1.1", + "Control_Name": "Ensure Multi-Factor Authentication is enabled for all users in 
administrative roles", + "Category": "Accounts & Authentication", + "CIS_Section": "1.1.1", + "ISO_Mapping": "A.9.4.2", + "Strategy": "Identity & Access Management", + "Sub_Strategy": "Privileged Account Protection", + "Test_id": "AAD-MFA-ADMIN-001", + "Level": "L1", + "Compliance_Status": "Non-Compliant", + "Risk_Rating": "Critical", + "Priority": "Immediate Action", + "Pass_Fail": "FAIL", + "Description": "All accounts assigned administrative roles must have MFA enforced via Conditional Access policy. Per-user MFA legacy settings are insufficient as they can be bypassed.", + "Observations": "3 of 5 Global Administrator accounts (admin@contoso.com, svc-admin@contoso.com, backup-admin@contoso.com) do not have MFA enforced via any active Conditional Access policy. Per-user MFA is partially enabled but no CA policy targets the Global Administrator role.", + "Justification": "API query to /reports/authenticationMethods/userRegistrationDetails confirmed 3 accounts with isMfaRegistered = false and no Conditional Access policy with grantControls.builtInControls containing 'mfa' targeting the Global Administrator directory role was found.", + "Evidence_Type": "API JSON output", + "File_Name": "userRegistrationDetails_20250430.json", + "Extract": "{ \"id\": \"svc-admin@contoso.com\", \"isMfaRegistered\": false, \"isMfaCapable\": false }", + "Confidence": "High", + "Evidence_Explanation": "The API response confirms the service account used for automated tasks holds Global Administrator privileges and has no MFA registration or enforcement in place.", + "Impact": "An attacker obtaining valid credentials for any of the three affected admin accounts via phishing, credential stuffing, or password spray would gain unrestricted Global Administrator access to the entire M365 tenant with no additional authentication barrier. 
This enables full data exfiltration, mailbox access, and tenant configuration modification.", + "Root_Cause": "MFA was not enforced when the service accounts were provisioned. Legacy per-user MFA settings were applied to interactive user accounts but were never extended to service and backup admin accounts. No Conditional Access policy scoped to administrative roles was created.", + "Remediation": "Create a Conditional Access policy requiring MFA for all users assigned Global Administrator, Privileged Role Administrator, Exchange Administrator, and SharePoint Administrator roles. Exclude only break-glass accounts protected by alternative controls. Disable legacy per-user MFA settings once CA policy is verified.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "14 May 2025", + "Remediation_Status": "Open", + "Pass_Fail_Label": "FAIL" + }, + { + "UniqueID": "CIS-2.1.1", + "Control_Name": "Ensure Legacy Authentication is Blocked", + "Category": "Accounts & Authentication", + "CIS_Section": "2.1.1", + "ISO_Mapping": "A.9.4.3", + "Strategy": "Identity & Access Management", + "Sub_Strategy": "Authentication Protocol Hardening", + "Test_id": "AAD-LEGACYAUTH-001", + "Level": "L1", + "Compliance_Status": "Non-Compliant", + "Risk_Rating": "Critical", + "Priority": "Immediate Action", + "Pass_Fail": "FAIL", + "Description": "Legacy authentication protocols such as IMAP, POP3, SMTP AUTH, and Basic Auth do not support MFA and cannot be protected by Conditional Access. These protocols must be blocked tenant-wide.", + "Observations": "No Conditional Access policy blocking legacy authentication protocols was found in the tenant. SMTP AUTH remains enabled at the organisation level. 
Entra ID sign-in logs show 47 legacy auth sign-in attempts in the 30-day assessment window.", + "Justification": "Enumeration of all Conditional Access policies via /identity/conditionalAccess/policies found no policy with conditions.clientAppTypes containing 'exchangeActiveSync' or 'other' combined with a block grant control. Sign-in logs confirmed active legacy auth usage.", + "Evidence_Type": "API JSON output + Sign-in log extract", + "File_Name": "conditionalAccessPolicies_20250430.json", + "Extract": "No policy found matching: clientAppTypes=[exchangeActiveSync, other] AND grantControls.operator=block", + "Confidence": "High", + "Evidence_Explanation": "The absence of any blocking Conditional Access policy, combined with active legacy auth sign-ins in the log data, confirms the control is not in place and that legacy protocols are actively being used.", + "Impact": "Legacy authentication protocols bypass MFA entirely. Attackers can conduct password spray attacks against Exchange ActiveSync or Basic Auth endpoints with no lockout or MFA challenge. This is the most common attack vector for M365 account compromise.", + "Root_Cause": "No Conditional Access policy was created to block legacy authentication at the time of tenant provisioning. SMTP AUTH was left enabled for a legacy line-of-business application and was never reviewed.", + "Remediation": "1. Create a Conditional Access policy targeting All Users, All Cloud Apps, with client app types set to Exchange ActiveSync and Other clients, and grant control set to Block. 2. Disable SMTP AUTH at the organisation level and re-enable only for specific accounts that genuinely require it via per-mailbox override. 3. 
Review the 47 legacy auth sign-in accounts and migrate to modern authentication.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "14 May 2025", + "Remediation_Status": "Open", + "Pass_Fail_Label": "FAIL" + }, + { + "UniqueID": "CIS-2.1.9", + "Control_Name": "Ensure DMARC policy for domains is enabled and set to quarantine or reject", + "Category": "Email / Exchange Online", + "CIS_Section": "2.1.9", + "ISO_Mapping": "A.13.2.3", + "Strategy": "Email Security", + "Sub_Strategy": "Anti-Spoofing Controls", + "Test_id": "EXO-DMARC-001", + "Level": "L1", + "Compliance_Status": "Non-Compliant", + "Risk_Rating": "High", + "Priority": "Within 30 Days", + "Pass_Fail": "FAIL", + "Description": "DMARC (Domain-based Message Authentication, Reporting and Conformance) must be configured with a policy of p=quarantine or p=reject to prevent spoofed emails from being delivered to recipients.", + "Observations": "The DMARC TXT record for contoso.com is configured with p=none. This means spoofed emails purporting to be from @contoso.com will be delivered without quarantine or rejection. SPF and DKIM records are correctly configured.", + "Justification": "DNS TXT lookup for _dmarc.contoso.com returned: v=DMARC1; p=none; rua=mailto:dmarc-reports@contoso.com. The p=none policy provides reporting only and no enforcement action.", + "Evidence_Type": "DNS TXT record", + "File_Name": "dns_dmarc_contoso_20250430.txt", + "Extract": "v=DMARC1; p=none; rua=mailto:dmarc-reports@contoso.com", + "Confidence": "High", + "Evidence_Explanation": "The DNS record is unambiguous. p=none instructs receiving mail servers to take no action on DMARC failures, rendering the control ineffective for spoofing prevention.", + "Impact": "Threat actors can send phishing or business email compromise (BEC) emails that appear to originate from @contoso.com addresses. 
Recipients inside and outside the organisation have no technical protection from these spoofed messages.", + "Root_Cause": "DMARC was initially deployed in monitoring mode (p=none) to review reporting data without impacting legitimate mail flow. The policy was never progressed to enforcement mode after the initial monitoring period.", + "Remediation": "1. Review DMARC aggregate reports to identify any legitimate mail sources not yet covered by SPF or DKIM. 2. Update the DMARC TXT record to p=quarantine as an intermediate step. 3. After 30 days of monitoring, escalate to p=reject. Target record: v=DMARC1; p=quarantine; rua=mailto:dmarc-reports@contoso.com; pct=100.", + "Recommendations": "IT / DNS Team", + "Owner": "IT / DNS Team", + "Target_Date": "31 May 2025", + "Remediation_Status": "Open", + "Pass_Fail_Label": "FAIL" + }, + { + "UniqueID": "CIS-3.1.2", + "Control_Name": "Ensure mailbox auditing for all users is enabled with complete action coverage", + "Category": "Auditing", + "CIS_Section": "3.1.2", + "ISO_Mapping": "A.12.4.1", + "Strategy": "Logging & Monitoring", + "Sub_Strategy": "Mailbox Audit Coverage", + "Test_id": "EXO-MBOXAUDIT-001", + "Level": "L1", + "Compliance_Status": "Partially Compliant", + "Risk_Rating": "High", + "Priority": "Within 30 Days", + "Pass_Fail": "FAIL", + "Description": "Mailbox auditing must be enabled for all user mailboxes and must include the Send, SendAs, HardDelete, and MailItemsAccessed actions to ensure complete forensic capability.", + "Observations": "Mailbox auditing is enabled at the organisation level. However, the default audit action set does not include 'Send' or 'HardDelete' for user mailboxes. MailItemsAccessed is only captured for E5-licensed accounts (14 of 47 users).", + "Justification": "Get-MailboxAuditBypassAssociation and Get-Mailbox -ResultSize Unlimited confirm AuditEnabled = True globally, but AuditOwner action list excludes HardDelete and Send. 
MailItemsAccessed requires E5 or Compliance add-on licence not assigned to 33 user accounts.", + "Evidence_Type": "PowerShell command output", + "File_Name": "mailbox_audit_config_20250430.txt", + "Extract": "AuditOwner: {Update, MoveToDeletedItems, SoftDelete, UpdateCalendarDelegation}", + "Confidence": "High", + "Evidence_Explanation": "The AuditOwner action list confirms that permanent deletion (HardDelete) and sent-item actions (Send) are not captured, creating blind spots for insider threat and data exfiltration investigations.", + "Impact": "Incident response investigations following a compromised account will have incomplete audit trails. Specifically, investigators will be unable to determine what emails were sent from or permanently deleted from a compromised mailbox, limiting forensic capability and potentially violating compliance obligations.", + "Root_Cause": "Default Exchange Online audit actions were never customised from Microsoft defaults. The Send and HardDelete actions require explicit addition to the AuditOwner and AuditDelegate action sets. Licence gaps prevent MailItemsAccessed from being captured for all users.", + "Remediation": "1. Run: Set-Mailbox -Identity * -AuditOwner @{Add='Send','HardDelete'} -AuditDelegate @{Add='Send','HardDelete'} to add missing actions. 2. Assess licence upgrade or Compliance add-on to extend MailItemsAccessed coverage. 3. 
Verify changes via Get-Mailbox | Select AuditOwner.", + "Recommendations": "Exchange Admin", + "Owner": "Exchange Admin", + "Target_Date": "31 May 2025", + "Remediation_Status": "Open", + "Pass_Fail_Label": "FAIL" + }, + { + "UniqueID": "CIS-1.1.14", + "Control_Name": "Ensure the password expiration policy is set in accordance with CIS recommendations", + "Category": "Users", + "CIS_Section": "1.1.14", + "ISO_Mapping": "A.9.4.3", + "Strategy": "Identity & Access Management", + "Sub_Strategy": "Password Policy", + "Test_id": "AAD-PWDEXPIRY-001", + "Level": "L1", + "Compliance_Status": "Non-Compliant", + "Risk_Rating": "Medium", + "Priority": "Within 90 Days", + "Pass_Fail": "FAIL", + "Description": "CIS recommends either disabling password expiry entirely (in conjunction with strong MFA and breach password detection) or setting a maximum password age of no less than 365 days. Short rotation periods drive predictable password patterns.", + "Observations": "The tenant password policy has PasswordNeverExpires set to false with a maximum password age of 90 days. This contradicts current NIST SP 800-63B and CIS guidance, which discourages arbitrary rotation in favour of breach detection-triggered resets.", + "Justification": "Microsoft Graph API query to /domains returned: passwordValidityPeriodInDays = 90, passwordNotificationWindowInDays = 14. PasswordNeverExpires = false.", + "Evidence_Type": "API JSON output", + "File_Name": "domain_password_policy_20250430.json", + "Extract": "{ \"passwordValidityPeriodInDays\": 90, \"passwordNotificationWindowInDays\": 14, \"isPasswordSyncEnabled\": false }", + "Confidence": "High", + "Evidence_Explanation": "The 90-day expiry policy is confirmed. This does not represent an immediately exploitable vulnerability but increases risk of predictable password cycling and user workarounds.", + "Impact": "Users subjected to frequent mandatory rotation historically adopt predictable increment patterns (e.g., Password1! \u2192 Password2!). 
This reduces the effective entropy of credentials and can be exploited in targeted attacks. Risk is partially mitigated if MFA is fully enforced.", + "Root_Cause": "Password expiry policy was set to 90 days at tenant provisioning and has not been reviewed against updated guidance. The organisation's IT policy predates NIST 800-63B revision.", + "Remediation": "Update the domain password policy to either: (a) Set passwordValidityPeriodInDays to 2147483647 (effectively never, relying on breach detection and MFA), or (b) Extend to a minimum of 365 days. Ensure Entra ID Password Protection and leaked credential detection are enabled before removing expiry.", + "Recommendations": "IT Operations", + "Owner": "IT Operations", + "Target_Date": "30 June 2025", + "Remediation_Status": "Open", + "Pass_Fail_Label": "FAIL" + }, + { + "UniqueID": "CIS-8.1.1", + "Control_Name": "Ensure mobile device management policies are configured for Android devices", + "Category": "Mobile Device Management", + "CIS_Section": "8.1.1", + "ISO_Mapping": "A.6.2.1", + "Strategy": "Endpoint Security", + "Sub_Strategy": "Mobile Device Management", + "Test_id": "MDM-ANDROID-001", + "Level": "L1", + "Compliance_Status": "Non-Compliant", + "Risk_Rating": "Low", + "Priority": "Monitor & Review", + "Pass_Fail": "FAIL", + "Description": "An MDM device configuration profile must be assigned in Intune for Android devices, enforcing minimum PIN length, device encryption, screen lock timeout, and block of rooted devices.", + "Observations": "An iOS MDM security baseline profile is active and assigned to all iOS devices. No equivalent profile exists for Android devices. 9 Android devices are enrolled in Intune without any configuration policy applied.", + "Justification": "Microsoft Graph API query to /deviceManagement/deviceConfigurations returned one active profile (iOS General Device Restrictions). No profile with oDataType containing 'android' was found. 
Enrolled Android device count confirmed via /deviceManagement/managedDevices.", + "Evidence_Type": "API JSON output", + "File_Name": "intune_device_configs_20250430.json", + "Extract": "{ \"value\": [ { \"@odata.type\": \"#microsoft.graph.iosGeneralDeviceConfiguration\", \"displayName\": \"iOS Security Baseline\", \"id\": \"a1b2c3d4\" } ] }", + "Confidence": "High", + "Evidence_Explanation": "The API response lists only an iOS configuration profile. The absence of any Android profile entry confirms the gap. The 9 enrolled Android devices are operating without any enforced security baseline.", + "Impact": "Android devices with access to corporate M365 data (Exchange email, SharePoint, OneDrive) are not subject to any enforced encryption, PIN, or lock-screen policy. A lost or stolen Android device could expose corporate data without any technical barrier.", + "Root_Cause": "The iOS MDM profile was deployed when Intune was first configured. Android device support was added later and the equivalent profile was not created. The gap was not identified in previous reviews.", + "Remediation": "Create an Android Device Restrictions configuration profile in Intune enforcing: minimum 6-digit PIN, device encryption required, screen lock after 5 minutes, block rooted devices, require device compliance for M365 app access. 
Assign the profile to the All Devices group.", + "Recommendations": "Endpoint Team", + "Owner": "Endpoint Team", + "Target_Date": "30 June 2025", + "Remediation_Status": "Open", + "Pass_Fail_Label": "FAIL" + }, + { + "UniqueID": "CIS-1.1.3", + "Control_Name": "Ensure that between two and four global admins are designated", + "Category": "Accounts & Authentication", + "CIS_Section": "1.1.3", + "ISO_Mapping": "A.9.2.3", + "Strategy": "Identity & Access Management", + "Sub_Strategy": "Privileged Role Governance", + "Test_id": "AAD-GADMIN-COUNT-001", + "Level": "L1", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "Microsoft recommends between 2 and 4 Global Administrator accounts. Fewer than 2 creates a single point of failure; more than 4 unnecessarily expands the privileged attack surface.", + "Observations": "5 Global Administrator accounts are currently assigned. This marginally exceeds the recommended maximum of 4. However, one account (backup-admin@contoso.com) is noted as a break-glass account with restricted access.", + "Justification": "Graph API /directoryRoles/{GlobalAdmin}/members returned 5 accounts. The break-glass account characteristics partially mitigate the excess.", + "Evidence_Type": "API JSON output", + "File_Name": "globalAdmins_20250430.json", + "Extract": "{ \"value\": [ { \"displayName\": \"Admin1\" }, { \"displayName\": \"Admin2\" }, { \"displayName\": \"Admin3\" }, { \"displayName\": \"svc-admin\" }, { \"displayName\": \"backup-admin\" } ] }", + "Confidence": "Medium", + "Evidence_Explanation": "Five accounts confirmed. Assessed as PASS given break-glass account mitigates the excess count, but the organisation should review whether all five accounts require full Global Admin.", + "Impact": "N/A \u2014 Passed with observation. 
Minor risk from excess Global Admin count noted.", + "Root_Cause": "N/A", + "Remediation": "Review whether the service account (svc-admin@contoso.com) can be scoped to a lower privilege role. Target 4 accounts maximum.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "30 June 2025", + "Remediation_Status": "Advisory", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-2.1.6", + "Control_Name": "Ensure Safe Attachments policy is enabled for all users", + "Category": "Email / Exchange Online", + "CIS_Section": "2.1.6", + "ISO_Mapping": "A.12.2.1", + "Strategy": "Email Security", + "Sub_Strategy": "Malware Protection", + "Test_id": "EXO-SAFEATT-001", + "Level": "L2", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "Microsoft Defender for Office 365 Safe Attachments must be enabled with a policy covering all users to detonate and inspect email attachments in a sandbox prior to delivery.", + "Observations": "A Safe Attachments policy named 'Contoso-SafeAtt-All' is active, set to Dynamic Delivery mode, and assigned to the All Recipients condition. Policy has been active for 14 months.", + "Justification": "Get-SafeAttachmentPolicy returned one active policy with Enable = True, Action = DynamicDelivery, Applied = All.", + "Evidence_Type": "PowerShell command output", + "File_Name": "safeAttachments_policy_20250430.txt", + "Extract": "Name: Contoso-SafeAtt-All | Enable: True | Action: DynamicDelivery | Applied: AllRecipients", + "Confidence": "High", + "Evidence_Explanation": "Policy confirmed active and covering all recipients. Dynamic Delivery ensures users receive emails promptly while attachments are scanned, minimising productivity impact.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required. 
Maintain current configuration.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-3.1.1", + "Control_Name": "Ensure audit log search is enabled", + "Category": "Configuration", + "CIS_Section": "3.1.1", + "ISO_Mapping": "A.12.4.1", + "Strategy": "Logging & Monitoring", + "Sub_Strategy": "Audit Configuration", + "Test_id": "EXO-AUDITLOG-001", + "Level": "L1", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "Unified audit log search must be enabled to capture user and admin activity across Exchange Online, SharePoint, Teams, and Entra ID. This is foundational for incident response and compliance reporting.", + "Observations": "Unified audit logging is enabled. Audit retention is set to 90 days (E3 licence default). Audit records are being ingested into the tenant's Sentinel workspace for extended retention.", + "Justification": "Get-AdminAuditLogConfig returned UnifiedAuditLogIngestionEnabled = True. Microsoft Sentinel data connector confirmed active via Azure Monitor workspace configuration.", + "Evidence_Type": "PowerShell command output", + "File_Name": "auditlog_config_20250430.txt", + "Extract": "UnifiedAuditLogIngestionEnabled: True | AdminAuditLogEnabled: True | MailboxLoggingEnabled: True", + "Confidence": "High", + "Evidence_Explanation": "Audit logging confirmed enabled across all workloads. Sentinel integration provides extended retention beyond the default 90-day M365 window.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required. 
Consider upgrading to E5 Compliance for 1-year default audit retention if Sentinel integration is ever discontinued.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-5.1.2", + "Control_Name": "Ensure user consent to apps accessing company data is restricted", + "Category": "Application Permissions", + "CIS_Section": "5.1.2", + "ISO_Mapping": "A.9.4.5", + "Strategy": "Application Security", + "Sub_Strategy": "OAuth Consent Governance", + "Test_id": "AAD-APPCONSENT-001", + "Level": "L1", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "User consent to third-party applications accessing M365 data must be restricted. Permitted consent models are: admin consent only, or user consent restricted to verified publishers with low-impact permissions.", + "Observations": "The tenant consent policy is set to 'Allow user consent for apps from verified publishers for selected permissions (low impact)'. Admin consent workflow is enabled, routing requests to the IT Security Team.", + "Justification": "Entra ID Enterprise Applications > Consent and Permissions settings confirmed via Graph API /policies/authorizationPolicy: permissionGrantPoliciesAssigned = [managePermissionGrantsForSelf.microsoft-user-default-low].", + "Evidence_Type": "API JSON output", + "File_Name": "authorizationPolicy_20250430.json", + "Extract": "{ \"permissionGrantPoliciesAssigned\": [\"managePermissionGrantsForSelf.microsoft-user-default-low\"] }", + "Confidence": "High", + "Evidence_Explanation": "Policy confirmed. Users can only consent to low-impact permissions from verified publishers. All other consent requires admin approval, preventing illicit consent grant attacks.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required. 
Periodically review admin consent request queue to ensure timely processing.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-7.2.2", + "Control_Name": "Ensure SharePoint Online external sharing is restricted to existing external users", + "Category": "Data Management", + "CIS_Section": "7.2.2", + "ISO_Mapping": "A.13.2.3", + "Strategy": "Data Protection", + "Sub_Strategy": "External Sharing Controls", + "Test_id": "SPO-EXTSHARE-001", + "Level": "L2", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "SharePoint Online external sharing must be set to allow sharing only with existing external users already in the organisation's directory, preventing ad-hoc sharing with unknown recipients.", + "Observations": "SharePoint Online SharingCapability is set to ExistingExternalUserSharingOnly. This was confirmed for both the tenant-level and OneDrive-level settings.", + "Justification": "SharePoint Admin API returned: SharingCapability = ExistingExternalUserSharingOnly at tenant level. OneDrive SharingCapability confirmed as consistent.", + "Evidence_Type": "API JSON output", + "File_Name": "sharepoint_sharing_20250430.json", + "Extract": "{ \"SharingCapability\": \"ExistingExternalUserSharingOnly\", \"OneDriveSharingCapability\": \"ExistingExternalUserSharingOnly\" }", + "Confidence": "High", + "Evidence_Explanation": "Both SharePoint and OneDrive sharing controls confirmed compliant. Users cannot share with new external parties without an admin provisioning the external user first.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required. 
Review sharing policy annually or following any M365 licence tier changes.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-7.3.1", + "Control_Name": "Ensure Customer Lockbox is enabled", + "Category": "Data Management", + "CIS_Section": "7.3.1", + "ISO_Mapping": "A.15.1.2", + "Strategy": "Data Protection", + "Sub_Strategy": "Microsoft Support Access Controls", + "Test_id": "M365-LOCKBOX-001", + "Level": "L2", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "Customer Lockbox ensures that Microsoft support engineers must obtain explicit customer approval before accessing any customer content in M365 during a support engagement.", + "Observations": "Customer Lockbox is enabled at the tenant level. The designated approvers are the two active Global Administrators.", + "Justification": "Graph API /admin/serviceAnnouncement/settings returned: isCustomerLockboxEnabled = true.", + "Evidence_Type": "API JSON output", + "File_Name": "tenant_admin_settings_20250430.json", + "Extract": "{ \"isCustomerLockboxEnabled\": true }", + "Confidence": "High", + "Evidence_Explanation": "Setting confirmed enabled. Customer Lockbox requires an E5 licence or Compliance add-on, confirming appropriate licencing is in place for this control.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required. 
Ensure Lockbox approval process is documented in the organisation's incident response runbook.", + "Recommendations": "CISO", + "Owner": "CISO", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-7.2.9", + "Control_Name": "Ensure external sharing links for OneDrive expire within 30 days", + "Category": "Storage", + "CIS_Section": "7.2.9", + "ISO_Mapping": "A.13.2.3", + "Strategy": "Data Protection", + "Sub_Strategy": "Link Expiration Controls", + "Test_id": "OD-LINKEXPIRY-001", + "Level": "L2", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "External sharing links for OneDrive must be configured to expire, with a maximum expiration window of 30 days, to limit prolonged unmanaged access to shared files.", + "Observations": "OneDrive external sharing link expiration is set to 30 days (ExternalUserExpireInDays = 30). Anonymous link expiration is also set to 30 days.", + "Justification": "SharePoint Admin API returned: ExternalUserExpireInDays = 30, AnonymousLinkExpirationInDays = 30.", + "Evidence_Type": "API JSON output", + "File_Name": "onedrive_sharing_settings_20250430.json", + "Extract": "{ \"ExternalUserExpireInDays\": 30, \"AnonymousLinkExpirationInDays\": 30 }", + "Confidence": "High", + "Evidence_Explanation": "Both external user and anonymous link expiration periods confirmed at 30 days, meeting the CIS requirement.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-1.1.15", + "Control_Name": "Ensure user role assignments are reviewed and excess privileges removed", + "Category": "Users", + "CIS_Section": "1.1.15", + "ISO_Mapping": "A.9.2.5", + "Strategy": "Identity & Access Management", + 
"Sub_Strategy": "Least Privilege Enforcement", + "Test_id": "AAD-ROLEASSIGN-001", + "Level": "L1", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "User role assignments should be reviewed periodically to ensure no accounts hold unnecessary administrative privileges. Privileged Identity Management (PIM) should be used where available.", + "Observations": "PIM is active for all administrative roles. Eligible assignments require justification and MFA step-up for activation. No permanent active assignments exist outside of break-glass accounts. Last access review was completed 22 March 2025.", + "Justification": "Graph API /privilegedAccess/aadRoles/resources/{tenantId}/roleAssignments confirmed all admin roles use eligible assignment type. roleAssignmentScheduleInstances confirmed no permanent active non-break-glass admin assignments.", + "Evidence_Type": "API JSON output", + "File_Name": "pim_role_assignments_20250430.json", + "Extract": "{ \"assignmentType\": \"Eligible\", \"scheduleInfo\": { \"expiration\": { \"type\": \"noExpiration\" } } }", + "Confidence": "High", + "Evidence_Explanation": "PIM eligible assignments confirmed. All role activations require MFA and business justification, consistent with CIS and Microsoft best practice.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required. 
Maintain quarterly access review cadence.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + }, + { + "UniqueID": "CIS-3.1.3", + "Control_Name": "Ensure Microsoft Defender for Cloud Apps is enabled and alerts are reviewed", + "Category": "Auditing", + "CIS_Section": "3.1.3", + "ISO_Mapping": "A.12.4.1", + "Strategy": "Logging & Monitoring", + "Sub_Strategy": "Cloud Application Security Monitoring", + "Test_id": "MDCA-ENABLE-001", + "Level": "L2", + "Compliance_Status": "Compliant", + "Risk_Rating": "N/A", + "Priority": "N/A", + "Pass_Fail": "PASS", + "Description": "Microsoft Defender for Cloud Apps must be enabled and connected to M365 to provide anomaly detection, shadow IT discovery, and session policy enforcement.", + "Observations": "Defender for Cloud Apps is active with the M365 app connector enabled. Anomaly detection policies are in default state with 3 active custom alert policies. Alert review cadence is weekly per SOC procedures.", + "Justification": "Defender for Cloud Apps admin portal confirmed app connector status = Connected for Microsoft 365. Custom alert policies confirmed via API.", + "Evidence_Type": "Portal configuration screenshot + API output", + "File_Name": "mdca_connector_20250430.json", + "Extract": "{ \"appId\": 11161, \"name\": \"Microsoft 365\", \"status\": \"Connected\", \"lastSynced\": \"2025-04-30T06:00:00Z\" }", + "Confidence": "High", + "Evidence_Explanation": "M365 connector confirmed active and syncing. Anomaly detection is operational. Weekly alert review provides adequate coverage for the current threat posture.", + "Impact": "N/A \u2014 Compliant.", + "Root_Cause": "N/A", + "Remediation": "No action required. 
Consider enabling session controls for high-risk users once Conditional Access gaps are remediated.", + "Recommendations": "IT Security Team", + "Owner": "IT Security Team", + "Target_Date": "N/A", + "Remediation_Status": "Closed", + "Pass_Fail_Label": "PASS" + } + ] +} \ No newline at end of file diff --git a/security/reports/generate_report_from_scan.py b/security/reports/generate_report_from_scan.py new file mode 100644 index 000000000..5202c9a14 --- /dev/null +++ b/security/reports/generate_report_from_scan.py @@ -0,0 +1,538 @@ +""" +generate_report_from_scan.py +============================= +Transforms AutoAudit API scan results into the dataset schema expected by +report_service.py and generates a compliance report (.docx or PDF). + +Works with any tenant — all tenant metadata is derived from the scan data +or supplied via CLI arguments. No hardcoded values. + +Usage (two modes): +------------------ + +1. From saved JSON files (offline): + + python generate_report_from_scan.py \\ + --results scan_results.json \\ + --meta scan_meta.json \\ + --template AutoAudit_Report_Template.docx \\ + --output reports_out + +2. Live from API (fetches scan data directly): + + python generate_report_from_scan.py \\ + --api-url http://localhost:8000 \\ + --token \\ + --scan-id 1 \\ + --template AutoAudit_Report_Template.docx \\ + --output reports_out + +Add --pdf to also convert the output to PDF. +Add --save-dataset to save the intermediate transformed JSON for inspection. 
# Remainder of the module usage notes (the docstring itself opens before this span):
#
# Environment variables (alternative to --token):
#     AUTOAUDIT_TOKEN   Bearer token for API authentication
#
# Examples:
# ---------
#     # Offline from files
#     python generate_report_from_scan.py --results real_scan_results.json --meta scan_meta.json
#
#     # Live from running API
#     python generate_report_from_scan.py --api-url http://localhost:8000 --token $TOKEN --scan-id 1
#
#     # Live + PDF output
#     python generate_report_from_scan.py --api-url http://localhost:8000 --token $TOKEN --scan-id 1 --pdf

from __future__ import annotations

import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

# ---------------------------------------------------------------------------
# API helpers
# ---------------------------------------------------------------------------

def _api_get(base_url: str, path: str, token: str, timeout: float = 30.0) -> dict | list:
    """Send an authenticated GET to the AutoAudit API and decode the JSON reply.

    Args:
        base_url: API root such as ``http://localhost:8000``; trailing slashes
            are stripped before ``path`` is appended.
        path: Endpoint path, appended verbatim to ``base_url``.
        token: Bearer token placed in the ``Authorization`` header.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The decoded JSON payload.

    Raises:
        SystemExit: With exit code 1, after reporting on stderr, when the URL
            is malformed, the request fails or times out, or the response body
            is not valid JSON.
    """
    # Kept at function scope, as the original helper did with urllib.request.
    import http.client
    import urllib.request

    url = f"{base_url.rstrip('/')}{path}"
    try:
        request = urllib.request.Request(
            url,
            headers={"Authorization": f"Bearer {token}"},
        )
        # Without a timeout a stalled server would block the CLI forever.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return json.loads(response.read().decode())
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # OSError covers URLError/HTTPError and socket timeouts; ValueError
        # covers malformed URLs, undecodable bytes and invalid JSON.
        # Anything else is a programming error and is left to propagate.
        print(f"ERROR: API request failed for {path}: {exc}", file=sys.stderr)
        sys.exit(1)


def fetch_scan_data(api_url: str, token: str, scan_id: int) -> tuple[dict, list]:
    """Fetch scan metadata and control results for one scan from the API.

    Args:
        api_url: API root URL.
        token: Bearer token used for both requests.
        scan_id: Identifier interpolated into the ``/v1/scans/{id}`` paths.

    Returns:
        A ``(meta, results)`` tuple: the scan metadata object and the list of
        control results.

    Raises:
        SystemExit: With exit code 1 when a request fails or when a payload
            does not have the expected JSON type.
    """
    print(f"Fetching scan #{scan_id} from {api_url} ...")

    meta = _api_get(api_url, f"/v1/scans/{scan_id}", token)
    if not isinstance(meta, dict):
        print(
            f"ERROR: expected a JSON object for scan #{scan_id} metadata, "
            f"got {type(meta).__name__}",
            file=sys.stderr,
        )
        sys.exit(1)

    results = _api_get(api_url, f"/v1/scans/{scan_id}/results", token)
    if not isinstance(results, list):
        print(
            f"ERROR: expected a JSON array for scan #{scan_id} results, "
            f"got {type(results).__name__}",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f" Status: {meta.get('status')} | "
          f"Pass: {meta.get('passed_count')} | "
          f"Fail: {meta.get('failed_count')} | "
          f"Score: {meta.get('compliance_score')}%")
    return meta, results


# ---------------------------------------------------------------------------
# Severity classification
#
--------------------------------------------------------------------------- + +# Controls classified as Critical based on CIS M365 benchmark importance +_CRITICAL_CONTROLS = { + "1.1.3", # Global admin count + "5.2.3.4", # MFA registration + "5.2.3.6", # System-preferred MFA + "5.1.5.1", # User consent to apps + "5.1.6.1", # Cross-tenant collaboration + "5.1.6.3", # Guest invitations + "5.3.2", # Guest access reviews + "5.3.3", # Privileged role access reviews + "5.3.4", # Global admin PIM approval + "5.3.5", # Privileged Role Admin PIM approval +} + +_HIGH_CONTROLS = { + "2.1.11", # Attachment filtering + "4.1", # Intune compliance defaults + "4.2", # Personal device enrollment + "5.1.3.1", # Dynamic guest group + "5.1.4.1", # Device join restriction + "5.2.3.2", # Custom banned password list + "5.2.3.3", # On-prem password protection + "5.2.3.7", # Email OTP + "6.1.2", # Mailbox audit actions + "6.3.1", # Role assignment policy add-ins + "5.1.4.6", # BitLocker key access + "5.1.6.2", # Guest user access (if failed) + "5.1.5.2", # Admin consent workflow + "5.2.2.3", # Legacy auth block + "5.2.3.1", # MFA fatigue protection +} + + +def _severity(control_id: str, status: str) -> str: + if status != "failed": + return "Info" + if control_id in _CRITICAL_CONTROLS: + return "Critical" + if control_id in _HIGH_CONTROLS: + return "High" + return "Medium" + + +# --------------------------------------------------------------------------- +# Core transform +# --------------------------------------------------------------------------- + +def transform(results: list, meta: dict | None = None) -> dict: + """ + Convert AutoAudit API scan data into the report_service.py dataset schema. 
+ + Args: + results: List of control result dicts from GET /v1/scans/{id}/results + meta: Scan metadata dict from GET /v1/scans/{id} (optional but recommended) + """ + passed = [r for r in results if r["status"] == "passed"] + failed = [r for r in results if r["status"] == "failed"] + skipped = [r for r in results if r["status"] == "skipped"] + errored = [r for r in results if r["status"] == "error"] + + total_assessed = len(passed) + len(failed) + if meta and meta.get("compliance_score"): + # Use the API's calculated score if available + score_pct = float(meta["compliance_score"]) + score = f"{score_pct:.1f}%" + elif total_assessed: + score_pct = round(len(passed) / total_assessed * 100, 1) + score = f"{score_pct}%" + else: + score_pct = 0 + score = "N/A" + + # Risk posture based on score + if score_pct < 50: + risk = "CRITICAL" + elif score_pct < 70: + risk = "HIGH" + elif score_pct < 85: + risk = "MEDIUM" + else: + risk = "LOW" + + # Severity counts + critical_count = sum(1 for r in failed if _severity(r["control_id"], r["status"]) == "Critical") + high_count = sum(1 for r in failed if _severity(r["control_id"], r["status"]) == "High") + medium_count = sum(1 for r in failed if _severity(r["control_id"], r["status"]) == "Medium") + low_count = sum(1 for r in failed if _severity(r["control_id"], r["status"]) == "Low") + + # Dates + now = datetime.now() + date_str = now.strftime("%-d %B %Y") + date_full = now.strftime("%-d %B %Y %H:%M AEST") + + # Derive tenant info from meta or fall back to scan data + tenant_name = "Unknown Tenant" + tenant_domain = "unknown" + framework_ver = "v6.0.0" + started_at = date_str + finished_at = date_str + + if meta: + tenant_name = meta.get("connection_name") or tenant_name + framework_ver = meta.get("version") or framework_ver + if meta.get("started_at"): + try: + dt = datetime.fromisoformat(meta["started_at"].replace("Z", "+00:00")) + started_at = dt.strftime("%-d %B %Y") + except Exception: + pass + if meta.get("finished_at"): + 
try: + dt = datetime.fromisoformat(meta["finished_at"].replace("Z", "+00:00")) + finished_at = dt.strftime("%-d %B %Y") + except Exception: + pass + + # Try to derive domain from evidence in passed controls + for r in passed: + ev = r.get("evidence") or {} + if isinstance(ev, dict): + # Domain password policy evidence contains domain names + domains = ev.get("domains", []) + if domains and isinstance(domains, list) and domains[0].get("name"): + tenant_domain = domains[0]["name"] + break + # DKIM evidence + dkim_domains = ev.get("domains_with_dkim_enabled", []) + if dkim_domains: + tenant_domain = dkim_domains[0] + break + + # Build framework string + frameworks_used = f"CIS Microsoft 365 Foundations Benchmark {framework_ver}" + + # Controls list — all statuses included + controls = [] + for r in results: + evidence_str = json.dumps(r["evidence"], indent=2, default=str) if r["evidence"] else "N/A" + controls.append({ + "Control_ID": r["control_id"], + "Control_Name": r["control_id"], # Name requires benchmark lookup — not in results API + "Status": r["status"].upper(), + "Severity": _severity(r["control_id"], r["status"]), + "Description": r.get("message") or "", + "Evidence": evidence_str, + "Remediation": ( + f"Review and remediate control {r['control_id']} per {frameworks_used}." 
+ if r["status"] == "failed" else "" + ), + }) + + # Sort failed controls by severity for top risks + sev_order = {"Critical": 0, "High": 1, "Medium": 2, "Low": 3, "Info": 4} + sorted_failed = sorted( + failed, + key=lambda r: sev_order.get(_severity(r["control_id"], r["status"]), 5) + ) + top_failed = sorted_failed[:3] + + def risk_text(r: dict) -> str: + return r.get("message") or f"Control {r['control_id']} failed" + + # Evidence register — passed controls with real evidence (up to 10) + evidence_register = [] + ev_num = 1 + for r in passed: + if r.get("evidence") and ev_num <= 10: + evidence_register.append({ + "Evidence_ID": f"EV-{ev_num:03d}", + "Control_Reference": r["control_id"], + "Description": r.get("message") or "", + "Collection_Method": "Automated API scan via AutoAudit Engine", + "Date_Collected": finished_at, + }) + ev_num += 1 + + # Remediation plan — failed controls sorted by severity (up to 8) + remediation_plan = [] + for i, r in enumerate(sorted_failed[:8], 1): + sev = _severity(r["control_id"], r["status"]) + remediation_plan.append({ + "Item": i, + "Control_ID": r["control_id"], + "Finding": r.get("message") or "", + "Action": f"Remediate control {r['control_id']} per {frameworks_used} guidance.", + "Priority": sev, + "Owner": "IT Security Team", + }) + + # Key recommendation + if critical_count > 0: + key_rec = ( + f"{tenant_name} has {critical_count} Critical finding(s) requiring immediate remediation. " + f"The tenant scored {score} against {frameworks_used}. " + f"Critical findings represent unacceptable risk and must be addressed before the next assessment cycle." + ) + else: + key_rec = ( + f"{tenant_name} scored {score} against {frameworks_used}. " + f"{len(failed)} control(s) failed and require remediation. " + f"Priority should be given to High severity findings." 
+ ) + + top_action = remediation_plan[0]["Action"] if remediation_plan else "" + + dataset = { + "tenant": { + "Tenant_Name": tenant_name, + "Tenant_Domain": tenant_domain, + "Assessor_Name": "AutoAudit Platform", + "Frameworks_Used": frameworks_used, + "Assessment_Date": finished_at, + "Assessment_Period": f"{started_at} \u2013 {finished_at}", + "Generated_Date": date_full, + "Date_Generated": date_full, + "Classification": "Confidential", + "Report_Version": "v1.0", + "Distribution": "IT Security Team", + "Prepared_By": "AutoAudit Engine", + "Reviewed_By": "AutoAudit Platform", + "Team_Function": "Information Security", + "Scope_Owner": "IT Security Team", + "Limitations": ( + "Assessment conducted using read-only API access to Microsoft 365 and Entra ID. " + f"{len(skipped)} control(s) were skipped due to API limitations, manual verification " + "requirements, or incomplete collector implementation. " + f"{len(errored)} control(s) returned errors during evaluation." + if errored else + "Assessment conducted using read-only API access to Microsoft 365 and Entra ID. " + f"{len(skipped)} control(s) were skipped due to API limitations or manual verification requirements." 
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the scan-to-report command line.

    Three argument groups are registered: offline input (``--results``,
    ``--meta``), live input (``--api-url``, ``--token``, ``--scan-id``) and
    output options (``--template``, ``--output``, ``--pdf``, ``--keep-docx``,
    ``--save-dataset``).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    p = argparse.ArgumentParser(
        description="Generate an AutoAudit compliance report from scan results.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # From saved JSON files
  python generate_report_from_scan.py --results scan_results.json --meta scan_meta.json

  # Live from running API
  python generate_report_from_scan.py --api-url http://localhost:8000 --token $TOKEN --scan-id 1

  # Live + save dataset + PDF
  python generate_report_from_scan.py --api-url http://localhost:8000 --token $TOKEN --scan-id 1 --pdf --save-dataset
        """,
    )

    # Input: offline mode
    offline = p.add_argument_group("Offline mode (from saved JSON files)")
    offline.add_argument("--results", "-r", metavar="FILE",
                         help="Path to scan results JSON (from GET /v1/scans/{id}/results)")
    offline.add_argument("--meta", "-m", metavar="FILE",
                         help="Path to scan metadata JSON (from GET /v1/scans/{id}), optional")

    # Input: live mode
    live = p.add_argument_group("Live mode (fetch directly from API)")
    live.add_argument("--api-url", metavar="URL",
                      help="AutoAudit API base URL (e.g. http://localhost:8000)")
    live.add_argument("--token", metavar="TOKEN",
                      help="Bearer token (or set AUTOAUDIT_TOKEN env var)")
    live.add_argument("--scan-id", metavar="ID", type=int,
                      help="Scan ID to fetch and report on")

    # Output
    out = p.add_argument_group("Output options")
    out.add_argument("--template", "-t", metavar="FILE",
                     default="AutoAudit_Report_Template.docx",
                     help="Path to report template .docx (default: AutoAudit_Report_Template.docx)")
    out.add_argument("--output", "-o", metavar="DIR",
                     default="reports_out",
                     help="Output directory for generated report (default: reports_out)")
    out.add_argument("--pdf", action="store_true",
                     help="Also convert the report to PDF")
    out.add_argument("--keep-docx", action="store_true",
                     help="Keep .docx when --pdf is set (default: delete after conversion)")
    out.add_argument("--save-dataset", action="store_true",
                     help="Save the intermediate transformed JSON dataset for inspection")

    return p


def _fail(message: str) -> None:
    """Print ``ERROR: <message>`` to stderr and exit with status 1."""
    print(f"ERROR: {message}", file=sys.stderr)
    sys.exit(1)


def _load_json(path: Path):
    """Read *path* as UTF-8 JSON; exit with status 1 if it cannot be read or parsed."""
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        _fail(f"Could not read JSON from {path}: {exc}")


def main() -> None:
    """Command-line entry point: load scan data, transform it, write the report.

    Input comes either from the running API (``--api-url`` + ``--scan-id``,
    token from ``--token`` or ``AUTOAUDIT_TOKEN``) or from saved JSON files
    (``--results`` and optionally ``--meta``).  The transformed dataset is
    handed to ``report_service`` to produce a .docx or, with ``--pdf``, a PDF.

    Exits with status 2 on argument errors (via ``parser.error``) and with
    status 1 on missing/unreadable files or a report-generation failure.
    Error messages go to stderr; progress output goes to stdout.
    """
    parser = build_parser()
    args = parser.parse_args()

    results = None
    meta = None

    # --- Determine input mode ---
    # Compare scan_id against None rather than truthiness: 0 is a valid int
    # for argparse and must not silently switch the script to offline mode.
    if args.api_url or args.scan_id is not None:
        # Live mode
        if not args.api_url or args.scan_id is None:
            parser.error("--api-url and --scan-id are both required for live mode.")
        token = args.token or os.environ.get("AUTOAUDIT_TOKEN")
        if not token:
            parser.error("Provide --token or set the AUTOAUDIT_TOKEN environment variable.")
        meta, results = fetch_scan_data(args.api_url, token, args.scan_id)

    elif args.results:
        # Offline mode
        results_path = Path(args.results)
        if not results_path.exists():
            _fail(f"Results file not found: {results_path}")
        print(f"Loading results from: {results_path}")
        results = _load_json(results_path)

        if args.meta:
            meta_path = Path(args.meta)
            if not meta_path.exists():
                # Metadata is optional, so a missing file is only a warning.
                print(f"WARNING: Meta file not found: {meta_path} — tenant name will be generic.",
                      file=sys.stderr)
            else:
                print(f"Loading metadata from: {meta_path}")
                meta = _load_json(meta_path)
    else:
        parser.error("Provide either --results (offline) or --api-url + --scan-id (live).")

    # transform() iterates results as a list of dicts and calls meta.get();
    # reject other shapes here with a readable message instead of a traceback.
    if not isinstance(results, list):
        _fail(f"Scan results must be a JSON list of control results, got {type(results).__name__}.")
    if meta is not None and not isinstance(meta, dict):
        print("WARNING: Scan metadata is not a JSON object — ignoring it.", file=sys.stderr)
        meta = None

    # --- Validate template ---
    template_path = Path(args.template)
    if not template_path.exists():
        print(f"ERROR: Template not found: {template_path}", file=sys.stderr)
        print("Ensure AutoAudit_Report_Template.docx is in the current directory or pass --template.",
              file=sys.stderr)
        sys.exit(1)

    # --- Transform ---
    print(f"\nTransforming {len(results)} control results...")
    dataset = transform(results, meta)

    summary = dataset["summary"]
    tenant = dataset["tenant"]["Tenant_Name"]
    domain = dataset["tenant"]["Tenant_Domain"]

    print(f"  Tenant  : {tenant} ({domain})")
    print(f"  Score   : {summary['Overall_Score']}  |  Risk: {summary['Overall_Risk_Posture']}")
    print(f"  Pass: {summary['Total_Pass']}  |  Fail: {summary['Total_Fail']}  |  "
          f"Critical: {summary['Total_Critical']}  |  "
          f"High: {summary['Total_High']}")

    # --- Optionally save dataset ---
    if args.save_dataset:
        if args.results:
            dataset_out = Path(args.results).stem + "_dataset.json"
        else:
            dataset_out = f"scan_{args.scan_id}_dataset.json"
        with open(dataset_out, "w", encoding="utf-8") as f:
            json.dump(dataset, f, indent=2, default=str)
        print(f"\nDataset saved to: {dataset_out}")

    # --- Generate report ---
    print("\nGenerating report...")
    print(f"  Template : {template_path}")
    print(f"  Output   : {args.output}/")

    # report_service.py is expected to sit next to this script.
    sys.path.insert(0, str(Path(__file__).parent))
    try:
        import report_service as svc
    except ImportError:
        _fail("report_service.py not found. Place this script in the same folder.")

    try:
        if args.pdf:
            out = svc.generate_full_report_pdf(
                dataset,
                template_path=str(template_path),
                output_dir=args.output,
                keep_docx=args.keep_docx,
            )
            print(f"\n✓ PDF written to: {out}")
        else:
            out = svc.generate_full_report_docx(
                dataset,
                template_path=str(template_path),
                output_dir=args.output,
            )
            print(f"\n✓ Report written to: {out}")
            # Suggest a re-run that matches the mode actually used; in live
            # mode no results file exists on disk to point at.
            if args.results:
                rerun = f"--results {args.results} --pdf"
            else:
                rerun = f"--api-url {args.api_url} --scan-id {args.scan_id} --pdf"
            print(f"  Convert to PDF: python {Path(__file__).name} {rerun}")

    except FileNotFoundError as e:
        _fail(str(e))
    except Exception as e:
        # Top-level boundary: show the full traceback, then exit non-zero.
        import traceback
        print(f"ERROR during report generation: {e}", file=sys.stderr)
        traceback.print_exc()
        sys.exit(1)

    out_path = Path(out)
    if not out_path.exists():
        _fail("generation completed but no file was written.")

    size_kb = out_path.stat().st_size / 1024
    if size_kb < 1:
        print(f"WARNING: output is only {size_kb:.1f} KB — check the template path is correct.",
              file=sys.stderr)


if __name__ == "__main__":
    main()
+ +Quick start:: + + from report_service import generate_full_report_docx + import json + + with open("dataset.json") as f: + data = json.load(f) + + out = generate_full_report_docx(data) + # open out in Word, check layout, export to PDF + +For headless pipelines use generate_full_report_pdf() instead. + +Dataset schema +-------------- +The top-level keys the service reads are: + + tenant dict org name, domain, assessor, dates, etc. + summary dict scores, risk posture, strengths, top risks + controls list one dict per control (see _single_control_mapping) + evidence_register list evidence items for Appendix A (up to 10) + remediation_plan list remediation rows (up to 8) + +None of these are required — missing keys produce empty strings in the output. +Key names are normalised before lookup (lower-cased, underscores/hyphens/slashes +collapsed to spaces), so "Tenant_Name", "tenant name", and "tenant-name" all +resolve to the same value. + +Adding new tokens +----------------- +1. Add the {New_Token} placeholder to the Word template. +2. Add "New_Token": _pick(n, "new token", "new_token") to the relevant + mapping function below (_map_tenant, _map_summary, _single_control_mapping, + etc.). That's it — no other changes needed. + +PDF conversion +-------------- +Tries three methods in order: + 1. docx2pdf (requires Microsoft Word on Windows/macOS) + 2. LibreOffice headless (soffice must be on PATH) + 3. fpdf2 plain-text fallback (no layout fidelity, last resort) + +If none of those are available the call raises RuntimeError. 
+""" + from __future__ import annotations +import logging import os -import uuid +import re import subprocess +import uuid from datetime import datetime from pathlib import Path -from typing import Mapping, Any, Optional, Dict, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple from docx import Document +from docx.oxml import OxmlElement +from docx.oxml.ns import qn from docx.shared import Inches -from fpdf import FPDF +log = logging.getLogger(__name__) -def generate_pdf( - data: Mapping[str, Any], - *, - template_path: os.PathLike | str = "templates/report_template.docx", - output_dir: os.PathLike | str = "reports_out", - base_dir: os.PathLike | str = ".", - image_marker: str = "[Embed evidence here]", - unique_id_override: Optional[str] = None, -) -> Path: - """ - Render a single PDF from the in-memory mapping produced by the OCR/rules step. - - Expected keys in `data` (case/spacing tolerant): - UniqueID or UserID -> becomes UniqueID in template - Evidence -> path to original evidence file - Evidence Preview (optional) -> path to an image to embed - Strategy, TestID, Sub-Strategy, ML Level, Pass/Fail, Priority, - Recommendation -> Recommendations, Evidence Extract -> Extract - Description - Confidence - - Returns: Path to the generated PDF. - """ - mapping, embed_path, unique_id = _map_to_placeholders(data, Path(base_dir)) +# OOXML tag names we reference directly in lxml operations. 
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_W_T = f"{{{_W_NS}}}t"
_W_R = f"{{{_W_NS}}}r"
_W_P = f"{{{_W_NS}}}p"


# ---------------------------------------------------------------------------
# Text sanitisation
#
# WordprocessingML is Unicode, so the document itself accepts any character
# that XML 1.0 allows.  We still swap common "smart" punctuation for ASCII
# equivalents so output looks the same regardless of font or renderer.
# NOTE(review): this was probably motivated by the plain-text PDF fallback —
# confirm before removing any entry.
#
# XML 1.0 forbids most C0 control characters and lxml raises ValueError when
# asked to store them, so those are stripped as well (tab, LF and CR are
# legal and kept).
# ---------------------------------------------------------------------------

_CHAR_SUBS: Dict[str, str] = {
    "\u2019": "'", "\u2018": "'",      # smart quotes
    "\u201c": '"', "\u201d": '"',
    "\u2013": "-", "\u2014": "-",      # en/em dash
    "\u2022": "*", "\u2026": "...",    # bullet, ellipsis
    "\u00a0": " ", "\u2192": "->",     # NBSP, right arrow
}

# One translation table applied in a single pass instead of one
# str.replace() call per entry.
_CHAR_TABLE = str.maketrans(_CHAR_SUBS)

# Characters that may not appear in XML 1.0 text.
_XML_ILLEGAL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ufffe\uffff]")


def _sanitise(v: str) -> str:
    """Return *v* with smart punctuation made ASCII and XML-illegal characters removed.

    Args:
        v: Text destined for a Word text node.

    Returns:
        The cleaned string; an empty string stays empty.
    """
    return _XML_ILLEGAL_RE.sub("", v.translate(_CHAR_TABLE))


def _sanitise_mapping(m: Dict[str, str]) -> Dict[str, str]:
    """Return a new dict with every value of *m* passed through ``_sanitise``.

    Keys are left untouched because they are template token names.
    """
    return {k: _sanitise(v) for k, v in m.items()}
# ---------------------------------------------------------------------------
# Key normalisation
#
# Dataset keys arrive in all kinds of formats — snake_case, Title Case,
# kebab-case — so we normalise everything to lowercase space-separated words
# before comparing. _pick() applies the same normalisation to the names it is
# asked for, so callers may pass "tenant name", "tenant_name" or "Tenant-Name"
# interchangeably.
# ---------------------------------------------------------------------------

def _norm_key(name: Any) -> str:
    """Lower-case *name* and collapse ``_``, ``-``, ``/`` and whitespace runs to single spaces."""
    return " ".join(
        str(name).strip().lower()
        .replace("_", " ").replace("-", " ").replace("/", " ")
        .split()
    )


def _normalize_keys(d: Mapping[str, Any]) -> Dict[str, str]:
    """Return a copy of *d* with normalised keys and string values.

    Keys go through ``_norm_key``; ``None`` values become ``""`` and every
    other value is converted with ``str()``.  If two source keys normalise to
    the same name, the one that appears later in *d* wins.
    """
    norm: Dict[str, str] = {}
    for k, v in d.items():
        norm[_norm_key(k)] = "" if v is None else str(v)
    return norm


def _pick(norm: Dict[str, str], *names: str) -> str:
    """Try each name in order and return the first match, or empty string.

    *norm* is expected to come from ``_normalize_keys``.  Each candidate name
    gets the same normalisation as the keys did; previously only case and
    whitespace were folded, so variants such as ``"tenant_name"`` or
    ``"pass/fail"`` could never match a normalised key.
    """
    for n in names:
        key = _norm_key(n)
        if key in norm:
            return norm[key]
    return ""
"preview", "embed path") - - # Resolve paths - embed_path: Optional[Path] = None - file_name = "" - if evidence_path_str: - ep = Path(evidence_path_str) - if not ep.is_absolute(): - ep = base_dir / ep - file_name = ep.name - if preview_path_str: - pp = Path(preview_path_str) - if not pp.is_absolute(): - pp = base_dir / pp - if pp.exists(): - embed_path = pp - else: - if evidence_path_str: - ep = Path(evidence_path_str) - if not ep.is_absolute(): - ep = base_dir / ep - if ep.exists() and ep.suffix.lower() in {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}: - embed_path = ep - - # Mapping to template placeholders (support a couple of variants) - mapping: Dict[str, str] = { - "UniqueID": unique_id, - "Unique ID": unique_id, - "UserID": unique_id, +# --------------------------------------------------------------------------- +# Placeholder variant expansion +# +# Word occasionally substitutes a Unicode dash for the ASCII hyphen inside +# token names when the template is edited on macOS. We add aliases for all +# four Unicode dash variants so {Sub-Strategy} and {Sub‑Strategy} both work. 
# ---------------------------------------------------------------------------

_UNICODE_DASHES = ["\u2010", "\u2011", "\u2013", "\u2014"]


def _expand_placeholder_variants(mapping: Dict[str, str]) -> None:
    """Add Unicode-dash aliases for every hyphenated token name, in place.

    For each key containing an ASCII hyphen, one alias per entry of
    ``_UNICODE_DASHES`` is registered with the same value.  Keys that are
    already present in *mapping* are never overwritten.  Returns ``None``.
    """
    # Work out every candidate alias first so the dict is not resized while
    # it is being read.
    candidates: Dict[str, str] = {
        name.replace("-", dash): value
        for name, value in mapping.items()
        if "-" in name
        for dash in _UNICODE_DASHES
    }
    for alias, value in candidates.items():
        mapping.setdefault(alias, value)


# ---------------------------------------------------------------------------
# XML substitution engine
#
# Word splits paragraph text across multiple w:r runs for formatting
# reasons, so a single token like {Control_Name} can end up split as
# "{Control_", "Name}". We handle this with two passes:
#
# Pass 1 — single-run substitution: replaces tokens that happen to fall
#           entirely within one w:t node. Fast and covers most cases.
#
# Pass 2 — paragraph merge: concatenates all run text, does the replace,
#           then writes the result back into the first w:t node and
#           blanks the rest. Existing elements are reused rather than
#           new ones created.
#
# JSON in Extract values contains its own braces, so we escape them to
# sentinel strings before substitution and restore them afterwards.
# ---------------------------------------------------------------------------

# A {...} group with no nested braces; group 1 is the text between the braces.
_TOKEN_RE = re.compile(r"\{([^{}]*)\}")
_WS_RE = re.compile(r"\s+")
# Clark-notation name of the xml:space attribute.
_XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"


def _collapse_token_whitespace(text: str) -> str:
    """Strip whitespace inside {...} so mid-word-wrapped tokens still match."""
    def _strip(m: re.Match) -> str:
        return "{" + re.sub(r"\s+", "", m.group(1)) + "}"
    return re.sub(r"\{([^}]*)\}", _strip, text)


def _substitute_tokens(text: str, mapping: Dict[str, str], compact: Dict[str, str]) -> str:
    """Replace every known ``{token}`` in *text*; leave unknown brace groups untouched.

    A token matches if its inner text is a key of *mapping*, or if its inner
    text with all whitespace removed is a key of *compact* (the mapping
    re-keyed with whitespace removed).
    """
    def _swap(m: re.Match) -> str:
        inner = m.group(1)
        if inner in mapping:
            return mapping[inner]
        squeezed = _WS_RE.sub("", inner)
        if squeezed in compact:
            return compact[squeezed]
        return m.group(0)
    return _TOKEN_RE.sub(_swap, text)


def _set_t_text(node, text: str) -> None:
    """Assign *text* to a w:t node, keeping leading/trailing whitespace significant."""
    node.text = text
    if text and text != text.strip():
        # Without xml:space="preserve" consumers may drop edge whitespace.
        node.set(_XML_SPACE_ATTR, "preserve")


def _sub_mapping_in_element(element, mapping: Dict[str, str]) -> None:
    """Substitute ``{token}`` placeholders in every text node under *element*.

    Args:
        element: Root XML element to process (modified in place).
        mapping: Token name -> replacement text.

    Only brace groups that resolve to a key of *mapping* are rewritten, so
    unrelated brace text is left exactly as it was.  Keys that contain spaces
    (``{Control Name}``) match even when the token is split across runs or
    has stray whitespace inside the braces.
    """
    # Whitespace-free view of the keys; the first key seen wins on collision.
    compact: Dict[str, str] = {}
    for key, value in mapping.items():
        compact.setdefault(_WS_RE.sub("", key), value)

    # Pass 1: tokens that sit entirely inside one text node.
    for t in element.iter():
        if t.tag == _W_T and t.text and "{" in t.text:
            new = _substitute_tokens(t.text, mapping, compact)
            if new != t.text:
                _set_t_text(t, new)

    # Pass 2: tokens split across runs within a paragraph.
    for para in element.iter():
        if para.tag != _W_P:
            continue

        # First text node of each direct-child run; runs without text are skipped.
        t_nodes: List[Any] = []
        for run in para:
            if run.tag != _W_R:
                continue
            first_t = next((c for c in run if c.tag == _W_T), None)
            if first_t is not None:
                t_nodes.append(first_t)

        # A lone text node was already fully handled by pass 1.
        if len(t_nodes) < 2:
            continue

        full = "".join((t.text or "") for t in t_nodes)
        if "{" not in full:
            continue

        new = _substitute_tokens(full, mapping, compact)
        if new != full:
            # The merged text takes the first run's formatting; the other
            # runs are kept but emptied.
            _set_t_text(t_nodes[0], new)
            for t in t_nodes[1:]:
                t.text = ""

    # Restore JSON braces that were escaped before substitution
    for t in element.iter():
        if t.tag == _W_T and t.text:
            if "[[LBRACE]]" in t.text or "[[RBRACE]]" in t.text:
                t.text = t.text.replace("[[LBRACE]]", "{").replace("[[RBRACE]]", "}")


def _sub_part(part, mapping: Dict[str, str]) -> None:
    """Run token substitution over the root element of a document part."""
    _sub_mapping_in_element(part.element, mapping)


def _iter_all_hf_parts(doc: "Document"):
    """Yield every header/footer part for all sections and page variants.

    Each distinct part object is yielded once, even when several sections
    share it.  Lookup failures are logged at debug level and skipped.
    """
    attrs = (
        "header", "footer",
        "first_page_header", "first_page_footer",
        "even_page_header", "even_page_footer",
    )
    seen = set()
    for section in doc.sections:
        for attr in attrs:
            try:
                hf = getattr(section, attr, None)
                part = hf.part if hf else None
            except Exception:
                # Best effort: one unreadable header/footer must not abort the report.
                log.debug("Skipping %s of a section: part lookup failed", attr, exc_info=True)
                continue
            if part and id(part) not in seen:
                seen.add(id(part))
                yield part
def _map_summary(s: Mapping[str, Any]) -> Dict[str, str]:
    """Build the summary-section token map from the dataset's ``summary`` dict.

    Every template token is looked up under its own name in both spaced and
    underscored spelling.  ``Executive_Summary`` falls back to the key
    recommendation when no executive summary is supplied, and the risk
    posture is published under two capitalisations because the template
    uses both.
    """
    n = _normalize_keys(s)

    def lookup(token: str) -> str:
        lowered = token.lower()
        return _pick(n, lowered.replace("_", " "), lowered)

    key_rec = lookup("Key_Recommendation")
    posture = lookup("Overall_Risk_Posture")

    tokens: Dict[str, str] = {
        "Executive_Summary": lookup("Executive_Summary") or key_rec,
        "Key_Recommendation": key_rec,
        "Overall_Score": lookup("Overall_Score"),
        "Overall_Risk_Posture": posture,
        "OVERALL_RISK_POSTURE": posture,   # template uses both cases
    }

    for suffix in ("Controls", "Pass", "Fail", "Critical", "High", "Medium", "Low"):
        name = f"Total_{suffix}"
        tokens[name] = lookup(name)

    for idx in range(1, 4):
        name = f"Top_Risk_{idx}"
        tokens[name] = lookup(name)

    for idx in range(1, 6):
        for name in (f"Strength_{idx}", f"Strength_{idx}_Evidence"):
            tokens[name] = lookup(name)

    tokens["Top_Remediation_Action"] = lookup("Top_Remediation_Action")
    return tokens


def _map_evidence_register(ev: list) -> Dict[str, str]:
    """Build Evidence_N_* keys for Appendix A (up to 10 items).

    Args:
        ev: Evidence items (dicts); ``None`` is treated as an empty list.

    Returns:
        A dict with Description/Source/MappedControl/Date tokens for rows
        1-10.  Rows without a corresponding item are present with empty
        strings so unused template rows render blank.

    Besides the original key names, the names emitted by the scan transform
    in generate_report_from_scan.py (``Collection_Method``,
    ``Control_Reference``, ``Date_Collected``) are accepted as fallbacks;
    without them those three columns were always empty for scan-derived
    reports.
    """
    result: Dict[str, str] = {}
    items = list(ev or [])[:10]

    for i, item in enumerate(items, 1):
        n = _normalize_keys(item or {})
        # Each chain keeps the original lookups first; new aliases only apply
        # when those produced nothing.
        result[f"Evidence_{i}_Description"] = (
            _pick(n, "evidence description", "evidence_description")
            or _pick(n, "description")
        )
        result[f"Evidence_{i}_Source"] = (
            _pick(n, "evidence source", "evidence_source")
            or _pick(n, "source")
            or _pick(n, "collection method", "collection_method")
        )
        result[f"Evidence_{i}_MappedControl"] = (
            _pick(n, "mapped control", "mapped_control")
            or _pick(n, "uniqueid", "unique id")
            or _pick(n, "control reference", "control_reference")
            or _pick(n, "control id", "control_id")
        )
        result[f"Evidence_{i}_Date"] = (
            _pick(n, "date captured", "date_captured")
            or _pick(n, "date", "assessment date", "assessment_date")
            or _pick(n, "date collected", "date_collected")
        )

    # Blank out any template rows that have no corresponding data
    for i in range(len(items) + 1, 11):
        for field in ("Description", "Source", "MappedControl", "Date"):
            result.setdefault(f"Evidence_{i}_{field}", "")

    return result


[]))) + + now = datetime.now().strftime("%d %b %Y %H:%M") + # The footer token can appear as '{Date Generated}' or '{Generated Date}'. + # _collapse_token_whitespace strips spaces inside braces, so both the + # spaced and spaceless forms need to be registered. + m["Date Generated"] = now + m["Generated Date"] = now + m["Generated_Date"] = now + m["DateGenerated"] = now + m["GeneratedDate"] = now + + return _sanitise_mapping(m) + + +# --------------------------------------------------------------------------- +# Per-control mapping +# --------------------------------------------------------------------------- + +def _single_control_mapping(ctrl: Mapping[str, Any]) -> Dict[str, str]: + """ + Build the token map for one finding block. + + Extract values often contain JSON with their own braces. We escape those + to sentinel strings here and _sub_mapping_in_element() restores them after + all {Token} replacements are done. + """ + n = _normalize_keys(ctrl) + ss = _pick(n, "sub strategy", "sub_strategy", "substrategy") + + extract_raw = _pick(n, "extract", "evidence extract") + extract_safe = extract_raw.replace("{", "[[LBRACE]]").replace("}", "[[RBRACE]]") + + return { + "Control Name": _pick(n, "control name", "control_name"), + "Control_Name": _pick(n, "control name", "control_name"), + "CIS_Section": _pick(n, "cis section", "cis_section"), + "ISO_Mapping": _pick(n, "iso mapping", "iso_mapping"), + "Strategy": _pick(n, "strategy"), + "Sub-Strategy": ss, + "Sub_Strategy": ss, + "Test_id": _pick(n, "test id", "test_id", "testid"), + "Level": _pick(n, "level"), + "Compliance_Status": _pick(n, "compliance status", "compliance_status"), + "Risk_Rating": _pick(n, "risk rating", "risk_rating"), + "Priority": _pick(n, "priority"), + "Pass/Fail": _pick(n, "pass fail", "pass/fail", "passfail"), + "Description": _pick(n, "description"), + "Observations": _pick(n, "observations", "observation"), + "Justification": _pick(n, "justification"), + "Evidence_Type": _pick(n, "evidence 
def _group_fails_by_severity(controls: list) -> Dict[str, list]:
    """
    Collect every failing control under its severity, keeping input order.

    A control counts as a failure when its pass/fail field upper-cases to
    "FAIL". When that field is empty, the compliance status decides instead:
    any status containing "non" or "partial" is treated as a failure.

    Returns a dict keyed Critical/High/Medium/Low (in that order). Each value
    is a list of shallow dict copies of the matching controls; a list is
    empty when nothing failed at that level. Failing controls whose severity
    is not exactly one of the four keys are left out.
    """
    grouped: Dict[str, list] = {"Critical": [], "High": [], "Medium": [], "Low": []}

    for control in controls:
        fields = _normalize_keys(control)

        verdict = _pick(fields, "pass fail", "pass/fail", "passfail").upper()
        if not verdict:
            # No explicit result recorded; infer it from the compliance status.
            compliance = _pick(fields, "compliance status", "compliance_status").lower()
            if "non" in compliance or "partial" in compliance:
                verdict = "FAIL"

        severity = _pick(fields, "risk rating", "risk_rating", "severity")
        if verdict != "FAIL" or severity not in grouped:
            continue
        grouped[severity].append(dict(control))

    return grouped
def _find_remediation_table(doc: Document) -> Optional[Any]:
    """
    Return the first table that still holds a Remediation_Action_N token.

    Each row is checked on the space-joined text of its cells. Returns None
    when no table in *doc* has such a row.
    """
    token_re = re.compile(r"Remediation_Action_\d+")
    for candidate in doc.tables:
        if any(
            token_re.search(" ".join(cell.text for cell in row.cells))
            for row in candidate.rows
        ):
            return candidate
    return None
not in text + and len(text) < 80): + section7_idx = idx + for sev, sentinel in _BLOCK_SENTINELS.items(): + if sev not in anchors: + has_num = bool(re.search(r"(? tuple: + """Return (start, end) indices for a severity block.""" + start = anchors[sev] + end = hard_stop + sev_idx = order.index(sev) + for ns in order[sev_idx + 1:]: + if ns in anchors and anchors[ns] < end: + end = anchors[ns] + return start, end + + +def _renumber_finding_headings(doc: Document) -> None: + """ + After all blocks are inserted, walk section 6 heading paragraphs and + renumber them sequentially: 6.1, 6.2, 6.3, ... + + A heading is identified by matching '6.N {anything} - (Severity)'. + """ + body = doc.element.body + counter = 0 + for child in body: + if child.tag != _W_P: + continue + text = "".join(t.text for t in child.iter() if t.tag == _W_T and t.text) + # Match heading pattern like "6.1 SomeName - (Critical)" + m = re.match(r"^6\.\d+\s+.+\s+-\s+\((Critical|High|Medium|Low)\)\s*$", text) + if not m: + continue + counter += 1 + new_num = f"6.{counter}" + # Rewrite the number in-place across all nodes in this paragraph. + # The number "6.X" always appears in the first run. + for t_node in child.iter(): + if t_node.tag == _W_T and t_node.text: + replaced = re.sub(r"^6\.\d+", new_num, t_node.text) + if replaced != t_node.text: + t_node.text = replaced + break # Only the first occurrence per paragraph + + +def _substitute_finding_blocks( + doc: Document, + severity_controls: Dict[str, Optional[dict]], + grouped_controls: Optional[Dict[str, list]] = None, +) -> None: + """ + Substitute tokens in each severity block (6.1-6.4), scoped tightly so + one block's data never leaks into another. + + When *grouped_controls* is supplied (all fails per severity), controls + beyond the first are handled by cloning the unfilled template block and + inserting filled duplicates immediately after the original. All heading + section numbers (6.1, 6.2, ...) are then renumbered sequentially. 
+ + The TOC contains the same '6.1', '6.2' etc. strings and appears before + the actual headings in the document body. We skip TOC entries by + requiring both the section number and the severity keyword in the same + element. + + Section 7 is the hard stop — nothing past it gets touched here. + """ + import copy + + body = doc.element.body + order = ["Critical", "High", "Medium", "Low"] + + # Build per-severity control lists. + if grouped_controls is not None: + sev_lists: Dict[str, list] = grouped_controls + else: + sev_lists = { + sev: ([ctrl] if ctrl else []) + for sev, ctrl in (severity_controls or {}).items() + } + + # --- Pass 1: snapshot all unfilled template blocks before any substitution. + children = list(body) + anchors, hard_stop = _find_block_anchors(children, order) + + unfilled: Dict[str, list] = {} + for sev in order: + if sev in anchors: + start, end = _block_slice(anchors, hard_stop, sev, order) + unfilled[sev] = [copy.deepcopy(el) for el in children[start:end]] + + # --- Pass 2: fill the first block for each severity in-place. + for sev in order: + ctrls = sev_lists.get(sev, []) + if not ctrls or sev not in anchors: + continue + start, end = _block_slice(anchors, hard_stop, sev, order) + ctrl_map = _sanitise_mapping(_single_control_mapping(ctrls[0])) + _expand_placeholder_variants(ctrl_map) + for child in children[start:end]: + _sub_mapping_in_element(child, ctrl_map) + + # --- Pass 3: for extra controls (beyond first), clone unfilled snapshot, + # fill, and insert after the (already filled) original block. + # Process severities in REVERSE order so insertions into a later + # severity don't shift the anchors of earlier (higher) severities. + for sev in reversed(order): + ctrls = sev_lists.get(sev, []) + if len(ctrls) <= 1 or sev not in anchors or sev not in unfilled: + continue + + # Re-read children after any previous insertions. 
+ children = list(body) + anchors, hard_stop = _find_block_anchors(children, order) + if sev not in anchors: + continue + _, end = _block_slice(anchors, hard_stop, sev, order) + + # Insert each extra control's block in forward order, each time + # appending after the last inserted block. + # We track the last element inserted so far; new blocks go after it. + # Start: the last element of the original (first-control) block. + start, _ = _block_slice(anchors, hard_stop, sev, order) + last_inserted = children[end - 1] # last element of the filled block + + snapshot = unfilled[sev] + for extra_ctrl in ctrls[1:]: + clones = [copy.deepcopy(el) for el in snapshot] + extra_map = _sanitise_mapping(_single_control_mapping(extra_ctrl)) + _expand_placeholder_variants(extra_map) + for clone in clones: + _sub_mapping_in_element(clone, extra_map) + # Insert clones in document order immediately after last_inserted. + for clone in clones: + last_inserted.addnext(clone) + last_inserted = clone + + # --- Pass 4: renumber all 6.x headings sequentially. 
def _substitute_remediation_rows(doc: Document, rows: list, rem_table) -> None:
    """
    Populate the remediation plan table, one template row per plan item.

    Template rows are the rows of *rem_table* whose cell text still contains
    a Remediation_Action_N token. The k-th template row receives the k-th
    item of *rows* under the token Remediation_Action_k. Template rows left
    over once the items run out get their tokens replaced with empty strings.
    Does nothing when *rem_table* is falsy.
    """
    if not rem_table:
        return

    token_re = re.compile(r"Remediation_Action_\d+")
    template_rows = []
    for candidate in rem_table.rows:
        row_text = " ".join(cell.text for cell in candidate.cells)
        if token_re.search(row_text):
            template_rows.append(candidate)

    for position, (table_row, plan_item) in enumerate(zip(template_rows, rows), 1):
        fields = _normalize_keys(plan_item)
        action = _pick(fields, "remediation action", "action")
        if not action:
            # Fall back to the numbered key form for this row.
            action = _pick(
                fields,
                f"remediation action {position}",
                f"remediation_action_{position}",
            )
        tokens = {
            f"Remediation_Action_{position}": action,
            "Owner": _pick(fields, "owner") or "",
            "Target_Date": _pick(fields, "target date", "target_date") or "",
            "Remediation_Status": _pick(fields, "status", "remediation status", "remediation_status") or "",
        }
        _sub_mapping_in_element(table_row._tr, _sanitise_mapping(tokens))

    # Blank the tokens of template rows that received no plan item.
    supplied = len(rows)
    for position, table_row in enumerate(template_rows[supplied:], supplied + 1):
        blanks = {
            f"Remediation_Action_{position}": "",
            "Owner": "",
            "Target_Date": "",
            "Remediation_Status": "",
        }
        _sub_mapping_in_element(table_row._tr, blanks)
def _substitute_evidence_table(doc: Document, evidence_items: list, ev_table) -> None:
    """
    Populate the Appendix A Evidence Register, one data row per evidence item.

    Each row has its own numbered tokens (Evidence_N_Description,
    Evidence_N_Source, Evidence_N_MappedControl, Evidence_N_Date), so nothing
    here collides with shared token names such as UniqueID or
    Assessment_Date that the global pass fills.

    Rows are addressed by position: the first row after the header takes
    item 1, the next takes item 2, and so on. Data rows beyond the supplied
    items have their tokens replaced with empty strings. Does nothing when
    *ev_table* is falsy.
    """
    if not ev_table:
        return

    # Row 0 is the header; everything after it is a data row.
    body_rows = list(ev_table.rows)[1:]

    for number, (table_row, entry) in enumerate(zip(body_rows, evidence_items), 1):
        fields = _normalize_keys(entry)
        description = (
            _pick(fields, "evidence description", "evidence_description")
            or _pick(fields, "description")
            or ""
        )
        origin = (
            _pick(fields, "evidence source", "evidence_source")
            or _pick(fields, "source")
            or ""
        )
        control_ref = (
            _pick(fields, "mapped control", "mapped_control")
            or _pick(fields, "uniqueid", "unique id")
            or ""
        )
        captured = (
            _pick(fields, "date captured", "date_captured")
            or _pick(fields, "date", "assessment date", "assessment_date")
            or ""
        )
        tokens = {
            f"Evidence_{number}_Description": description,
            f"Evidence_{number}_Source": origin,
            f"Evidence_{number}_MappedControl": control_ref,
            f"Evidence_{number}_Date": captured,
        }
        _sub_mapping_in_element(table_row._tr, _sanitise_mapping(tokens))

    # Blank the tokens of data rows that received no evidence item.
    supplied = len(evidence_items)
    for number, table_row in enumerate(body_rows[supplied:], supplied + 1):
        blanks = {
            f"Evidence_{number}_Description": "",
            f"Evidence_{number}_Source": "",
            f"Evidence_{number}_MappedControl": "",
            f"Evidence_{number}_Date": "",
        }
        _sub_mapping_in_element(table_row._tr, blanks)
def _insert_image_at_marker(
    doc: Document,
    marker: str,
    image_path: os.PathLike | str,
    width_inches: float = 6.0,
) -> bool:
    """
    Replace the first occurrence of *marker* with an embedded image.

    Body paragraphs are searched first, then paragraphs inside table cells
    (tables, rows and cells in document order). The search stops at the
    first paragraph whose joined run text contains the marker. The marker
    text is removed and the picture is appended to that paragraph as a new
    run, *width_inches* wide.

    Returns:
        True when the image was inserted; False when *image_path* does not
        exist or no paragraph contains the marker.
    """
    ip = Path(image_path)
    if not ip.exists():
        return False

    def _candidate_paragraphs():
        # Lazy, so tables are only walked when no body paragraph matches.
        yield from doc.paragraphs
        for tbl in doc.tables:
            for row in tbl.rows:
                for cell in row.cells:
                    yield from cell.paragraphs

    # next() on the generator guarantees the FIRST match wins; a nested-loop
    # `break` would only leave the innermost loop and let later cells
    # overwrite the target.
    target = next(
        (p for p in _candidate_paragraphs()
         if marker in "".join(r.text for r in p.runs)),
        None,
    )
    if target is None:
        return False

    runs = target.runs
    if any(marker in r.text for r in runs):
        for r in runs:
            r.text = r.text.replace(marker, "")
    elif runs:
        # The marker is split across several runs, so a per-run replace would
        # leave it in the output. Collapse the paragraph text into the first
        # run (later runs lose their text, and with it any formatting that
        # differed from the first run) and strip the marker there.
        joined = "".join(r.text for r in runs)
        runs[0].text = joined.replace(marker, "")
        for r in runs[1:]:
            r.text = ""

    target.add_run().add_picture(str(ip), width=Inches(width_inches))
    return True
+# --------------------------------------------------------------------------- + +_FINDING_ROW_LABELS = { + "Description", "Observation", "Justification", + "Evidence Reviewed", "Evidence Explanation", + "Risk / Impact", "Root Cause", "Recommendation", + "Control Reference", "Strategy / Sub-strategy", "Test ID", + "CIS Level", "Compliance Status", "Risk Rating", + "Priority", "Result", "Owner", +} + + +def _inject_evidence_extracts(doc: Document, controls: list) -> None: + """ + For each finding table in section 6, inject a styled evidence extract + paragraph immediately after the table. + + The paragraph shows: + Evidence file: + + + Matching is done by order: finding tables appear in document order, + and failing controls are iterated in the same order they were placed + (Critical → High → Medium → Low, multiple per severity in insertion order). + Only controls with a non-empty Extract are injected. + """ + # Build ordered list of failing controls that have extract data. + order_sev = ["Critical", "High", "Medium", "Low"] + + def _sev_key(ctrl): + sv = _normalize_keys(ctrl).get("risk rating", "") or _normalize_keys(ctrl).get("severity", "") + return order_sev.index(sv) if sv in order_sev else 99 + + def _is_fail(ctrl): + n = _normalize_keys(ctrl) + pf = _pick(n, "pass fail", "pass/fail", "passfail").upper() + if not pf: + status = _pick(n, "compliance status", "compliance_status").lower() + if "non" in status or "partial" in status: + return True + return False + return pf == "FAIL" + + failing = sorted([c for c in controls if _is_fail(c)], key=_sev_key) + + # Identify finding tables in document order. 
+ finding_tables = [] + for table in doc.tables: + left_labels = {row.cells[0].text.strip() for row in table.rows if row.cells} + if len(left_labels & _FINDING_ROW_LABELS) >= 4: + finding_tables.append(table) + + body = doc.element.body + + for table, ctrl in zip(finding_tables, failing): + n = _normalize_keys(ctrl) + extract = _pick(n, "extract", "evidence extract").strip() + filename = _pick(n, "file name", "file_name", "filename").strip() + + if not extract and not filename: + continue + + # Build the paragraph XML inline. + # Style: 9pt Times New Roman, grey label + monospace-ish extract body. + label_line = f"Evidence file: {filename}" if filename else "" + body_line = extract + + ns_w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + ns_w14 = "http://schemas.microsoft.com/office/word/2010/wordml" + + def _make_run(text: str, bold: bool = False, colour: str = "595959") -> "lxml.etree._Element": + r = OxmlElement("w:r") + rPr = OxmlElement("w:rPr") + fonts = OxmlElement("w:rFonts") + fonts.set(qn("w:ascii"), "Courier New") + fonts.set(qn("w:hAnsi"), "Courier New") + fonts.set(qn("w:cs"), "Courier New") + rPr.append(fonts) + if bold: + rPr.append(OxmlElement("w:b")) + col = OxmlElement("w:color") + col.set(qn("w:val"), colour) + rPr.append(col) + sz = OxmlElement("w:sz") + sz.set(qn("w:val"), "18") # 9pt + rPr.append(sz) + szCs = OxmlElement("w:szCs") + szCs.set(qn("w:val"), "18") + rPr.append(szCs) + r.append(rPr) + t = OxmlElement("w:t") + t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve") + t.text = _sanitise(text) + r.append(t) + return r + + def _make_para(*runs) -> "lxml.etree._Element": + p = OxmlElement("w:p") + pPr = OxmlElement("w:pPr") + spacing = OxmlElement("w:spacing") + spacing.set(qn("w:before"), "40") + spacing.set(qn("w:after"), "40") + spacing.set(qn("w:line"), "240") + spacing.set(qn("w:lineRule"), "auto") + pPr.append(spacing) + ind = OxmlElement("w:ind") + ind.set(qn("w:left"), "360") + pPr.append(ind) + shd = 
def _convert_docx_to_pdf(input_docx: Path, output_pdf: Path) -> None:
    """
    Convert *input_docx* to *output_pdf*, trying three backends in order.

    1. docx2pdf (needs Microsoft Word),
    2. LibreOffice in headless mode (``soffice`` on PATH),
    3. the fpdf2 plain-text fallback (_simple_pdf_from_docx; no layout
       fidelity).

    docx2pdf and LibreOffice only count as successful when *output_pdf*
    exists afterwards; otherwise the next backend is tried.

    Raises:
        RuntimeError: if all three backends fail.
    """
    # docx2pdf (needs Microsoft Word)
    try:
        from docx2pdf import convert  # type: ignore

        convert(str(input_docx), str(output_pdf))
        if output_pdf.exists():
            log.info("PDF via docx2pdf: %s", output_pdf)
            return
        log.warning("docx2pdf produced no output file, trying LibreOffice.")
    except ImportError:
        log.debug("docx2pdf not installed, trying LibreOffice.")
    except Exception as exc:
        # docx2pdf surfaces COM/AppleScript errors of many types; any failure
        # here just means "try the next backend".
        log.warning("docx2pdf failed (%s), trying LibreOffice.", exc)

    # LibreOffice headless
    try:
        out_dir = str(output_pdf.parent.resolve())
        subprocess.run(
            ["soffice", "--headless", "--convert-to", "pdf",
             "--outdir", out_dir, str(input_docx.resolve())],
            check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        # soffice names its output after the input file; move it into place
        # when the caller asked for a different name.
        lo_out = output_pdf.parent / (input_docx.stem + ".pdf")
        if lo_out.exists() and lo_out != output_pdf:
            lo_out.replace(output_pdf)
        if output_pdf.exists():
            log.info("PDF via LibreOffice: %s", output_pdf)
            return
        # A zero exit status does not guarantee a file was written.
        log.warning("LibreOffice produced no output file, using plain-text fallback.")
    except FileNotFoundError:
        log.debug("soffice not found, using plain-text fallback.")
    except subprocess.CalledProcessError as exc:
        log.warning("LibreOffice failed (%s), using plain-text fallback.", exc)
    except OSError as exc:
        # e.g. PermissionError when launching soffice or moving its output.
        log.warning("LibreOffice could not be used (%s), using plain-text fallback.", exc)

    # fpdf2 plain-text fallback (no layout fidelity)
    if _simple_pdf_from_docx(input_docx, output_pdf):
        log.warning("PDF via fpdf2 fallback (no formatting): %s", output_pdf)
        return

    raise RuntimeError(
        "PDF conversion failed — install docx2pdf (Word) or LibreOffice, "
        "or ensure fpdf2 is available."
    )
def _resolve_output_stem(mapping: Dict[str, str], override: Optional[str]) -> str:
    """
    Return the output filename stem (no extension) for a full report.

    A non-empty *override* is returned unchanged. Otherwise the stem is
    ``<tenant>_<date>_AutoAudit_Report``, where:

    * tenant is mapping["Tenant_Name"] with spaces turned into underscores,
      or "report" when the value is missing or blank;
    * date is mapping["Assessment_Date"] with spaces removed, or today's
      date formatted as DDMonYYYY when the value is missing or blank.

    Characters that cannot appear in a portable filename (path separators,
    ``< > : " | ? *`` and control characters) are replaced with underscores
    so the stem never escapes the output directory or fails on save.
    """
    if override:
        return override

    def _clean(value: str) -> str:
        return re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", value)

    tenant_raw = mapping.get("Tenant_Name") or ""
    if not tenant_raw.strip():
        tenant_raw = "report"

    date_raw = mapping.get("Assessment_Date") or ""
    if not date_raw.strip():
        date_raw = datetime.now().strftime("%d%b%Y")

    tenant = _clean(tenant_raw.replace(" ", "_"))
    date = _clean(date_raw.replace(" ", ""))
    return f"{tenant}_{date}_AutoAudit_Report"
+ """ + global_mapping = _build_global_mapping(data) + _expand_placeholder_variants(global_mapping) + + severity_controls = _bucket_fails_by_severity(data.get("controls", [])) + grouped_controls = _group_fails_by_severity(data.get("controls", [])) + remediation_rows = data.get("remediation_plan", [])[:8] + all_controls = data.get("controls", []) + + doc = Document(str(template_path)) + + rem_table = _find_remediation_table(doc) + app_b_table = _find_appendix_b_table(doc) + ev_table = _find_evidence_table(doc) + + _substitute_finding_blocks(doc, severity_controls, grouped_controls=grouped_controls) + _substitute_remediation_rows(doc, remediation_rows, rem_table) + _substitute_appendix_b(doc, all_controls, app_b_table) + _substitute_evidence_table(doc, data.get("evidence_register", [])[:10], ev_table) + _substitute_global(doc, global_mapping) + _remove_markers(doc) + _inject_evidence_extracts(doc, all_controls) + _fix_finding_table_widths(doc) + + return doc, global_mapping + + +# --------------------------------------------------------------------------- +# Public API — full report +# --------------------------------------------------------------------------- + +def generate_full_report_docx( + data: Mapping[str, Any], + *, + template_path: os.PathLike | str = "AutoAudit_Report_Template.docx", + output_dir: os.PathLike | str = "reports_out", + output_filename: Optional[str] = None, +) -> Path: + """ + Fill the template with *data* and save as a .docx. + + The output filename defaults to ``__AutoAudit_Report.docx``. + Open the result in Word to check layout before exporting to PDF, or call + generate_full_report_pdf() if you want the PDF directly. + + Returns the path to the generated file. 
def generate_full_report_pdf(
    data: Mapping[str, Any],
    *,
    template_path: os.PathLike | str = "AutoAudit_Report_Template.docx",
    output_dir: os.PathLike | str = "reports_out",
    output_filename: Optional[str] = None,
    keep_docx: bool = False,
) -> Path:
    """
    Fill the template, convert to PDF, and return the PDF path.

    The report is first saved as a .docx in *output_dir* (created if
    missing) and then converted. Set keep_docx=True to keep that
    intermediate .docx alongside the PDF; otherwise it is deleted after a
    successful conversion.
    PDF conversion tries docx2pdf → LibreOffice → fpdf2 in that order.

    Returns the path to the generated PDF.

    Raises:
        FileNotFoundError: if *template_path* does not exist.
    """
    tpath = Path(template_path)
    if not tpath.exists():
        raise FileNotFoundError(f"Template not found: {tpath}")

    outdir = Path(output_dir)
    outdir.mkdir(parents=True, exist_ok=True)

    doc, mapping = _render_report_doc(data, tpath)
    stem = _resolve_output_stem(mapping, output_filename)
    docx_path = outdir / f"{stem}.docx"
    pdf_path = outdir / f"{stem}.pdf"

    doc.save(str(docx_path))
    _convert_docx_to_pdf(docx_path, pdf_path)

    if not keep_docx:
        try:
            docx_path.unlink()
        except OSError as exc:
            # Cleanup is best-effort: the PDF exists, so report the leftover
            # file instead of failing the whole call.
            log.warning("Could not remove intermediate docx %s: %s", docx_path, exc)

    return pdf_path
def _build_single_finding_mapping(
    data: Mapping[str, Any],
    base_dir: Path,
) -> Tuple[Dict[str, str], Optional[Path], str]:
    """
    Build the token map for the legacy single-finding report.

    Relative evidence and preview paths in *data* are resolved against
    *base_dir*.

    Returns a tuple ``(mapping, embed_path, unique_id)``:

    * mapping: token -> value dict, passed through _sanitise_mapping.
      Braces in the extract are escaped to [[LBRACE]] / [[RBRACE]].
    * embed_path: the image to embed, or None. An existing preview file is
      used when a preview path is supplied. When no preview path is
      supplied, the evidence file is used if it exists and has an image
      extension. A supplied preview that does not exist yields None; the
      evidence file is not tried in that case.
    * unique_id: the ID found in *data*, or a freshly generated UUID4.
    """
    n = _normalize_keys(data)

    unique_id = _pick(n, "uniqueid", "unique id", "userid") or str(uuid.uuid4())
    ss = _pick(n, "sub strategy", "sub-strategy")
    extract = _pick(n, "evidence extract", "extract")

    ev_str = _pick(n, "evidence", "evidence path", "file", "file path",
                   "filepath", "image", "screenshot")
    preview_str = _pick(n, "evidence preview", "preview", "embed path")

    def _resolve(raw: str) -> Path:
        candidate = Path(raw)
        return candidate if candidate.is_absolute() else base_dir / candidate

    # Resolve the evidence path once; both the file name and the embed
    # decision below are derived from it.
    evidence_path: Optional[Path] = _resolve(ev_str) if ev_str else None
    file_name: str = evidence_path.name if evidence_path is not None else ""

    embed_path: Optional[Path] = None
    if preview_str:
        preview_path = _resolve(preview_str)
        if preview_path.exists():
            embed_path = preview_path
    elif evidence_path is not None:
        if evidence_path.exists() and evidence_path.suffix.lower() in {
            ".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp",
        }:
            embed_path = evidence_path

    # One timestamp for both footer tokens so they can never disagree
    # (two separate now() calls could straddle midnight).
    generated = datetime.now().strftime("%d %b %Y")

    mapping: Dict[str, str] = {
        "UniqueID": unique_id,
        "Strategy": _pick(n, "strategy"),
        "Test_id": _pick(n, "testid", "test id"),
        "Sub-Strategy": ss,
        "Sub_Strategy": ss,
        "Level": _pick(n, "ml level", "level"),
        "Pass/Fail": _pick(n, "pass fail", "pass/fail"),
        "Priority": _pick(n, "priority"),
        "Recommendations": _pick(n, "recommendation", "recommendations"),
        "Extract": extract.replace("{", "[[LBRACE]]").replace("}", "[[RBRACE]]"),
        "Description": _pick(n, "description"),
        "Confidence": _pick(n, "confidence"),
        "File Name": file_name,
        "Date Generated": generated,
        "Generated Date": generated,
    }
    return _sanitise_mapping(mapping), embed_path, unique_id
def _save_single_finding_docx(
    data: Mapping[str, Any],
    template_path: os.PathLike | str,
    output_dir: os.PathLike | str,
    base_dir: os.PathLike | str,
    image_marker: str,
    unique_id_override: Optional[str],
) -> Path:
    """
    Render one finding and save it as ``<output_dir>/<unique id>.docx``.

    Shared by the docx and PDF entry points so both validate the template and
    create the output directory in exactly the same way.

    Raises FileNotFoundError when the template does not exist.
    """
    tpath = Path(template_path)
    if not tpath.exists():
        raise FileNotFoundError(f"Template not found: {tpath}")

    outdir = Path(output_dir)
    outdir.mkdir(parents=True, exist_ok=True)

    doc, uid = _render_single_finding_doc(
        data, Path(base_dir), tpath, image_marker, unique_id_override
    )
    out = outdir / f"{uid}.docx"
    doc.save(str(out))
    return out


def generate_single_finding_docx(
    data: Mapping[str, Any],
    *,
    template_path: os.PathLike | str = "AutoAudit_Report_Template.docx",
    output_dir: os.PathLike | str = "reports_out",
    base_dir: os.PathLike | str = ".",
    image_marker: str = "[Embed evidence here]",
    unique_id_override: Optional[str] = None,
) -> Path:
    """
    Render a single finding entry as a .docx (legacy / OCR pipeline usage).

    Returns the path to the generated file.
    Raises FileNotFoundError when the template does not exist.
    """
    out = _save_single_finding_docx(
        data, template_path, output_dir, base_dir, image_marker, unique_id_override
    )
    log.info("Single-finding docx: %s", out)
    return out


def generate_single_finding_pdf(
    data: Mapping[str, Any],
    *,
    template_path: os.PathLike | str = "AutoAudit_Report_Template.docx",
    output_dir: os.PathLike | str = "reports_out",
    base_dir: os.PathLike | str = ".",
    image_marker: str = "[Embed evidence here]",
    unique_id_override: Optional[str] = None,
) -> Path:
    """
    Render a single finding entry directly to PDF (legacy usage).

    Prefer generate_single_finding_docx() when you need to review the output first.

    Returns the path to the generated PDF.
    Raises FileNotFoundError when the template does not exist.
    """
    docx_path = _save_single_finding_docx(
        data, template_path, output_dir, base_dir, image_marker, unique_id_override
    )
    # Same directory and base name as the intermediate docx.
    pdf_path = docx_path.with_suffix(".pdf")
    _convert_docx_to_pdf(docx_path, pdf_path)

    # Removing the intermediate docx is best-effort: the PDF already exists,
    # so a failed delete must not fail the call -- but it should be visible.
    try:
        docx_path.unlink()
    except OSError as exc:
        log.warning("Could not remove intermediate docx %s: %s", docx_path, exc)

    return pdf_path
# ---------------------------------------------------------------------------
# Backward-compatibility alias
#
# generate_pdf is imported from this module by
# security/evidence_backend/reportgenerator.py and security/evidence_ui/app.py.
# It forwards to generate_single_finding_pdf, which takes the same arguments
# those callers pass.
# ---------------------------------------------------------------------------

def generate_pdf(
    data: Mapping[str, Any],
    *,
    template_path: os.PathLike | str = "AutoAudit_Report_Template.docx",
    output_dir: os.PathLike | str = "reports_out",
    base_dir: os.PathLike | str = ".",
    image_marker: str = "[Embed evidence here]",
    unique_id_override: Optional[str] = None,
) -> Path:
    """Legacy name for generate_single_finding_pdf; returns the PDF path."""
    # Every option is passed through untouched, by keyword.
    forwarded = {
        "template_path": template_path,
        "output_dir": output_dir,
        "base_dir": base_dir,
        "image_marker": image_marker,
        "unique_id_override": unique_id_override,
    }
    return generate_single_finding_pdf(data, **forwarded)
# ---------------------------------------------------------------------------
# CLI
#
#   python report_service.py [dataset] [template] [output_dir] [--pdf] [--keep-docx]
#   python report_service.py convert <report.docx> [output_dir]
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import json
    import sys

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    args = sys.argv[1:]

    # Subcommand: convert an existing docx to PDF
    if args and args[0] == "convert":
        src = args[1] if len(args) > 1 else None
        outdir = args[2] if len(args) > 2 else None
        if not src:
            print("Usage: python report_service.py convert <report.docx> [output_dir]")
            sys.exit(1)
        pdf = convert_docx_to_pdf(src, outdir)
        print(f"PDF written to: {pdf}")
        sys.exit(0)

    # Default: generate full report.
    # Separate positionals from --flags first, so a flag placed before or
    # between positionals (e.g. `--pdf data.json`) cannot shift them into the
    # wrong slot.
    positionals = [a for a in args if not a.startswith("--")]
    dataset = positionals[0] if len(positionals) > 0 else "fake_dataset.json"
    template = positionals[1] if len(positionals) > 1 else "AutoAudit_Report_Template.docx"
    outdir = positionals[2] if len(positionals) > 2 else "reports_out"
    to_pdf = "--pdf" in args
    keep = "--keep-docx" in args

    # JSON interchange text is UTF-8; do not depend on the platform locale.
    with open(dataset, encoding="utf-8") as f:
        data = json.load(f)

    if to_pdf:
        out = generate_full_report_pdf(data, template_path=template, output_dir=outdir, keep_docx=keep)
        print(f"PDF written to: {out}")
    else:
        out = generate_full_report_docx(data, template_path=template, output_dir=outdir)
        print(f"Docx written to: {out}")
        print(f"Convert to PDF: python {sys.argv[0]} convert \"{out}\"")
"""
run_test.py — smoke test for the AutoAudit report generator

Place this file in the same folder as:

    report_service.py
    AutoAudit_Report_Template.docx
    fake_dataset.json

Usage:

    python run_test.py              # generates a .docx
    python run_test.py --pdf        # converts to PDF as well
    python run_test.py --pdf --keep-docx
"""

import json
import sys
from pathlib import Path

DATASET_PATH = "fake_dataset.json"
TEMPLATE_PATH = "AutoAudit_Report_Template.docx"
OUTPUT_DIR = "reports_out"


def main() -> None:
    """
    Generate a report from the sample dataset and sanity-check the output.

    Reads ``--pdf`` and ``--keep-docx`` from sys.argv. Exits with status 1
    (via sys.exit) when an input file is missing, report_service cannot be
    imported, generation raises, or no output file was written.
    """
    args = sys.argv[1:]
    to_pdf = "--pdf" in args
    keep_docx = "--keep-docx" in args

    # Fail early, listing every missing input rather than just the first.
    missing = [p for p in [DATASET_PATH, TEMPLATE_PATH] if not Path(p).exists()]
    if missing:
        for p in missing:
            print(f"ERROR: {p} not found — check you're running from the right directory.")
        sys.exit(1)

    # JSON interchange text is UTF-8; do not depend on the platform locale.
    with open(DATASET_PATH, encoding="utf-8") as f:
        data = json.load(f)

    # `or {}` also covers an explicit null tenant entry in the dataset.
    tenant = (data.get("tenant") or {}).get("Tenant_Name", "unknown")
    print(f"Dataset : {DATASET_PATH}")
    print(f" Tenant : {tenant}")
    print(f" Controls : {len(data.get('controls', []))}")
    print(f" Evidence items : {len(data.get('evidence_register', []))}")
    print(f" Remediation items : {len(data.get('remediation_plan', []))}")

    # Imported here, after the input checks, so a missing module produces a
    # friendly message instead of a traceback at import time.
    try:
        import report_service as svc
    except ImportError as e:
        print(f"\nERROR: {e}")
        print("report_service.py must be in the same directory.")
        sys.exit(1)

    fmt = "PDF + docx" if to_pdf and keep_docx else "PDF" if to_pdf else "docx"
    print(f"\nTemplate : {TEMPLATE_PATH}")
    print(f"Output : {OUTPUT_DIR}/ ({fmt})")
    print()

    try:
        if to_pdf:
            out = svc.generate_full_report_pdf(
                data,
                template_path=TEMPLATE_PATH,
                output_dir=OUTPUT_DIR,
                keep_docx=keep_docx,
            )
        else:
            out = svc.generate_full_report_docx(
                data,
                template_path=TEMPLATE_PATH,
                output_dir=OUTPUT_DIR,
            )
    except FileNotFoundError as e:
        print(f"ERROR: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of a smoke test: show the full traceback so the
        # failure can be diagnosed, then exit non-zero.
        import traceback
        print(f"ERROR: {e}")
        traceback.print_exc()
        sys.exit(1)

    out = Path(out)
    if not out.exists():
        print("ERROR: generation returned a path but no file was written.")
        sys.exit(1)

    size_kb = out.stat().st_size / 1024
    if size_kb < 1:
        print(f"WARNING: output is only {size_kb:.1f} KB — something may have gone wrong.")

    print(f"✓ {out.resolve()} ({size_kb:.1f} KB)")


if __name__ == "__main__":
    main()