diff --git a/ai-ml/AI001/sample_records.json b/ai-ml/AI001/sample_records.json deleted file mode 100644 index 8be65e2b..00000000 --- a/ai-ml/AI001/sample_records.json +++ /dev/null @@ -1,439 +0,0 @@ -#Weather Data Services (BoM) → Hazard - -[ - { - "hazard_event_id": "HZD_101", - "hazard_type": "bushfire", - "severity_level": "high", - "event_status": "active", - "start_time": "2026-03-21T03:00:00Z", - "latitude": -36.75, - "longitude": 144.28, - "state_region": "Victoria", - "temperature": 37.5, - "rainfall": 0.0, - "wind_speed": 40.2 - }, - { - "hazard_event_id": "HZD_102", - "hazard_type": "flood", - "severity_level": "critical", - "event_status": "active", - "start_time": "2026-03-23T01:30:00Z", - "latitude": -27.47, - "longitude": 153.02, - "state_region": "Queensland", - "temperature": 29.1, - "rainfall": 220.3, - "wind_speed": 18.7 - }, - { - "hazard_event_id": "HZD_103", - "hazard_type": "storm", - "severity_level": "medium", - "event_status": "active", - "start_time": "2026-03-25T06:45:00Z", - "latitude": -33.86, - "longitude": 151.20, - "state_region": "NSW", - "temperature": 25.3, - "rainfall": 60.2, - "wind_speed": 55.1 - }, - { - "hazard_event_id": "HZD_104", - "hazard_type": "cyclone", - "severity_level": "high", - "event_status": "active", - "start_time": "2026-03-26T02:10:00Z", - "latitude": -16.92, - "longitude": 145.77, - "state_region": "Queensland", - "temperature": 30.5, - "rainfall": 180.0, - "wind_speed": 80.3 - }, - { - "hazard_event_id": "HZD_105", - "hazard_type": "heatwave", - "severity_level": "high", - "event_status": "active", - "start_time": "2026-03-27T09:00:00Z", - "latitude": -34.92, - "longitude": 138.60, - "state_region": "South Australia", - "temperature": 42.1, - "rainfall": 0.0, - "wind_speed": 20.4 - } -] - -#FIRMS (NASA) → Bushfire Hazard - -[ - { - "hazard_event_id": "HZD_201", - "hazard_type": "bushfire", - "severity_level": "critical", - "event_status": "active", - "start_time": "2026-03-22T04:10:00Z", - "latitude": 
-37.81, - "longitude": 144.96, - "state_region": "Victoria" - }, - { - "hazard_event_id": "HZD_202", - "hazard_type": "bushfire", - "severity_level": "high", - "event_status": "active", - "start_time": "2026-03-22T05:15:00Z", - "latitude": -35.28, - "longitude": 149.13, - "state_region": "ACT" - }, - { - "hazard_event_id": "HZD_203", - "hazard_type": "bushfire", - "severity_level": "medium", - "event_status": "active", - "start_time": "2026-03-22T06:20:00Z", - "latitude": -31.95, - "longitude": 115.86, - "state_region": "WA" - }, - { - "hazard_event_id": "HZD_204", - "hazard_type": "bushfire", - "severity_level": "high", - "event_status": "active", - "start_time": "2026-03-22T07:30:00Z", - "latitude": -12.46, - "longitude": 130.84, - "state_region": "NT" - }, - { - "hazard_event_id": "HZD_205", - "hazard_type": "bushfire", - "severity_level": "critical", - "event_status": "active", - "start_time": "2026-03-22T08:45:00Z", - "latitude": -42.88, - "longitude": 147.32, - "state_region": "Tasmania" - } -] - -#EM-DAT - -[ -{ -"hazard_event_id":"HZD_301", -"hazard_type":"flood", -"severity_level":"high", -"event_status":"active", -"start_time":"2026-03-10T02:00:00Z", -"latitude":-33.86, -"longitude":151.20, -"state_region":"NSW", -"local_government_area":"Sydney", -"suburb":"Parramatta", -"temperature":24.5, -"rainfall":180.3, -"wind_speed":30.2 -}, -{ -"hazard_event_id":"HZD_302", -"hazard_type":"bushfire", -"severity_level":"critical", -"event_status":"active", -"start_time":"2026-03-11T03:30:00Z", -"latitude":-37.81, -"longitude":144.96, -"state_region":"Victoria", -"local_government_area":"Melbourne", -"suburb":"Dandenong", -"temperature":41.2, -"rainfall":0.0, -"wind_speed":45.1 -}, -{ -"hazard_event_id":"HZD_303", -"hazard_type":"storm", -"severity_level":"medium", -"event_status":"resolved", -"start_time":"2026-03-12T04:00:00Z", -"latitude":-27.47, -"longitude":153.02, -"state_region":"Queensland", -"local_government_area":"Brisbane", -"suburb":"Logan", 
-"temperature":27.1, -"rainfall":90.4, -"wind_speed":50.5 -}, -{ -"hazard_event_id":"HZD_304", -"hazard_type":"cyclone", -"severity_level":"high", -"event_status":"active", -"start_time":"2026-03-13T06:00:00Z", -"latitude":-16.92, -"longitude":145.77, -"state_region":"Queensland", -"local_government_area":"Cairns", -"suburb":"Trinity Beach", -"temperature":30.2, -"rainfall":210.7, -"wind_speed":85.6 -}, -{ -"hazard_event_id":"HZD_305", -"hazard_type":"heatwave", -"severity_level":"high", -"event_status":"active", -"start_time":"2026-03-14T08:00:00Z", -"latitude":-34.92, -"longitude":138.60, -"state_region":"SA", -"local_government_area":"Adelaide", -"suburb":"Glenelg", -"temperature":43.5, -"rainfall":0.0, -"wind_speed":22.4 -} -] - -#weather-API - -[ -{ -"hazard_event_id":"HZD_401", -"hazard_type":"storm", -"severity_level":"medium", -"event_status":"active", -"start_time":"2026-03-20T01:00:00Z", -"latitude":-33.86, -"longitude":151.20, -"state_region":"NSW", -"temperature":25.6, -"rainfall":60.2, -"wind_speed":55.1 -}, -{ -"hazard_event_id":"HZD_402", -"hazard_type":"flood", -"severity_level":"high", -"event_status":"active", -"start_time":"2026-03-20T02:30:00Z", -"latitude":-27.47, -"longitude":153.02, -"state_region":"QLD", -"temperature":28.4, -"rainfall":200.3, -"wind_speed":22.5 -}, -{ -"hazard_event_id":"HZD_403", -"hazard_type":"heatwave", -"severity_level":"high", -"event_status":"active", -"start_time":"2026-03-20T04:00:00Z", -"latitude":-31.95, -"longitude":115.86, -"state_region":"WA", -"temperature":41.8, -"rainfall":0.0, -"wind_speed":18.2 -}, -{ -"hazard_event_id":"HZD_404", -"hazard_type":"storm", -"severity_level":"low", -"event_status":"active", -"start_time":"2026-03-20T05:30:00Z", -"latitude":-42.88, -"longitude":147.32, -"state_region":"Tasmania", -"temperature":19.2, -"rainfall":45.1, -"wind_speed":38.7 -}, -{ -"hazard_event_id":"HZD_405", -"hazard_type":"cyclone", -"severity_level":"critical", -"event_status":"active", 
-"start_time":"2026-03-20T07:00:00Z", -"latitude":-12.46, -"longitude":130.84, -"state_region":"NT", -"temperature":29.5, -"rainfall":250.8, -"wind_speed":95.3 -} -] - -#GitHub Misinformation - -[ -{ -"threat_id":"THR_101", -"threat_type":"misinformation", -"category":"false_alert", -"risk_level":"high", -"confidence_score":0.88, -"detected_at":"2026-03-22T04:20:00Z" -}, -{ -"threat_id":"THR_102", -"threat_type":"misinformation", -"category":"fake_news", -"risk_level":"medium", -"confidence_score":0.76, -"detected_at":"2026-03-22T05:10:00Z" -}, -{ -"threat_id":"THR_103", -"threat_type":"misinformation", -"category":"rumour", -"risk_level":"high", -"confidence_score":0.91, -"detected_at":"2026-03-22T06:00:00Z" -}, -{ -"threat_id":"THR_104", -"threat_type":"misinformation", -"category":"panic_message", -"risk_level":"high", -"confidence_score":0.89, -"detected_at":"2026-03-22T07:15:00Z" -}, -{ -"threat_id":"THR_105", -"threat_type":"misinformation", -"category":"social_media_spread", -"risk_level":"medium", -"confidence_score":0.79, -"detected_at":"2026-03-22T08:30:00Z" -} -] - -#GoMask Fraud - -[ -{ -"threat_id":"THR_301", -"threat_type":"phishing", -"category":"insurance_scam", -"risk_level":"high", -"confidence_score":0.92 -}, -{ -"threat_id":"THR_302", -"threat_type":"phishing", -"category":"donation_scam", -"risk_level":"critical", -"confidence_score":0.95 -}, -{ -"threat_id":"THR_303", -"threat_type":"phishing", -"category":"loan_scam", -"risk_level":"medium", -"confidence_score":0.78 -}, -{ -"threat_id":"THR_304", -"threat_type":"phishing", -"category":"bank_scam", -"risk_level":"high", -"confidence_score":0.90 -}, -{ -"threat_id":"THR_305", -"threat_type":"phishing", -"category":"identity_theft", -"risk_level":"critical", -"confidence_score":0.96 -} -] - -#OpenPhish - -[ -{ -"threat_id":"THR_401", -"threat_type":"phishing", -"category":"email", -"risk_level":"high", -"confidence_score":0.87 -}, -{ -"threat_id":"THR_402", -"threat_type":"phishing", 
-"category":"website", -"risk_level":"critical", -"confidence_score":0.94 -}, -{ -"threat_id":"THR_403", -"threat_type":"phishing", -"category":"sms", -"risk_level":"medium", -"confidence_score":0.75 -}, -{ -"threat_id":"THR_404", -"threat_type":"phishing", -"category":"email", -"risk_level":"medium", -"confidence_score":0.80 -}, -{ -"threat_id":"THR_405", -"threat_type":"phishing", -"category":"website", -"risk_level":"high", -"confidence_score":0.89 -} -] - -#Spamhaus - -[ -{ -"threat_id":"THR_501", -"threat_type":"spam", -"category":"ip_blacklist", -"risk_level":"high", -"confidence_score":0.88 -}, -{ -"threat_id":"THR_502", -"threat_type":"spam", -"category":"botnet", -"risk_level":"critical", -"confidence_score":0.96 -}, -{ -"threat_id":"THR_503", -"threat_type":"spam", -"category":"malware_source", -"risk_level":"high", -"confidence_score":0.91 -}, -{ -"threat_id":"THR_504", -"threat_type":"spam", -"category":"phishing_ip", -"risk_level":"high", -"confidence_score":0.89 -}, -{ -"threat_id":"THR_505", -"threat_type":"spam", -"category":"dns_blocklist", -"risk_level":"medium", -"confidence_score":0.77 -} -] diff --git a/ai-ml/Sample Records/cyber_threat_samples.json b/ai-ml/Sample Records/cyber_threat_samples.json new file mode 100644 index 00000000..1a961281 --- /dev/null +++ b/ai-ml/Sample Records/cyber_threat_samples.json @@ -0,0 +1,60 @@ + +{ + "threat_id": "b2e4d5f6-1111-4c3d-c111-000000000001", + "threat_type": "phishing", + "title": "Fake disaster relief payment email", + "description": "Email campaign impersonating government agencies offering emergency relief payments.", + "risk_level": "critical", + "status": "active", + "category": "cyber", + "confidence_score": 0.92, + "detected_at": "2026-03-21T12:10:00Z", + "source_id": "33333333-cccc-4ccc-dddd-000000000010", + "created_at": "2026-03-21T12:20:00Z", + "updated_at": "2026-03-21T12:20:00Z" +} + +{ + "threat_id": "b2e4d5f6-2222-4c3d-c222-000000000002", + "threat_type": "malware", + "title": "Malicious 
weather alert application", + "description": "Fake mobile application distributing malware under the guise of real-time weather alerts.", + "risk_level": "high", + "status": "active", + "category": "cyber", + "confidence_score": 0.87, + "detected_at": "2026-02-15T08:45:00Z", + "source_id": "33333333-cccc-4ccc-dddd-000000000011", + "created_at": "2026-02-15T09:00:00Z", + "updated_at": "2026-02-15T09:00:00Z" +} + +{ + "threat_id": "b2e4d5f6-3333-4c3d-c333-000000000003", + "threat_type": "misinformation", + "title": "False evacuation alert circulating online", + "description": "Social media posts spreading false evacuation notices during severe storm events.", + "risk_level": "medium", + "status": "monitoring", + "category": "cyber", + "confidence_score": 0.78, + "detected_at": "2026-01-30T16:20:00Z", + "source_id": "33333333-cccc-4ccc-dddd-000000000012", + "created_at": "2026-01-30T16:30:00Z", + "updated_at": "2026-01-30T16:30:00Z" +} + +{ + "threat_id": "b2e4d5f6-4444-4c3d-c444-000000000004", + "threat_type": "data_breach", + "title": "Unauthorized access to emergency response system", + "description": "Multiple unauthorized login attempts detected targeting emergency service infrastructure.", + "risk_level": "high", + "status": "active", + "category": "cyber", + "confidence_score": 0.85, + "detected_at": "2026-03-10T22:15:00Z", + "source_id": "33333333-cccc-4ccc-dddd-000000000013", + "created_at": "2026-03-10T22:30:00Z", + "updated_at": "2026-03-10T22:30:00Z" +} \ No newline at end of file diff --git a/ai-ml/Sample Records/hazard_event_samples.json b/ai-ml/Sample Records/hazard_event_samples.json new file mode 100644 index 00000000..138d623c --- /dev/null +++ b/ai-ml/Sample Records/hazard_event_samples.json @@ -0,0 +1,80 @@ +{ + "hazard_event_id": "a1f3c2d4-1111-4a2b-b111-000000000001", + "hazard_type": "heatwave", + "description": "Extreme heatwave conditions affecting metropolitan communities.", + "severity_level": "high", + "event_status": "active", + 
"start_time": "2026-01-10T09:00:00Z", + "end_time": null, + "source_id": "33333333-cccc-4ccc-dddd-000000000001", + "source_ref_event": "BOM-HEAT-001", + "geo_location_id": "22222222-bbbb-4bbb-cccc-000000000001", + "created_at": "2026-01-10T09:15:00Z", + "updated_at": "2026-01-10T09:15:00Z" +} + +{ + "hazard_event_id": "a1f3c2d4-2222-4a2b-b222-000000000002", + "hazard_type": "storm", + "description": "Severe thunderstorm with heavy rainfall and strong winds.", + "severity_level": "medium", + "event_status": "active", + "start_time": "2026-02-05T14:30:00Z", + "end_time": null, + "source_id": "33333333-cccc-4ccc-dddd-000000000002", + "source_ref_event": "BOM-STORM-002", + "geo_location_id": "22222222-bbbb-4bbb-cccc-000000000002", + "created_at": "2026-02-05T14:45:00Z", + "updated_at": "2026-02-05T14:45:00Z" +} + +{ + "hazard_event_id": "a1f3c2d4-3333-4a2b-b333-000000000003", + "hazard_type": "bushfire", + "description": "Rapidly spreading bushfire threatening rural communities.", + "severity_level": "critical", + "event_status": "active", + "start_time": "2026-03-21T10:30:00Z", + "end_time": null, + "source_id": "33333333-cccc-4ccc-dddd-000000000003", + "source_ref_event": "BOM-FIRE-003", + "geo_location_id": "22222222-bbbb-4bbb-cccc-000000000003", + "created_at": "2026-03-21T11:05:00Z", + "updated_at": "2026-03-21T11:05:00Z" +} + +{ + "hazard_event_id": "a1f3c2d4-4444-4a2b-b444-000000000004", + "hazard_type": "flood", + "description": "River flooding caused by prolonged rainfall across low-lying areas.", + "severity_level": "high", + "event_status": "monitoring", + "start_time": "2026-04-01T06:00:00Z", + "end_time": null, + "source_id": "33333333-cccc-4ccc-dddd-000000000004", + "source_ref_event": "BOM-FLOOD-004", + "geo_location_id": "22222222-bbbb-4bbb-cccc-000000000004", + "created_at": "2026-04-01T06:15:00Z", + "updated_at": "2026-04-01T06:15:00Z" +} + +{ + "hazard_event_id": "a1f3c2d4-5555-4a2b-b555-000000000005", + "hazard_type": "cyclone", + "description": 
"Category 4 cyclone approaching coastal regions with destructive winds.", + "severity_level": "critical", + "event_status": "active", + "start_time": "2026-02-20T03:00:00Z", + "end_time": null, + "source_id": "33333333-cccc-4ccc-dddd-000000000005", + "source_ref_event": "BOM-CYC-005", + "geo_location_id": "22222222-bbbb-4bbb-cccc-000000000005", + "created_at": "2026-02-20T03:10:00Z", + "updated_at": "2026-02-20T03:10:00Z" +} + + + + + + diff --git a/ai-ml/Sample Records/risk_assessment_integration.json b/ai-ml/Sample Records/risk_assessment_integration.json new file mode 100644 index 00000000..7e92260e --- /dev/null +++ b/ai-ml/Sample Records/risk_assessment_integration.json @@ -0,0 +1,64 @@ + +{ + "integration_event_id": "c3f5e6a7-1111-4d4e-d111-000000000001", + "related_hazard_event_id": "a1f3c2d4-3333-4a2b-b333-000000000003", + "related_threat_id": "b2e4d5f6-1111-4c3d-c111-000000000001", + "correlation_score": 0.88, + "linkage_reason": "Phishing campaign detected during active bushfire emergency targeting affected communities.", + "integration_confidence": 0.91, + "linked_event_type": 3, + "event_status": 1, + "event_time": "2026-03-21T12:00:00Z", + "detected_at": "2026-03-21T12:10:00Z", + "reported_at": "2026-03-21T12:20:00Z", + "created_at": "2026-03-21T12:25:00Z", + "updated_at": "2026-03-21T12:25:00Z" +} + +{ + "integration_event_id": "c3f5e6a7-2222-4d4e-d222-000000000002", + "related_hazard_event_id": "a1f3c2d4-2222-4a2b-b222-000000000002", + "related_threat_id": "b2e4d5f6-3333-4c3d-c333-000000000003", + "correlation_score": 0.72, + "linkage_reason": "False evacuation alerts spread during storm warnings causing public confusion.", + "integration_confidence": 0.80, + "linked_event_type": 3, + "event_status": 2, + "event_time": "2026-02-05T15:00:00Z", + "detected_at": "2026-02-05T15:10:00Z", + "reported_at": "2026-02-05T15:25:00Z", + "created_at": "2026-02-05T15:30:00Z", + "updated_at": "2026-02-05T15:30:00Z" +} + +{ + "integration_event_id": 
"c3f5e6a7-3333-4d4e-d333-000000000003", + "related_hazard_event_id": "a1f3c2d4-4444-4a2b-b444-000000000004", + "related_threat_id": null, + "correlation_score": 0.60, + "linkage_reason": "Flood event monitored without associated cyber threat.", + "integration_confidence": 0.70, + "linked_event_type": 1, + "event_status": 2, + "event_time": "2026-04-01T07:00:00Z", + "detected_at": "2026-04-01T07:10:00Z", + "reported_at": "2026-04-01T07:20:00Z", + "created_at": "2026-04-01T07:25:00Z", + "updated_at": "2026-04-01T07:25:00Z" +} + +{ + "integration_event_id": "c3f5e6a7-4444-4d4e-d444-000000000004", + "related_hazard_event_id": null, + "related_threat_id": "b2e4d5f6-2222-4c3d-c222-000000000002", + "correlation_score": 0.65, + "linkage_reason": "Malware campaign detected independently of hazard events.", + "integration_confidence": 0.75, + "linked_event_type": 2, + "event_status": 1, + "event_time": "2026-02-15T09:00:00Z", + "detected_at": "2026-02-15T09:10:00Z", + "reported_at": "2026-02-15T09:20:00Z", + "created_at": "2026-02-15T09:25:00Z", + "updated_at": "2026-02-15T09:25:00Z" +} \ No newline at end of file diff --git a/ai-ml/cleaning/.gitkeep b/ai-ml/cleaning/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/ai-ml/cleaning/README.md b/ai-ml/cleaning/README.md new file mode 100644 index 00000000..a1abb596 --- /dev/null +++ b/ai-ml/cleaning/README.md @@ -0,0 +1,134 @@ +# Data Cleaning Pipeline (`ai-ml/cleaning`) + +This module provides a configurable CSV cleaning + validation pipeline. 
+ +It is designed to: + +- clean raw tabular data (missing values, duplicates, type conversion, string normalization) +- validate cleaned data against rules (required fields, allowed values, ranges, types, date formats) +- generate output artifacts (cleaned CSV, validation report, comparison report, pipeline log) + +## Folder Structure + +- `config/pipeline_config.json` - all pipeline configuration +- `data/input/` - raw CSV input files +- `data/output/` - cleaned CSV output +- `data/reports/` - JSON reports (validation + before/after comparison) +- `data/logs/` - pipeline event log +- `src/` - pipeline implementation + +## What The Pipeline Does + +For each run, the pipeline: + +1. reads input CSV from `paths.input_csv` +2. selects cleaning/validation rules: + +- dataset-specific rules under `datasets.<dataset_name>` when columns match +- otherwise falls back to top-level `cleaning` + `validation` (`generic`) + +3. applies cleaning steps +4. runs validation checks +5. writes outputs to configured paths + +## Run The Pipeline + +From repo root: + +```powershell +python ai-ml/cleaning/src/main.py +``` + +With a custom config: + +```powershell +python ai-ml/cleaning/src/main.py --config ai-ml/cleaning/config/pipeline_config.json +``` + +## Use In Other Python Scripts + +Example integration: + +```python +from pathlib import Path +import sys + +cleaning_root = Path("ai-ml/cleaning").resolve() +if str(cleaning_root) not in sys.path: + sys.path.insert(0, str(cleaning_root)) + +from src.pipeline import run_pipeline + +summary = run_pipeline(cleaning_root / "config" / "pipeline_config.json") +print(summary) +``` + +## How To Modify `pipeline_config.json` + +### 1. Set input/output paths + +Update `paths`: + +- `input_csv` +- `cleaned_csv` +- `validation_report` +- `comparison_report` +- `pipeline_log` + +### 2. Configure generic fallback rules + +Top-level `cleaning` and `validation` are used when no dataset-specific schema matches. + +### 3. 
Add or edit dataset-specific rules + +Under `datasets`, each dataset entry should contain: + +- `cleaning` +- `validation.required_columns` +- `validation.column_rules` + +Minimal pattern: + +```json +"datasets": { + "my_dataset": { + "cleaning": { + "missing_values": { "drop": [], "fill": {} }, + "duplicates": { "subset": [] }, + "type_conversion": { "int": [], "float": [], "datetime": [] }, + "string_standardisation": ["col_a", "col_b"] + }, + "validation": { + "required_columns": ["id"], + "column_rules": { + "id": { "required": true, "type": "int", "unique": true } + } + } + } +} +``` + +## Validation Rules Supported + +Per column (`validation.column_rules.<column_name>`): + +- `required: true|false` +- `type: "int" | "float" | "str" | "date"` +- `unique: true|false` +- `allowed_values: [...]` +- `min` / `max` (numeric) +- `format` (for `type: "date"`) + +## Cleaning Rules Supported + +- `missing_values.drop`: drop rows where these columns are null +- `missing_values.fill`: fill nulls with provided values +- `duplicates.subset`: drop duplicate rows by subset +- `type_conversion.int|float|datetime`: coercive conversion +- `string_standardisation`: trim + normalize configured text columns + +## Notes + +- Input is expected to be CSV. +- Validation `FAIL` means rule violations were found; pipeline still completes and writes reports. +- If a new dataset is not being picked up, verify its `required_columns` match the CSV column names exactly. diff --git a/ai-ml/cleaning/ai003/README.md b/ai-ml/cleaning/ai003/README.md deleted file mode 100644 index 340cca54..00000000 --- a/ai-ml/cleaning/ai003/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# AI003 - Workstream 3 - -This folder contains starter work for AI003 Workstream 3: logging, testing, before-vs-after comparison, and documentation. 
- -## Files -- `test_data.csv` - dummy dataset with common data issues -- `logging_utils.py` - helper functions for logging transformations -- `comparison.py` - compares dataset quality before and after cleaning -- `run_demo.py` - demo script to test logging and comparison flow -- `documentation.md` - documentation notes for AI003 - -## Current Scope -This work is schema-independent and uses dummy CSV data for early development. - -## Covered in Workstream 3 -- logging rows removed -- logging missing values found -- logging simple transformations -- before vs after comparison -- test dataset preparation -- initial documentation \ No newline at end of file diff --git a/ai-ml/cleaning/ai003/__pycache__/comparison.cpython-312.pyc b/ai-ml/cleaning/ai003/__pycache__/comparison.cpython-312.pyc deleted file mode 100644 index da3e8431..00000000 Binary files a/ai-ml/cleaning/ai003/__pycache__/comparison.cpython-312.pyc and /dev/null differ diff --git a/ai-ml/cleaning/ai003/__pycache__/logging_utils.cpython-312.pyc b/ai-ml/cleaning/ai003/__pycache__/logging_utils.cpython-312.pyc deleted file mode 100644 index 8da381fc..00000000 Binary files a/ai-ml/cleaning/ai003/__pycache__/logging_utils.cpython-312.pyc and /dev/null differ diff --git a/ai-ml/cleaning/ai003/cleaned_output.csv b/ai-ml/cleaning/ai003/cleaned_output.csv deleted file mode 100644 index 4df22f1a..00000000 --- a/ai-ml/cleaning/ai003/cleaned_output.csv +++ /dev/null @@ -1,6 +0,0 @@ -id,timestamp,location,event_type,severity,status -1,2026-03-24 10:00:00,Melbourne,phishing,5,open -2,,Sydney,phishing,8,open -3,24/03/2026,melbourne,misinformation,11,closed -4,2026/03/25 09:30,Brisbane,scam,-1,open -5,2026-03-25T12:00:00,,phishing,4, diff --git a/ai-ml/cleaning/ai003/documentation.md b/ai-ml/cleaning/ai003/documentation.md deleted file mode 100644 index 133a4c8e..00000000 --- a/ai-ml/cleaning/ai003/documentation.md +++ /dev/null @@ -1,53 +0,0 @@ -# AI003 Documentation Notes - -## 1. 
Task Overview -Task ID: AI003 -Task Name: Data Cleaning Pipeline Logic -Workstream: 3 - Logging, Testing, and Documentation - -## 2. Objective -Support the reusable data cleaning pipeline by: -- tracking transformations -- preparing test datasets -- comparing before vs after outputs -- documenting cleaning behaviour - -## 3. Input Data Description -Dataset Name: Dummy AI003 test dataset -Source: Synthetic / manually created -Format: CSV -Fields: -- id -- timestamp -- location -- event_type -- severity -- status - -## 4. Identified Data Issues -- Missing timestamp values -- Missing location/status values -- Duplicate rows -- Timestamp inconsistencies -- Categorical inconsistencies (`phishing`, `Phishing`, `phish`) -- Invalid severity values - -## 5. Logging & Traceability -Track: -- rows removed -- nulls found -- category normalisation -- other transformations - -## 6. Before vs After Comparison -Compare: -- row count -- column count -- missing values -- duplicate rows - -## 7. Testing -A dummy CSV dataset is used to simulate common data quality issues. - -## 8. Notes -This work is currently schema-independent and will later integrate with AI001 once the schema is finalised. 
\ No newline at end of file diff --git a/ai-ml/cleaning/ai003/logging_utils.py b/ai-ml/cleaning/ai003/logging_utils.py deleted file mode 100644 index 7fb85d64..00000000 --- a/ai-ml/cleaning/ai003/logging_utils.py +++ /dev/null @@ -1,18 +0,0 @@ -from datetime import datetime - - -def log_message(step: str, details: str) -> str: - timestamp = datetime.now().isoformat() - return f"[{timestamp}] {step}: {details}" - - -def log_rows_removed(count: int) -> str: - return log_message("remove_duplicates", f"rows_removed={count}") - - -def log_nulls_found(count: int) -> str: - return log_message("missing_values", f"null_values_found={count}") - - -def log_other_transformations(details: str) -> str: - return log_message("transformation", details) \ No newline at end of file diff --git a/ai-ml/cleaning/ai003/run_demo.py b/ai-ml/cleaning/ai003/run_demo.py deleted file mode 100644 index ad35ab9e..00000000 --- a/ai-ml/cleaning/ai003/run_demo.py +++ /dev/null @@ -1,39 +0,0 @@ -import pandas as pd -from logging_utils import log_rows_removed, log_nulls_found, log_other_transformations -from comparison import compare_before_after - - -def demo_clean(df: pd.DataFrame) -> pd.DataFrame: - cleaned = df.copy() - - null_count = int(cleaned.isnull().sum().sum()) - print(log_nulls_found(null_count)) - - duplicate_count = int(cleaned.duplicated().sum()) - cleaned = cleaned.drop_duplicates() - print(log_rows_removed(duplicate_count)) - - if "event_type" in cleaned.columns: - cleaned["event_type"] = cleaned["event_type"].replace({ - "Phishing": "phishing", - "phish": "phishing" - }) - print(log_other_transformations("normalised event_type values")) - - return cleaned - - -def main(): - before_df = pd.read_csv("test_data.csv") - after_df = demo_clean(before_df) - - result = compare_before_after(before_df, after_df) - print("\nBefore vs After Summary") - print(result) - - after_df.to_csv("cleaned_output.csv", index=False) - print("\nSaved cleaned_output.csv") - - -if __name__ == "__main__": - 
main() \ No newline at end of file diff --git a/ai-ml/cleaning/ai003/test_data.csv b/ai-ml/cleaning/ai003/test_data.csv deleted file mode 100644 index c881cf0d..00000000 --- a/ai-ml/cleaning/ai003/test_data.csv +++ /dev/null @@ -1,7 +0,0 @@ -id,timestamp,location,event_type,severity,status -1,2026-03-24 10:00:00,Melbourne,phishing,5,open -2,,Sydney,Phishing,8,open -2,,Sydney,Phishing,8,open -3,24/03/2026,melbourne,misinformation,11,closed -4,2026/03/25 09:30,Brisbane,scam,-1,open -5,2026-03-25T12:00:00,,phish,4, \ No newline at end of file diff --git a/ai-ml/cleaning/config/pipeline_config.json b/ai-ml/cleaning/config/pipeline_config.json new file mode 100644 index 00000000..ceafabad --- /dev/null +++ b/ai-ml/cleaning/config/pipeline_config.json @@ -0,0 +1,199 @@ +{ + "dataset_type": "generic", + "paths": { + "input_csv": "data/input/sample_raw.csv", + "cleaned_csv": "data/output/cleaned_data.csv", + "comparison_report": "data/reports/comparison_report.json", + "validation_report": "data/reports/validation_report.json", + "pipeline_log": "data/logs/pipeline.log" + }, + "datasets": { + "hazard_event": { + "cleaning": { + "missing_values": { + "drop": ["hazard_event_id", "hazard_type", "severity_level", "start_time"], + "fill": { + "event_status": "monitoring", + "end_time": null, + "source_ref_event": "unknown", + "description": "" + } + }, + "duplicates": { + "subset": ["hazard_event_id"] + }, + "type_conversion": { + "datetime": ["start_time", "end_time", "created_at", "updated_at"] + }, + "string_standardisation": [ + "hazard_type", + "severity_level", + "event_status" + ] + }, + "validation": { + "required_columns": [ + "hazard_event_id", + "hazard_type", + "severity_level", + "start_time" + ], + "column_rules": { + "hazard_event_id": { + "required": true, + "type": "str", + "unique": true + }, + "hazard_type": { + "required": true, + "type": "str", + "allowed_values": ["flood", "bushfire", "storm", "earthquake", "cyclone"] + }, + "severity_level": { + 
"required": true, + "type": "str", + "allowed_values": ["low", "medium", "high", "critical"] + }, + "event_status": { + "required": false, + "type": "str", + "allowed_values": ["active", "resolved", "monitoring"] + }, + "start_time": { + "required": true, + "type": "date", + "format": "%Y-%m-%dT%H:%M:%S%z" + } + } + } + }, + "cyber_threat": { + "cleaning": { + "missing_values": { + "drop": ["threat_id", "threat_type", "detected_at"], + "fill": { + "status": "monitoring", + "risk_level": "low", + "category": "unknown", + "confidence_score": 0.0, + "description": "", + "title": "untitled" + } + }, + "duplicates": { + "subset": ["threat_id"] + }, + "type_conversion": { + "int": ["threat_id", "source_id"], + "float": ["confidence_score"], + "datetime": ["detected_at", "created_at", "updated_at"] + }, + "string_standardisation": [ + "threat_type", + "risk_level", + "status", + "category" + ] + }, + "validation": { + "required_columns": [ + "threat_id", + "threat_type", + "detected_at" + ], + "column_rules": { + "threat_id": { + "required": true, + "type": "int", + "unique": true + }, + "threat_type": { + "required": true, + "type": "str" + }, + "risk_level": { + "required": false, + "type": "str", + "allowed_values": ["low", "medium", "high", "critical"] + }, + "status": { + "required": false, + "type": "str", + "allowed_values": ["active", "monitoring", "resolved", "archived"] + }, + "confidence_score": { + "required": false, + "type": "float", + "min": 0.0, + "max": 100.0 + }, + "detected_at": { + "required": true, + "type": "date", + "format": "%Y-%m-%dT%H:%M:%S" + } + } + } + } + }, + "cleaning": { + "missing_values": { + "drop": ["event_type", "location", "timestamp"], + "fill": { + "severity": 0, + "cyber_threat_type": "unknown", + "threat_url": "unknown", + "threat_severity": "unknown" + } + }, + "duplicates": { + "subset": ["event_id"] + }, + "type_conversion": { + "int": ["severity"], + "datetime": ["timestamp"] + }, + "string_standardisation": [ + 
"event_type", + "location", + "cyber_threat_type", + "threat_severity" + ] + }, + "validation": { + "required_columns": [ + "event_id", + "event_type", + "location", + "severity", + "timestamp" + ], + "column_rules": { + "event_id": { + "required": true, + "type": "int", + "unique": true + }, + "event_type": { + "required": true, + "type": "str", + "allowed_values": ["Bushfire", "Flood", "Severe thunderstorm", "Earthquake"] + }, + "location": { + "required": true, + "type": "str" + }, + "severity": { + "required": true, + "type": "int", + "min": 0, + "max": 5 + }, + "timestamp": { + "required": true, + "type": "date", + "format": "%Y-%m-%d" + } + } + } +} diff --git a/ai-ml/cleaning/data/input/sample_raw.csv b/ai-ml/cleaning/data/input/sample_raw.csv new file mode 100644 index 00000000..ccf5b920 --- /dev/null +++ b/ai-ml/cleaning/data/input/sample_raw.csv @@ -0,0 +1,28 @@ +event_id,event_type,location,severity,timestamp,cyber_threat_type,threat_url,threat_severity +1,Bushfire,VIC,4,2024-01-15 08:30:00,Phishing,http://fake-relief.com,HIGH +2,FLOOD,nsw,2,15/02/2024,MISINFORMATION,,low +3,bushfire,TAS,5,2024-03-10 14:00:00,,http://scam-bushfire.org, +4,Severe Thunderstorm, ,7,7:45 p.m,phishing, ,High +5,,Unknown Location,3,,Donation Scam,,MEDIUM +3,bushfire,TAS,5,2024-03-10 14:00:00,,http://scam-bushfire.org,low +6,Flood,NSW,-1,2024-02-30 12:00:00,Ransomware,http://malicious-flood-alert.net,CRITICAL +7,Earthquake,VIC,3,2024-04-01 09:15:00,Phishing,http://quake-scam.com,HIGH +8,Bushfire,SA,999,2024-04-05 10:00:00,Misinformation,https://fake-news-fire.org,MEDIUM +9,Flood,WA,2,not_a_date,Donation Scam,http://donate-now-scam.com,LOW +10,Severe thunderstorm,QLD,4,2024/05/10 16:30:00,phishing,http://storm-warning-fake.com,HIGH +11,BUSHFIRE,vic,4,2024-01-15 08:30:00,PHISHING,http://fake-relief.com,HIGH +12,Flood,NSW,,2024-06-01 11:00:00,,, +13,Bushfire,,3,2024-06-02 13:00:00,Phishing,http://phishfire.com,HIGH +14,Flood,NSW,three,2024-06-03 
14:00:00,Misinformation,http://fake-flood-news.com,LOW +15,Severe Thunderstorm,VIC,5,2024-13-01 08:00:00,Phishing,http://bad-date.com,HIGH +16,Bushfire,TAS,2,2024-06-05 18:45:00,SQL Injection,http://attack-vector.com,SEVERE +17,Flood,NSW,4,2024-06-06 20:00:00,XSS,http://xss-scam.com,HIGH +18, Severe Thunderstorm , VIC , 4 , 2024-06-07 21:15:00 , phishing , http://spaces.com , medium +19,Bushfire,VIC,NULL,2024-06-08 10:10:10,Misinformation,http://nullseverity.com,LOW +20,Flood,NSW,5,2024-06-09T12:00:00Z,Phishing,http://iso-date.com,HIGH +21,, , , , , , +22,Bushfire,VIC,4,2024-06-10 09:00:00,Phishing,not_a_url,HIGH +23,Flood,NSW,4,2024-06-10 09:00:00,Phishing,http://duplicate-test.com,HIGH +23,Flood,NSW,4,2024-06-10 09:00:00,Phishing,http://duplicate-test.com,HIGH +24,Bushfire,VIC,5,2024-06-11 08:00:00,Phishing,http://unicode-πŸ”₯.com,HIGH +25,Flood,NSW,0,2024-06-12 11:00:00,unknown,unknown,unknown \ No newline at end of file diff --git a/ai-ml/cleaning/data/output/cleaned_data.csv b/ai-ml/cleaning/data/output/cleaned_data.csv new file mode 100644 index 00000000..59d7a3bf --- /dev/null +++ b/ai-ml/cleaning/data/output/cleaned_data.csv @@ -0,0 +1,19 @@ +event_id,event_type,location,severity,timestamp,cyber_threat_type,threat_url,threat_severity +1,Bushfire,Vic,4.0,2024-01-15 08:30:00+00:00,Phishing,http://fake-relief.com,High +2,Flood,Nsw,2.0,2024-02-15 00:00:00+00:00,Misinformation,unknown,Low +3,Bushfire,Tas,5.0,2024-03-10 14:00:00+00:00,Unknown,http://scam-bushfire.org,Unknown +7,Earthquake,Vic,3.0,2024-04-01 09:15:00+00:00,Phishing,http://quake-scam.com,High +8,Bushfire,Sa,999.0,2024-04-05 10:00:00+00:00,Misinformation,https://fake-news-fire.org,Medium +10,Severe thunderstorm,Qld,4.0,2024-05-10 16:30:00+00:00,Phishing,http://storm-warning-fake.com,High +11,Bushfire,Vic,4.0,2024-01-15 08:30:00+00:00,Phishing,http://fake-relief.com,High +12,Flood,Nsw,0.0,2024-06-01 11:00:00+00:00,Unknown,unknown,Unknown +14,Flood,Nsw,0.0,2024-06-03 
14:00:00+00:00,Misinformation,http://fake-flood-news.com,Low +16,Bushfire,Tas,2.0,2024-06-05 18:45:00+00:00,Sql injection,http://attack-vector.com,Severe +17,Flood,Nsw,4.0,2024-06-06 20:00:00+00:00,Xss,http://xss-scam.com,High +18,Severe thunderstorm,Vic,4.0,2024-06-07 21:15:00+00:00,Phishing, http://spaces.com ,Medium +19,Bushfire,Vic,0.0,2024-06-08 10:10:10+00:00,Misinformation,http://nullseverity.com,Low +20,Flood,Nsw,5.0,2024-06-09 12:00:00+00:00,Phishing,http://iso-date.com,High +22,Bushfire,Vic,4.0,2024-06-10 09:00:00+00:00,Phishing,not_a_url,High +23,Flood,Nsw,4.0,2024-06-10 09:00:00+00:00,Phishing,http://duplicate-test.com,High +24,Bushfire,Vic,5.0,2024-06-11 08:00:00+00:00,Phishing,http://unicode-πŸ”₯.com,High +25,Flood,Nsw,0.0,2024-06-12 11:00:00+00:00,Unknown,unknown,Unknown diff --git a/ai-ml/cleaning/data/reports/comparison_report.json b/ai-ml/cleaning/data/reports/comparison_report.json new file mode 100644 index 00000000..2e9250b3 --- /dev/null +++ b/ai-ml/cleaning/data/reports/comparison_report.json @@ -0,0 +1,14 @@ +{ + "before": { + "rows": 27, + "columns": 8, + "missing_values": 15, + "duplicate_rows": 1 + }, + "after": { + "rows": 18, + "columns": 8, + "missing_values": 0, + "duplicate_rows": 0 + } +} \ No newline at end of file diff --git a/ai-ml/cleaning/data/reports/validation_report.json b/ai-ml/cleaning/data/reports/validation_report.json new file mode 100644 index 00000000..ee82631c --- /dev/null +++ b/ai-ml/cleaning/data/reports/validation_report.json @@ -0,0 +1,21 @@ +{ + "dataset_name": "sample_raw.csv", + "total_rows": 18, + "total_columns": 8, + "checks_run": 7, + "total_issues": 1, + "status": "FAIL", + "issue_summary_by_column": { + "severity": 1 + }, + "issues": [ + { + "row": 8, + "column": "severity", + "rule": "max", + "value": "999.0", + "message": "Value above maximum allowed (5)" + } + ], + "error_rate": 0.05555555555555555 +} \ No newline at end of file diff --git a/ai-ml/cleaning/docs/architecture.md 
b/ai-ml/cleaning/docs/architecture.md new file mode 100644 index 00000000..b95186f2 --- /dev/null +++ b/ai-ml/cleaning/docs/architecture.md @@ -0,0 +1,43 @@ +# AI003 Pipeline Architecture + +The AI003 data cleaning pipeline is organised into three main components: + +- **cleaning**: handles raw data preprocessing and transformation +- **validation**: checks required fields, formats, and data consistency +- **logging**: tracks missing values, duplicate removal, transformations, and output summaries + +## Flow +Raw Input Data -> Cleaning -> Validation -> Logging -> Cleaned Output + +## Current Implementation +The current implementation uses a dummy CSV dataset for testing and demonstrates: +- missing value detection +- duplicate removal +- categorical normalisation +- before vs after comparison +- cleaned output generation + +## Integration +This structure is designed to align with the AI001 schema and support later integration with AI004, AI007, and AI008. + +# AI003 Pipeline Architecture + +The AI003 data cleaning pipeline is organised into three main components: + +- **cleaning**: handles raw data preprocessing and transformation +- **validation**: checks required fields, formats, and data consistency +- **logging**: tracks missing values, duplicate removal, transformations, and output summaries + +## Flow +Raw Input Data -> Cleaning -> Validation -> Logging -> Cleaned Output + +## Current Implementation +The current implementation uses a dummy CSV dataset for testing and demonstrates: +- missing value detection +- duplicate removal +- categorical normalisation +- before vs after comparison +- cleaned output generation + +## Integration +This structure is designed to align with the AI001 schema and support later integration with AI004, AI007, and AI008. 
\ No newline at end of file diff --git a/ai-ml/cleaning/docs/proof-pack.md b/ai-ml/cleaning/docs/proof-pack.md new file mode 100644 index 00000000..c6e4f46c --- /dev/null +++ b/ai-ml/cleaning/docs/proof-pack.md @@ -0,0 +1,31 @@ +# AI003 Proof Pack + +## Test Dataset +A dummy CSV dataset was used with the following intentional issues: +- missing values +- duplicate rows +- inconsistent categorical values + +## Execution Result +The pipeline was executed successfully using `run_demo.py`. + +## Observed Output +Before cleaning: +- Rows: 6 +- Columns: 6 +- Missing values: 4 +- Duplicate rows: 1 + +After cleaning: +- Rows: 5 +- Columns: 6 +- Missing values: 3 +- Duplicate rows: 0 + +## Logged Transformations +- Missing values detected +- Duplicate row removed +- `event_type` values normalised + +## Output File +- `cleaned_output.csv` generated successfully \ No newline at end of file diff --git a/ai-ml/cleaning/docs/usage.md b/ai-ml/cleaning/docs/usage.md new file mode 100644 index 00000000..83f37108 --- /dev/null +++ b/ai-ml/cleaning/docs/usage.md @@ -0,0 +1,24 @@ +# AI003 Usage Guide + +## Purpose +This guide explains how to run the AI003 data cleaning demo pipeline. + +## Steps +1. Open a terminal in the project folder +2. Navigate to: + `ai-ml/cleaning` +3. 
Run: + `py src/main.py` + +## What the script does +- loads the dummy dataset +- detects missing values +- removes duplicate rows +- normalises categorical values +- compares before vs after dataset quality +- generates a cleaned CSV output + +## Output +- console summary +- before vs after comparison report +- cleaned CSV, validation report, and pipeline log at the paths set in `config/pipeline_config.json` \ No newline at end of file diff --git a/ai-ml/cleaning/logging/.gitkeep b/ai-ml/cleaning/logging/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/ai-ml/cleaning/logging/README.md b/ai-ml/cleaning/logging/README.md deleted file mode 100644 index 340cca54..00000000 --- a/ai-ml/cleaning/logging/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# AI003 - Workstream 3 - -This folder contains starter work for AI003 Workstream 3: logging, testing, before-vs-after comparison, and documentation. - -## Files -- `test_data.csv` - dummy dataset with common data issues -- `logging_utils.py` - helper functions for logging transformations -- `comparison.py` - compares dataset quality before and after cleaning -- `run_demo.py` - demo script to test logging and comparison flow -- `documentation.md` - documentation notes for AI003 - -## Current Scope -This work is schema-independent and uses dummy CSV data for early development. 
- -## Covered in Workstream 3 -- logging rows removed -- logging missing values found -- logging simple transformations -- before vs after comparison -- test dataset preparation -- initial documentation \ No newline at end of file diff --git a/ai-ml/cleaning/logging/__pycache__/comparison.cpython-312.pyc b/ai-ml/cleaning/logging/__pycache__/comparison.cpython-312.pyc deleted file mode 100644 index 3206b947..00000000 Binary files a/ai-ml/cleaning/logging/__pycache__/comparison.cpython-312.pyc and /dev/null differ diff --git a/ai-ml/cleaning/logging/__pycache__/logging_utils.cpython-312.pyc b/ai-ml/cleaning/logging/__pycache__/logging_utils.cpython-312.pyc deleted file mode 100644 index 066662fc..00000000 Binary files a/ai-ml/cleaning/logging/__pycache__/logging_utils.cpython-312.pyc and /dev/null differ diff --git a/ai-ml/cleaning/logging/cleaned_output.csv b/ai-ml/cleaning/logging/cleaned_output.csv deleted file mode 100644 index 4df22f1a..00000000 --- a/ai-ml/cleaning/logging/cleaned_output.csv +++ /dev/null @@ -1,6 +0,0 @@ -id,timestamp,location,event_type,severity,status -1,2026-03-24 10:00:00,Melbourne,phishing,5,open -2,,Sydney,phishing,8,open -3,24/03/2026,melbourne,misinformation,11,closed -4,2026/03/25 09:30,Brisbane,scam,-1,open -5,2026-03-25T12:00:00,,phishing,4, diff --git a/ai-ml/cleaning/logging/comparison.py b/ai-ml/cleaning/logging/comparison.py deleted file mode 100644 index 11acce0a..00000000 --- a/ai-ml/cleaning/logging/comparison.py +++ /dev/null @@ -1,17 +0,0 @@ -import pandas as pd - - -def dataset_summary(df: pd.DataFrame) -> dict: - return { - "rows": len(df), - "columns": len(df.columns), - "missing_values": int(df.isnull().sum().sum()), - "duplicate_rows": int(df.duplicated().sum()), - } - - -def compare_before_after(before_df: pd.DataFrame, after_df: pd.DataFrame) -> dict: - return { - "before": dataset_summary(before_df), - "after": dataset_summary(after_df), - } \ No newline at end of file diff --git 
a/ai-ml/cleaning/logging/documentation.md b/ai-ml/cleaning/logging/documentation.md deleted file mode 100644 index 133a4c8e..00000000 --- a/ai-ml/cleaning/logging/documentation.md +++ /dev/null @@ -1,53 +0,0 @@ -# AI003 Documentation Notes - -## 1. Task Overview -Task ID: AI003 -Task Name: Data Cleaning Pipeline Logic -Workstream: 3 - Logging, Testing, and Documentation - -## 2. Objective -Support the reusable data cleaning pipeline by: -- tracking transformations -- preparing test datasets -- comparing before vs after outputs -- documenting cleaning behaviour - -## 3. Input Data Description -Dataset Name: Dummy AI003 test dataset -Source: Synthetic / manually created -Format: CSV -Fields: -- id -- timestamp -- location -- event_type -- severity -- status - -## 4. Identified Data Issues -- Missing timestamp values -- Missing location/status values -- Duplicate rows -- Timestamp inconsistencies -- Categorical inconsistencies (`phishing`, `Phishing`, `phish`) -- Invalid severity values - -## 5. Logging & Traceability -Track: -- rows removed -- nulls found -- category normalisation -- other transformations - -## 6. Before vs After Comparison -Compare: -- row count -- column count -- missing values -- duplicate rows - -## 7. Testing -A dummy CSV dataset is used to simulate common data quality issues. - -## 8. Notes -This work is currently schema-independent and will later integrate with AI001 once the schema is finalised. 
\ No newline at end of file diff --git a/ai-ml/cleaning/logging/logging_utils.py b/ai-ml/cleaning/logging/logging_utils.py deleted file mode 100644 index 7fb85d64..00000000 --- a/ai-ml/cleaning/logging/logging_utils.py +++ /dev/null @@ -1,18 +0,0 @@ -from datetime import datetime - - -def log_message(step: str, details: str) -> str: - timestamp = datetime.now().isoformat() - return f"[{timestamp}] {step}: {details}" - - -def log_rows_removed(count: int) -> str: - return log_message("remove_duplicates", f"rows_removed={count}") - - -def log_nulls_found(count: int) -> str: - return log_message("missing_values", f"null_values_found={count}") - - -def log_other_transformations(details: str) -> str: - return log_message("transformation", details) \ No newline at end of file diff --git a/ai-ml/cleaning/logging/run_demo.py b/ai-ml/cleaning/logging/run_demo.py deleted file mode 100644 index 21aa9eff..00000000 --- a/ai-ml/cleaning/logging/run_demo.py +++ /dev/null @@ -1,38 +0,0 @@ -import pandas as pd -from logging_utils import log_rows_removed, log_nulls_found, log_other_transformations -from comparison import compare_before_after - -def demo_clean(df: pd.DataFrame) -> pd.DataFrame: - cleaned = df.copy() - - null_count = int(cleaned.isnull().sum().sum()) - print(log_nulls_found(null_count)) - - duplicate_count = int(cleaned.duplicated().sum()) - cleaned = cleaned.drop_duplicates() - print(log_rows_removed(duplicate_count)) - - if "event_type" in cleaned.columns: - cleaned["event_type"] = cleaned["event_type"].replace({ - "Phishing": "phishing", - "phish": "phishing" - }) - print(log_other_transformations("normalised event_type values")) - - return cleaned - - -def main(): - before_df = pd.read_csv("test_data.csv") - after_df = demo_clean(before_df) - - result = compare_before_after(before_df, after_df) - print("\nBefore vs After Summary") - print(result) - - after_df.to_csv("cleaned_output.csv", index=False) - print("\nSaved cleaned_output.csv") - - -if __name__ == 
"__main__": - main() \ No newline at end of file diff --git a/ai-ml/cleaning/logging/test_data.csv b/ai-ml/cleaning/logging/test_data.csv deleted file mode 100644 index c881cf0d..00000000 --- a/ai-ml/cleaning/logging/test_data.csv +++ /dev/null @@ -1,7 +0,0 @@ -id,timestamp,location,event_type,severity,status -1,2026-03-24 10:00:00,Melbourne,phishing,5,open -2,,Sydney,Phishing,8,open -2,,Sydney,Phishing,8,open -3,24/03/2026,melbourne,misinformation,11,closed -4,2026/03/25 09:30,Brisbane,scam,-1,open -5,2026-03-25T12:00:00,,phish,4, \ No newline at end of file diff --git a/ai-ml/cleaning/pipeline/.gitkeep b/ai-ml/cleaning/pipeline/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/ai-ml/cleaning/pipeline/cleaned_data.csv b/ai-ml/cleaning/pipeline/cleaned_data.csv deleted file mode 100644 index a573601b..00000000 --- a/ai-ml/cleaning/pipeline/cleaned_data.csv +++ /dev/null @@ -1,4 +0,0 @@ -event_id,event_type,location,severity,timestamp,cyber_threat_type,threat_url,threat_severity -1,Bushfire,Vic,4.0,2024-01-15 08:30:00,Phishing,http://fake-relief.com,High -2,Flood,Nsw,2.0,2024-02-15 00:00:00,Misinformation,unknown,Low -3,Bushfire,Tas,5.0,2024-03-10 14:00:00,Unknown,http://scam-bushfire.org,Unknown diff --git a/ai-ml/cleaning/pipeline/cleaning_pipeline.py b/ai-ml/cleaning/pipeline/cleaning_pipeline.py deleted file mode 100644 index 8e580bd0..00000000 --- a/ai-ml/cleaning/pipeline/cleaning_pipeline.py +++ /dev/null @@ -1,117 +0,0 @@ -import pandas as pd -import numpy as np -import yaml -import sys - -def load_data(file_path): - try: - df = pd.read_csv(file_path) - print(df.head()) - return df - except FileNotFoundError: - print(f"Error: The file '{file_path}' was not found") - sys.exit(1) - except pd.errors.EmptyDataError: - print(f"Error: The file '{file_path}' is empty") - except pd.errors.ParserError as e: - print(f"Error parsing file '{file_path}': {e}") - except Exception as e: - print(f"An unexpected error occurred: {e}") - -def 
load_config(config_path): - try: - with open(config_path, 'r') as file: - data = yaml.safe_load(file) - print("Config file has been successfully loaded") - return data - - except FileNotFoundError: - print(f"File wasnt found: {config_path}") - except yaml.YAMLError as exc: - print(f"Error parsing YAML file: {exc}") - -def handle_missing_values(df, config): - initial_rows = len(df) - columns_to_drop = config['missing_values']['drop'] - columns_to_fill = config['missing_values']['fill'] - - df = df.replace(r'^\s*$', np.nan, regex=True) - - df = df.dropna(subset = columns_to_drop) - df = df.fillna(value=columns_to_fill) - - dropped = initial_rows - len(df) - print(f"[LOG] handle_missing_values: Dropped {dropped} rows containing nulls in critical columns.") - - return df - -def handle_duplicates(df, config): - initial_rows = len(df) - columns_with_duplicates = config['duplicates']['subset'] - - df = df.drop_duplicates(subset=columns_with_duplicates, keep='first') - - removed = initial_rows - len(df) - print(f"[LOG] handle_duplicates: Removed {removed} duplicate rows based on {columns_with_duplicates}.") - return df - -def handle_type_conversion(df, config): - - initial_nulls = df.isnull().sum().sum() - - colulmns_to_int = config['type_conversion']['int'] - - for col in colulmns_to_int: - df[col] = pd.to_numeric(df[col], errors='coerce') - - columns_to_time = config['type_conversion']['datetime'] - for col in columns_to_time: - df[col] = pd.to_datetime(df[col], format='mixed', errors = 'coerce') - - new_nulls = df.isnull().sum().sum() - initial_nulls - if new_nulls > 0: - print(f"[LOG] handle_type_conversion: {new_nulls} invalid values were coerced to NULL/NaT.") - return df - -def handle_string_standardisation(df,config): - columns_to_standardise = config['string_standardisation'] - for col in columns_to_standardise: - df[col] =df[col].str.strip().str.capitalize() - return df - -def verify_clean_data(df): - print("--- Null Values ---") - print(df.isnull().sum()) - 
print("\n--- Data Info ---") - df.info() - print("\n--- Statistics ---") - print(df.describe()) - -def run_pipeline(df,config): - print("Starting pipeline...") - start_rows = len(df) - - df = handle_type_conversion(df, config) - df = handle_missing_values(df, config) - df = handle_duplicates(df, config) - df = handle_string_standardisation(df, config) - - end_rows = len(df) - print("Pipeline sequence complete.") - return df - - -if __name__ == "__main__": - raw_df = load_data('messy_data.csv') - pipeline_config = load_config('config.yaml') - - cleaned_df = run_pipeline(raw_df, pipeline_config) - - verify_clean_data(cleaned_df) - cleaned_df.to_csv('cleaned_data.csv', index=False) - print("Cleaning Complete") - - - - - diff --git a/ai-ml/cleaning/pipeline/config.yaml b/ai-ml/cleaning/pipeline/config.yaml deleted file mode 100644 index c5b757ad..00000000 --- a/ai-ml/cleaning/pipeline/config.yaml +++ /dev/null @@ -1,26 +0,0 @@ -missing_values: - drop: - - event_type - - location - - timestamp - fill: - severity: 0 - cyber_threat_type: "unknown" - threat_url: "unknown" - threat_severity: "unknown" - -duplicates: - subset: - - event_id - -type_conversion: - int: - - severity - datetime: - - timestamp - -string_standardisation: - - event_type - - location - - cyber_threat_type - - threat_severity \ No newline at end of file diff --git a/ai-ml/cleaning/pipeline/messy_data.csv b/ai-ml/cleaning/pipeline/messy_data.csv deleted file mode 100644 index e16b04c4..00000000 --- a/ai-ml/cleaning/pipeline/messy_data.csv +++ /dev/null @@ -1,7 +0,0 @@ -event_id,event_type,location,severity,timestamp,cyber_threat_type,threat_url,threat_severity -1,Bushfire,VIC,4,2024-01-15 08:30:00,Phishing,http://fake-relief.com,HIGH -2,FLOOD,nsw,2,15/02/2024,MISINFORMATION,,low -3,bushfire,TAS,5,2024-03-10 14:00:00,,http://scam-bushfire.org, -4,Severe Thunderstorm, ,,7:45 p.m,phishing, ,High -5,,Unknown Location,3,,Donation Scam,,MEDIUM -3,bushfire,TAS,5,2024-03-10 
14:00:00,,http://scam-bushfire.org,low diff --git a/ai-ml/cleaning/pipeline/messy_data.py b/ai-ml/cleaning/pipeline/messy_data.py deleted file mode 100644 index 75e199c2..00000000 --- a/ai-ml/cleaning/pipeline/messy_data.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -import numpy as np - -data = { - 'event_id': [1, 2, 3, 4, 5, 3], # row 6 is a duplicate of row 3 - 'event_type': ['Bushfire', 'FLOOD', 'bushfire', 'Severe Thunderstorm', None, 'bushfire'], - 'location': ['VIC', 'nsw', 'TAS', ' ', 'Unknown Location', 'TAS'], - 'severity': ['4', '2', '5', None, '3', '5'], # numbers stored as strings - 'timestamp': ['2024-01-15 08:30:00', '15/02/2024', '2024-03-10 14:00:00', '7:45 p.m', None, '2024-03-10 14:00:00'], - 'cyber_threat_type': ['Phishing', 'MISINFORMATION', None, 'phishing', 'Donation Scam', None], - 'threat_url': ['http://fake-relief.com', None, 'http://scam-bushfire.org', ' ', None, 'http://scam-bushfire.org'], - 'threat_severity': ['HIGH', 'low', None, 'High', 'MEDIUM', 'low'], -} - -df = pd.DataFrame(data) -df.to_csv('messy_data.csv', index=False) -print("Before cleaning:") -print(df) -print("\nShape:", df.shape) \ No newline at end of file diff --git a/ai-ml/cleaning/requirements.txt b/ai-ml/cleaning/requirements.txt new file mode 100644 index 00000000..9dff8939 --- /dev/null +++ b/ai-ml/cleaning/requirements.txt @@ -0,0 +1,2 @@ +pandas +numpy diff --git a/ai-ml/cleaning/src/__init__.py b/ai-ml/cleaning/src/__init__.py new file mode 100644 index 00000000..d953db4b --- /dev/null +++ b/ai-ml/cleaning/src/__init__.py @@ -0,0 +1 @@ +# Unified cleaning pipeline package. 
import numpy as np
import pandas as pd

# Maps the configured "case" option to the pandas ``.str`` method applied after
# stripping. Any other value leaves the stripped text's case untouched.
_CASE_METHODS = {
    "lower": "lower",
    "upper": "upper",
    "title": "title",
    "capitalize": "capitalize",
}


def _log(events, step, details):
    """Append one structured entry (``step`` + ``details``) to the ``events`` list."""
    events.append({"step": step, "details": details})


def handle_type_conversion(df, config, events):
    """Coerce the configured columns to numeric / datetime values.

    Args:
        df: Input frame. It is copied, so the caller's frame is not modified.
        config: Mapping with optional ``"int"``, ``"float"`` and ``"datetime"``
            lists of column names. Columns absent from ``df`` are ignored.
        events: List that receives a ``type_conversion`` entry when coercion
            introduced new nulls.

    Returns:
        A new frame with the converted columns. Unparseable values become
        NaN / NaT. ``"int"`` columns go through ``pd.to_numeric`` only; they are
        not cast to an integer dtype here.
    """
    df = df.copy()
    initial_nulls = int(df.isna().sum().sum())

    # "int" and "float" columns share the same coercion step.
    numeric_columns = [*config.get("int", []), *config.get("float", [])]
    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    for col in config.get("datetime", []):
        if col not in df.columns:
            continue
        try:
            df[col] = pd.to_datetime(
                df[col], format="mixed", errors="coerce", utc=True
            )
        except (TypeError, ValueError):
            # Fallback when format="mixed" is rejected -- presumably by older
            # pandas releases; TODO confirm the minimum supported version.
            df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

    new_nulls = int(df.isna().sum().sum()) - initial_nulls
    if new_nulls > 0:
        _log(events, "type_conversion", f"coerced_invalid_to_null={new_nulls}")
    return df


def handle_missing_values(df, config, events):
    """Drop rows missing critical values and fill the remaining gaps.

    Args:
        df: Input frame (not modified; the operations used return new frames).
        config: Mapping with optional ``"drop"`` (columns whose nulls remove the
            row) and ``"fill"`` (column -> replacement value).
        events: List that receives two ``missing_values`` entries: the null
            count and the number of rows dropped.

    Returns:
        The frame with blank cells treated as missing, rows dropped and nulls
        filled.
    """
    initial_rows = len(df)
    columns_to_drop = config.get("drop", [])
    fill_values = config.get("fill", {})

    # Empty / whitespace-only cells are handled as missing below, so normalise
    # them first: counting before this step under-reports the nulls.
    df = df.replace(r"^\s*$", np.nan, regex=True)
    null_count = int(df.isna().sum().sum())
    _log(events, "missing_values", f"null_values_found={null_count}")

    drop_subset = [col for col in columns_to_drop if col in df.columns]
    if drop_subset:
        df = df.dropna(subset=drop_subset)
    if fill_values:
        df = df.fillna(value=fill_values)

    dropped = initial_rows - len(df)
    _log(events, "missing_values", f"rows_dropped={dropped}")
    return df


def handle_duplicates(df, config, events):
    """Remove duplicate rows, keeping the first occurrence.

    Args:
        df: Input frame (not modified).
        config: Mapping with an optional ``"subset"`` list of key columns.
            Columns absent from ``df`` are ignored; with no usable key the
            whole row is compared.
        events: List that receives a ``remove_duplicates`` entry.

    Returns:
        The de-duplicated frame.
    """
    initial_rows = len(df)
    subset = [col for col in config.get("subset", []) if col in df.columns]
    if subset:
        df = df.drop_duplicates(subset=subset, keep="first")
    else:
        df = df.drop_duplicates(keep="first")

    removed = initial_rows - len(df)
    _log(events, "remove_duplicates", f"rows_removed={removed}")
    return df


def handle_string_standardisation(df, columns, events):
    """Strip and case-normalise the configured text columns.

    Args:
        df: Input frame. It is copied, so the caller's frame is not modified.
        columns: Either a list of column names (case mode ``"capitalize"``) or
            a mapping with ``"columns"`` and an optional ``"case"`` of
            ``lower`` / ``upper`` / ``title`` / ``capitalize``. Any other case
            value only strips whitespace.
        events: List that receives a ``transformation`` entry naming the
            columns that were processed.

    Returns:
        A new frame whose processed columns use the pandas ``string`` dtype.
    """
    case_mode = "capitalize"
    if isinstance(columns, dict):
        case_mode = columns.get("case", "capitalize")
        columns = columns.get("columns", [])

    df = df.copy()
    method_name = _CASE_METHODS.get(case_mode)
    transformed = []
    for col in columns:
        if col not in df.columns:
            continue
        series = df[col].astype("string").str.strip()
        if method_name is not None:
            series = getattr(series.str, method_name)()
        df[col] = series
        transformed.append(col)

    _log(
        events,
        "transformation",
        f"standardised_columns={transformed}; case={case_mode}",
    )
    return df


def run_cleaning_pipeline(df, config):
    """Run all cleaning stages in order and collect their log events.

    Order: type conversion -> missing values -> duplicates -> string
    standardisation. Each stage reads its own section of ``config`` and
    tolerates the section being absent.

    Args:
        df: Raw input frame. It is left untouched (the first stage copies it).
        config: Cleaning configuration with optional ``type_conversion``,
            ``missing_values``, ``duplicates`` and ``string_standardisation``
            sections.

    Returns:
        ``(cleaned_frame, events)`` where ``events`` is a list of
        ``{"step": ..., "details": ...}`` dicts in execution order.
    """
    events = []

    cleaned = handle_type_conversion(df, config.get("type_conversion", {}), events)
    cleaned = handle_missing_values(cleaned, config.get("missing_values", {}), events)
    cleaned = handle_duplicates(cleaned, config.get("duplicates", {}), events)
    cleaned = handle_string_standardisation(
        cleaned, config.get("string_standardisation", []), events
    )

    return cleaned, events
from datetime import datetime
from pathlib import Path
from typing import Iterable


def log_message(step: str, details: str) -> str:
    """Build one log line: ``[<local ISO timestamp>] <step>: <details>``."""
    return "[{}] {}: {}".format(datetime.now().isoformat(), step, details)


def log_rows_removed(count: int) -> str:
    """Log line for the duplicate-removal step."""
    return log_message("remove_duplicates", f"rows_removed={count}")


def log_nulls_found(count: int) -> str:
    """Log line for the missing-value scan."""
    return log_message("missing_values", f"null_values_found={count}")


def log_other_transformations(details: str) -> str:
    """Log line for any other transformation, described by ``details``."""
    return log_message("transformation", details)


def format_events(events: Iterable[dict]) -> list[str]:
    """Turn event dicts (keys ``"step"`` and ``"details"``) into log lines, in order."""
    return [log_message(entry["step"], entry["details"]) for entry in events]


def write_log_file(events: Iterable[dict], output_path: str) -> None:
    """Write the formatted events to ``output_path`` as UTF-8, one per line.

    Parent directories are created when missing. Every line ends with a
    newline; an empty event list produces an empty file.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    content = "".join(f"{line}\n" for line in format_events(events))
    target.write_text(content, encoding="utf-8")
import json
from pathlib import Path

import pandas as pd

from .cleaning import run_cleaning_pipeline
from .logging import compare_before_after, write_log_file
from .validation import run_validation_df


def _resolve_paths(base_dir, paths_config):
    """Resolve every configured path against ``base_dir`` and return them as strings."""
    return {
        name: str((base_dir / relative).resolve())
        for name, relative in paths_config.items()
    }


def _write_json(path, data):
    """Write ``data`` as indented UTF-8 JSON to ``path``, creating parent folders."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(data, indent=2), encoding="utf-8")


def _dataset_rules_match_columns(validation_config, columns):
    """Return True when every ``required_columns`` entry is present in ``columns``."""
    needed = set(validation_config.get("required_columns", []))
    return needed <= set(columns)


def _select_rules(config, raw_columns):
    """Pick the cleaning and validation rules to apply to the input columns.

    Resolution order:
      1. the dataset named by ``config["dataset_type"]``, if it exists under
         ``config["datasets"]`` and its required columns are all present;
      2. the first entry of ``config["datasets"]`` whose required columns are
         all present;
      3. the top-level ``cleaning`` / ``validation`` sections, reported as
         ``"generic"``.

    Returns:
        ``(cleaning_config, validation_config, dataset_name)``.
    """
    datasets = config.get("datasets", {})
    requested = config.get("dataset_type", "generic")

    if requested in datasets:
        entry = datasets[requested]
        cleaning_rules, validation_rules = entry["cleaning"], entry["validation"]
        if _dataset_rules_match_columns(validation_rules, raw_columns):
            return cleaning_rules, validation_rules, requested

    # The requested dataset is unknown or does not fit: fall back to detection.
    for candidate_name, candidate in datasets.items():
        candidate_validation = candidate.get("validation", {})
        if not _dataset_rules_match_columns(candidate_validation, raw_columns):
            continue
        return candidate["cleaning"], candidate_validation, candidate_name

    return config["cleaning"], config["validation"], "generic"


def run_pipeline(config_path):
    """Run cleaning, comparison and validation for the CSV named in the config.

    Paths under ``config["paths"]`` are resolved against the directory two
    levels above the config file. The cleaned CSV, the comparison report, the
    validation report and the pipeline log are written to their configured
    locations.

    Args:
        config_path: Path to the pipeline config JSON.

    Returns:
        Summary dict with ``input_rows``, ``output_rows``, ``issues_found``,
        ``status``, ``dataset_type`` and ``outputs`` (the resolved paths).
    """
    config_file = Path(config_path).resolve()
    config = json.loads(config_file.read_text(encoding="utf-8"))

    project_dir = config_file.parent.parent
    paths = _resolve_paths(project_dir, config["paths"])

    raw_df = pd.read_csv(paths["input_csv"])
    cleaning_config, validation_config, effective_dataset_type = _select_rules(
        config, raw_df.columns
    )
    cleaned_df, cleaning_events = run_cleaning_pipeline(raw_df, cleaning_config)
    comparison_report = compare_before_after(raw_df, cleaned_df)
    input_name = Path(paths["input_csv"]).name
    validation_report = run_validation_df(
        cleaned_df, validation_config, dataset_name=input_name
    )

    # Persist every artefact only after all processing steps have succeeded.
    cleaned_csv_path = paths["cleaned_csv"]
    Path(cleaned_csv_path).parent.mkdir(parents=True, exist_ok=True)
    cleaned_df.to_csv(cleaned_csv_path, index=False)
    _write_json(paths["comparison_report"], comparison_report)
    _write_json(paths["validation_report"], validation_report)
    write_log_file(cleaning_events, paths["pipeline_log"])

    summary = {
        "input_rows": int(raw_df.shape[0]),
        "output_rows": int(cleaned_df.shape[0]),
        "issues_found": validation_report["total_issues"],
        "status": validation_report["status"],
        "dataset_type": effective_dataset_type,
        "outputs": paths,
    }
    return summary
--- /dev/null +++ b/ai-ml/cleaning/src/validation/__init__.py @@ -0,0 +1 @@ +from .validation import run_validation, run_validation_df diff --git a/ai-ml/cleaning/validation/src/report_generator.py b/ai-ml/cleaning/src/validation/report_generator.py similarity index 72% rename from ai-ml/cleaning/validation/src/report_generator.py rename to ai-ml/cleaning/src/validation/report_generator.py index 898648f9..711cc8af 100644 --- a/ai-ml/cleaning/validation/src/report_generator.py +++ b/ai-ml/cleaning/src/validation/report_generator.py @@ -3,20 +3,19 @@ def generate_report(df, issues, dataset_name): issues_by_column = defaultdict(int) - for issue in issues: issues_by_column[issue["column"]] += 1 + row_count = int(df.shape[0]) report = { "dataset_name": dataset_name, - "total_rows": int(df.shape[0]), + "total_rows": row_count, "total_columns": int(df.shape[1]), - "checks_run": 6, + "checks_run": 7, "total_issues": len(issues), "status": "PASS" if len(issues) == 0 else "FAIL", "issue_summary_by_column": dict(issues_by_column), "issues": issues, - "error_rate": len(issues) / df.shape[0] + "error_rate": (len(issues) / row_count) if row_count else 0, } - - return report \ No newline at end of file + return report diff --git a/ai-ml/cleaning/validation/src/validation.py b/ai-ml/cleaning/src/validation/validation.py similarity index 59% rename from ai-ml/cleaning/validation/src/validation.py rename to ai-ml/cleaning/src/validation/validation.py index cdd5f833..389dac00 100644 --- a/ai-ml/cleaning/validation/src/validation.py +++ b/ai-ml/cleaning/src/validation/validation.py @@ -1,30 +1,31 @@ import json import os + import pandas as pd -from validators import ( - check_required_columns, - check_missing_values, - check_uniqueness, +from .report_generator import generate_report +from .validators import ( check_allowed_values, - check_range, + check_data_types, check_date_format, - check_data_types + check_missing_values, + check_range, + check_required_columns, + 
check_uniqueness, ) -from report_generator import generate_report def load_rules(rules_path): with open(rules_path, "r", encoding="utf-8") as file: - return json.load(file) + data = json.load(file) + # Supports both legacy rules-only JSON and unified pipeline config JSON. + if "validation" in data and "column_rules" not in data: + return data["validation"] + return data -def run_validation(data_path, rules_path): - df = pd.read_csv(data_path) - rules = load_rules(rules_path) - +def run_validation_df(df, rules, dataset_name="dataset.csv"): issues = [] - issues.extend(check_required_columns(df, rules)) issues.extend(check_missing_values(df, rules)) issues.extend(check_uniqueness(df, rules)) @@ -32,6 +33,10 @@ def run_validation(data_path, rules_path): issues.extend(check_range(df, rules)) issues.extend(check_date_format(df, rules)) issues.extend(check_data_types(df, rules)) + return generate_report(df, issues, dataset_name) - report = generate_report(df, issues, os.path.basename(data_path)) - return report \ No newline at end of file + +def run_validation(data_path, rules_path): + df = pd.read_csv(data_path) + rules = load_rules(rules_path) + return run_validation_df(df, rules, dataset_name=os.path.basename(data_path)) diff --git a/ai-ml/cleaning/validation/src/validators.py b/ai-ml/cleaning/src/validation/validators.py similarity index 73% rename from ai-ml/cleaning/validation/src/validators.py rename to ai-ml/cleaning/src/validation/validators.py index 25c11c61..1e5794f0 100644 --- a/ai-ml/cleaning/validation/src/validators.py +++ b/ai-ml/cleaning/src/validation/validators.py @@ -7,14 +7,13 @@ def make_issue(row, column, rule, value, message): "column": column, "rule": rule, "value": None if pd.isna(value) else str(value), - "message": message + "message": message, } def check_required_columns(df, rules): issues = [] required_columns = rules.get("required_columns", []) - for column in required_columns: if column not in df.columns: issues.append( @@ -23,79 +22,65 @@ 
def check_required_columns(df, rules): column=column, rule="required_column", value=None, - message=f"Missing required column: {column}" + message=f"Missing required column: {column}", ) ) - return issues def check_missing_values(df, rules): issues = [] column_rules = rules.get("column_rules", {}) - for column, rule in column_rules.items(): - if column not in df.columns: + if column not in df.columns or not rule.get("required", False): continue - - if rule.get("required", False): - missing_rows = df[df[column].isna()].index - - for row in missing_rows: - issues.append( - make_issue( - row=row, - column=column, - rule="required_value", - value=None, - message=f"Missing required value in column '{column}'" - ) + for row in df[df[column].isna()].index: + issues.append( + make_issue( + row=row, + column=column, + rule="required_value", + value=None, + message=f"Missing required value in column '{column}'", ) - + ) return issues def check_uniqueness(df, rules): issues = [] column_rules = rules.get("column_rules", {}) - for column, rule in column_rules.items(): - if column not in df.columns: + if column not in df.columns or not rule.get("unique", False): continue - - if rule.get("unique", False): - duplicate_rows = df[df[column].duplicated(keep=False) & df[column].notna()].index - - for row in duplicate_rows: - issues.append( - make_issue( - row=row, - column=column, - rule="unique", - value=df.loc[row, column], - message=f"Duplicate value found in unique column '{column}'" - ) + duplicate_rows = df[df[column].duplicated(keep=False) & df[column].notna()].index + for row in duplicate_rows: + issues.append( + make_issue( + row=row, + column=column, + rule="unique", + value=df.loc[row, column], + message=f"Duplicate value found in unique column '{column}'", ) - + ) return issues def check_allowed_values(df, rules): issues = [] column_rules = rules.get("column_rules", {}) - for column, rule in column_rules.items(): if column not in df.columns: continue - allowed_values = 
rule.get("allowed_values") if not allowed_values: continue - - invalid_rows = df[ - df[column].notna() & ~df[column].isin(allowed_values) - ].index - + allowed_lookup = {str(value).strip().lower() for value in allowed_values} + normalised = df[column].apply( + lambda value: str(value).strip().lower() if pd.notna(value) else value + ) + invalid_rows = df[df[column].notna() & ~normalised.isin(allowed_lookup)].index for row in invalid_rows: issues.append( make_issue( @@ -103,33 +88,26 @@ def check_allowed_values(df, rules): column=column, rule="allowed_values", value=df.loc[row, column], - message=f"Invalid value. Allowed values: {allowed_values}" + message=f"Invalid value. Allowed values: {allowed_values}", ) ) - return issues def check_range(df, rules): issues = [] column_rules = rules.get("column_rules", {}) - for column, rule in column_rules.items(): if column not in df.columns: continue - min_value = rule.get("min") max_value = rule.get("max") - if min_value is None and max_value is None: continue - numeric_series = pd.to_numeric(df[column], errors="coerce") - for row, value in numeric_series.items(): if pd.isna(value): continue - if min_value is not None and value < min_value: issues.append( make_issue( @@ -137,10 +115,9 @@ def check_range(df, rules): column=column, rule="min", value=df.loc[row, column], - message=f"Value below minimum allowed ({min_value})" + message=f"Value below minimum allowed ({min_value})", ) ) - if max_value is not None and value > max_value: issues.append( make_issue( @@ -148,30 +125,22 @@ def check_range(df, rules): column=column, rule="max", value=df.loc[row, column], - message=f"Value above maximum allowed ({max_value})" + message=f"Value above maximum allowed ({max_value})", ) ) - return issues def check_date_format(df, rules): issues = [] column_rules = rules.get("column_rules", {}) - for column, rule in column_rules.items(): - if column not in df.columns: + if column not in df.columns or rule.get("type") != "date": continue - - if 
rule.get("type") != "date": - continue - date_format = rule.get("format", "%Y-%m-%d") - for row, value in df[column].items(): if pd.isna(value): continue - try: pd.to_datetime(value, format=date_format) except (ValueError, TypeError): @@ -181,27 +150,38 @@ def check_date_format(df, rules): column=column, rule="date_format", value=value, - message=f"Invalid date format. Expected format: {date_format}" + message=f"Invalid date format. Expected format: {date_format}", ) ) - return issues def check_data_types(df, rules): issues = [] column_rules = rules.get("column_rules", {}) - for column, rule in column_rules.items(): if column not in df.columns: continue - expected_type = rule.get("type") - if expected_type == "int": + converted = pd.to_numeric(df[column], errors="coerce") + invalid_mask = ( + df[column].notna() & (converted.isna() | (converted % 1 != 0)) + ) + invalid_rows = df[invalid_mask].index + for row in invalid_rows: + issues.append( + make_issue( + row=row, + column=column, + rule="type", + value=df.loc[row, column], + message="Expected integer type", + ) + ) + elif expected_type == "float": converted = pd.to_numeric(df[column], errors="coerce") invalid_rows = df[df[column].notna() & converted.isna()].index - for row in invalid_rows: issues.append( make_issue( @@ -209,13 +189,9 @@ def check_data_types(df, rules): column=column, rule="type", value=df.loc[row, column], - message="Expected integer type" + message="Expected float type", ) ) - - elif expected_type == "date": - continue - elif expected_type == "str": for row, value in df[column].items(): if pd.isna(value): @@ -227,8 +203,7 @@ def check_data_types(df, rules): column=column, rule="type", value=value, - message="Expected string type" + message="Expected string type", ) ) - - return issues \ No newline at end of file + return issues diff --git a/ai-ml/cleaning/validation/README.md b/ai-ml/cleaning/validation/README.md deleted file mode 100644 index 1f0c5879..00000000 --- 
a/ai-ml/cleaning/validation/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# AI003 Workstream 2 - Data Validation and Rules - -## Definition of Clean Data - -Data is considered valid if: - -- All required columns are present -- No required fields contain null values -- Values fall within defined ranges -- Categorical fields match allowed values -- Dates follow the expected format -- Unique fields contain no duplicates - -## Integration Plan (AI001) - -Once schema is available: - -- Replace validation_rules.json with schema-driven rules -- Map schema fields to validation rules -- Ensure compatibility with cleaning pipeline outputs - -## Purpose - -This module validates raw input data against configurable quality rules. - -It checks for: - -- missing required columns -- missing required values -- duplicate values in unique fields -- invalid category values -- out-of-range numeric values -- invalid date formats -- basic type mismatches - -## Structure - -- `data/` -> dummy datasets -- `config/` -> validation rules -- `src/` -> validation scripts -- `output/` -> generated validation reports - -## How to run - -From the validation folder: - -```bash -pip install -r requirements.txt -python src/main.py diff --git a/ai-ml/cleaning/validation/config/validation_rules.json b/ai-ml/cleaning/validation/config/validation_rules.json deleted file mode 100644 index 6beac6c9..00000000 --- a/ai-ml/cleaning/validation/config/validation_rules.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "required_columns": ["event_id", "event_type", "severity", "region", "date", "source"], - "column_rules": { - "event_id": { - "required": true, - "type": "int", - "unique": true - }, - "event_type": { - "required": true, - "type": "str", - "allowed_values": ["bushfire", "flood", "storm", "cyclone"] - }, - "severity": { - "required": true, - "type": "int", - "min": 1, - "max": 5 - }, - "region": { - "required": true, - "type": "str" - }, - "date": { - "required": true, - "type": "date", - "format": "%Y-%m-%d" - 
}, - "source": { - "required": true, - "type": "str", - "allowed_values": ["BoM", "NASA"] - } - } -} \ No newline at end of file diff --git a/ai-ml/cleaning/validation/data/dummy_input.csv b/ai-ml/cleaning/validation/data/dummy_input.csv deleted file mode 100644 index e0b716fb..00000000 --- a/ai-ml/cleaning/validation/data/dummy_input.csv +++ /dev/null @@ -1,5 +0,0 @@ -event_id,event_type,severity,region,date,source -1,bushfire,4,Victoria,2026-03-20,BoM -2,flood,3,NSW,2026-03-21,NASA -3,storm,2,Queensland,2026-03-22,BoM -4,cyclone,5,WA,2026-03-23,NASA \ No newline at end of file diff --git a/ai-ml/cleaning/validation/data/dummy_input_bad.csv b/ai-ml/cleaning/validation/data/dummy_input_bad.csv deleted file mode 100644 index c012f8de..00000000 --- a/ai-ml/cleaning/validation/data/dummy_input_bad.csv +++ /dev/null @@ -1,6 +0,0 @@ -event_id,event_type,severity,region,date,source -1,bushfire,4,Victoria,2026-03-20,BoM -2,,7,NSW,2026/03/21,NASA -2,flood,-1,,not_a_date, -,storm,2,Queensland,2026-03-22,BoM -5,alienstorm,3,Tasmania,2026-03-25,Unknown \ No newline at end of file diff --git a/ai-ml/cleaning/validation/output/validation_report_bad.json b/ai-ml/cleaning/validation/output/validation_report_bad.json deleted file mode 100644 index 92a61ef0..00000000 --- a/ai-ml/cleaning/validation/output/validation_report_bad.json +++ /dev/null @@ -1,103 +0,0 @@ -{ - "dataset_name": "dummy_input_bad.csv", - "total_rows": 5, - "total_columns": 6, - "checks_run": 6, - "total_issues": 12, - "status": "FAIL", - "issue_summary_by_column": { - "event_id": 3, - "event_type": 2, - "region": 1, - "source": 2, - "severity": 2, - "date": 2 - }, - "issues": [ - { - "row": 3, - "column": "event_id", - "rule": "required_value", - "value": null, - "message": "Missing required value in column 'event_id'" - }, - { - "row": 1, - "column": "event_type", - "rule": "required_value", - "value": null, - "message": "Missing required value in column 'event_type'" - }, - { - "row": 2, - "column": "region", 
- "rule": "required_value", - "value": null, - "message": "Missing required value in column 'region'" - }, - { - "row": 2, - "column": "source", - "rule": "required_value", - "value": null, - "message": "Missing required value in column 'source'" - }, - { - "row": 1, - "column": "event_id", - "rule": "unique", - "value": "2.0", - "message": "Duplicate value found in unique column 'event_id'" - }, - { - "row": 2, - "column": "event_id", - "rule": "unique", - "value": "2.0", - "message": "Duplicate value found in unique column 'event_id'" - }, - { - "row": 4, - "column": "event_type", - "rule": "allowed_values", - "value": "alienstorm", - "message": "Invalid value. Allowed values: ['bushfire', 'flood', 'storm', 'cyclone']" - }, - { - "row": 4, - "column": "source", - "rule": "allowed_values", - "value": "Unknown", - "message": "Invalid value. Allowed values: ['BoM', 'NASA']" - }, - { - "row": 1, - "column": "severity", - "rule": "max", - "value": "7", - "message": "Value above maximum allowed (5)" - }, - { - "row": 2, - "column": "severity", - "rule": "min", - "value": "-1", - "message": "Value below minimum allowed (1)" - }, - { - "row": 1, - "column": "date", - "rule": "date_format", - "value": "2026/03/21", - "message": "Invalid date format. Expected format: %Y-%m-%d" - }, - { - "row": 2, - "column": "date", - "rule": "date_format", - "value": "not_a_date", - "message": "Invalid date format. 
Expected format: %Y-%m-%d" - } - ], - "error_rate": 2.4 -} \ No newline at end of file diff --git a/ai-ml/cleaning/validation/output/validation_report_good.json b/ai-ml/cleaning/validation/output/validation_report_good.json deleted file mode 100644 index 8568f7ee..00000000 --- a/ai-ml/cleaning/validation/output/validation_report_good.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "dataset_name": "dummy_input.csv", - "total_rows": 4, - "total_columns": 6, - "checks_run": 6, - "total_issues": 0, - "status": "PASS", - "issue_summary_by_column": {}, - "issues": [] -} \ No newline at end of file diff --git a/ai-ml/cleaning/validation/requirements.txt b/ai-ml/cleaning/validation/requirements.txt deleted file mode 100644 index 1411a4a0..00000000 --- a/ai-ml/cleaning/validation/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pandas \ No newline at end of file diff --git a/ai-ml/cleaning/validation/src/__pycache__/report_generator.cpython-310.pyc b/ai-ml/cleaning/validation/src/__pycache__/report_generator.cpython-310.pyc deleted file mode 100644 index e8cb3279..00000000 Binary files a/ai-ml/cleaning/validation/src/__pycache__/report_generator.cpython-310.pyc and /dev/null differ diff --git a/ai-ml/cleaning/validation/src/__pycache__/validation.cpython-310.pyc b/ai-ml/cleaning/validation/src/__pycache__/validation.cpython-310.pyc deleted file mode 100644 index 364c6d98..00000000 Binary files a/ai-ml/cleaning/validation/src/__pycache__/validation.cpython-310.pyc and /dev/null differ diff --git a/ai-ml/cleaning/validation/src/__pycache__/validators.cpython-310.pyc b/ai-ml/cleaning/validation/src/__pycache__/validators.cpython-310.pyc deleted file mode 100644 index 91fcbdc3..00000000 Binary files a/ai-ml/cleaning/validation/src/__pycache__/validators.cpython-310.pyc and /dev/null differ diff --git a/ai-ml/cleaning/validation/src/main.py b/ai-ml/cleaning/validation/src/main.py deleted file mode 100644 index c3e417eb..00000000 --- a/ai-ml/cleaning/validation/src/main.py +++ /dev/null @@ 
-1,29 +0,0 @@ -import json -import os - -from validation import run_validation - - -def main(): - base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - data_path = os.path.join(base_dir, "data", "dummy_input_bad.csv") - rules_path = os.path.join(base_dir, "config", "validation_rules.json") - output_path = os.path.join(base_dir, "output", "validation_report_bad.json") - - report = run_validation(data_path, rules_path) - - os.makedirs(os.path.dirname(output_path), exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as file: - json.dump(report, file, indent=4) - - print("Validation complete") - print(f"Rows checked: {report['total_rows']}") - print(f"Issues found: {report['total_issues']}") - print(f"Status: {report['status']}") - print(f"Report saved to: {output_path}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/ai-ml/datasets/.gitkeep b/ai-ml/datasets/.gitkeep new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/ai-ml/datasets/.gitkeep @@ -0,0 +1 @@ + diff --git a/ai-ml/docs/AI001 Define Data Schema.pdf b/ai-ml/docs/AI001 Define Data Schema.pdf new file mode 100644 index 00000000..51c8e87f Binary files /dev/null and b/ai-ml/docs/AI001 Define Data Schema.pdf differ diff --git a/ai-ml/docs/AI002 Dataset Model Research.pdf b/ai-ml/docs/AI002 Dataset Model Research.pdf new file mode 100644 index 00000000..7571052f Binary files /dev/null and b/ai-ml/docs/AI002 Dataset Model Research.pdf differ diff --git a/ai-ml/docs/AI003 Cleaning Pipeline.pdf b/ai-ml/docs/AI003 Cleaning Pipeline.pdf new file mode 100644 index 00000000..fbd4b21f Binary files /dev/null and b/ai-ml/docs/AI003 Cleaning Pipeline.pdf differ diff --git a/backend/database/01_schema.sql b/backend/database/01_schema.sql index 99f37d8f..526129be 100644 --- a/backend/database/01_schema.sql +++ b/backend/database/01_schema.sql @@ -1,114 +1,48 @@ /*PROJECT PHOENIX BACKEND DATABASE CREATION SCRIPT Implementing backend structured 
data schema for Project Phoenix + Latest update 16 April 2026 Toby */ CREATE EXTENSION IF NOT EXISTS pgcrypto; - /*GEOLOCATION TABLE */ CREATE TABLE geo_location ( - geo_location_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,country VARCHAR(100) - ,state_region VARCHAR(100) - ,local_government_area VARCHAR(100) - ,suburb VARCHAR(100) - ,latitude DECIMAL(9,6) - ,longitude DECIMAL(9,6) - ,geo_precision VARCHAR(50) + geo_location_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + country VARCHAR(100), + state_region VARCHAR(100), + local_government_area VARCHAR(100), + suburb VARCHAR(100), + latitude DECIMAL(9,6), + longitude DECIMAL(9,6), + geo_precision VARCHAR ); - - /*DATA_SOURCE TABLE Added because hazard_event and cyber_threat reference source_id */ CREATE TABLE data_source ( - source_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,source_name VARCHAR(255) - ,source_type VARCHAR(100) - ,access_method VARCHAR(100) - ,source_url TEXT -); - - -/*LINKED_EVENT_TYPE TABLE - Dimensional table */ -CREATE TABLE linked_event_type ( - linked_event_type_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,linked_event_type_description VARCHAR(100) -); - - -/*EVENT_STATUS TABLE - Dimensional table */ -CREATE TABLE event_status ( - event_status_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,event_status_description VARCHAR(100) -); - - -/*SEASON - Dimensional reference table */ -CREATE TABLE season ( - season_id INT - PRIMARY KEY - ,season_description VARCHAR(50) -); - - -/*REFERENCE_DAY TABLE - Reference table */ -CREATE TABLE reference_day ( - ref_date DATE - ,locale_id INT - ,dow VARCHAR(20) - ,is_weekend BOOLEAN - ,season INT - ,is_holiday BOOLEAN - ,PRIMARY KEY (ref_date, locale_id) - ,FOREIGN KEY (season) REFERENCES season(season_id) -); - - -/*REFERENCE_TIME - Reference table */ -CREATE TABLE reference_time ( - ref_time TIME - PRIMARY KEY - ,is_nighttime BOOLEAN - ,is_business_hours BOOLEAN + source_id UUID PRIMARY KEY DEFAULT gen_random_uuid() + 
,source_name VARCHAR + ,source_type VARCHAR + ,access_method VARCHAR + ,source_url TEXT + ,status VARCHAR + ,fail_reason VARCHAR ); - /*HAZARD_EVENT TABLE Entity table */ CREATE TABLE hazard_event ( - hazard_event_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,hazard_type TEXT - NOT NULL - ,severity_level TEXT - NOT NULL + hazard_event_id UUID PRIMARY KEY DEFAULT gen_random_uuid() + ,hazard_type TEXT NOT NULL + ,severity_level TEXT NOT NULL CHECK (severity_level IN ('low', 'medium', 'high', 'critical')) - ,event_status TEXT - ,start_time TIMESTAMPTZ - ,end_time TIMESTAMPTZ - ,geo_location_id UUID - ,source_id UUID - ,source_ref_event TEXT - ,description TEXT - ,updated_at TIMESTAMPTZ - DEFAULT CURRENT_TIMESTAMP - ,created_at TIMESTAMPTZ - DEFAULT CURRENT_TIMESTAMP - ,FOREIGN KEY (geo_location_id) REFERENCES geo_location(geo_location_id) + ,event_status TEXT + ,start_time TIMESTAMPTZ + ,end_time TIMESTAMPTZ + ,source_id UUID + ,source_ref_event TEXT + ,description TEXT + ,updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ,created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP ,FOREIGN KEY (source_id) REFERENCES data_source(source_id) ); @@ -116,111 +50,115 @@ CREATE TABLE hazard_event ( /*CYBER_THREAT TABLE Entity table */ CREATE TABLE cyber_threat ( - threat_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,threat_type VARCHAR(100) - NOT NULL - ,source_id UUID - ,title VARCHAR(255) - ,description TEXT - ,risk_level VARCHAR(20) + threat_id UUID PRIMARY KEY DEFAULT gen_random_uuid() + ,threat_type VARCHAR NOT NULL + ,source_id UUID + ,title VARCHAR + ,description TEXT + ,risk_level VARCHAR(20) CHECK (risk_level IN ('Low', 'Medium', 'High', 'Critical')) - ,status VARCHAR(20) + ,status VARCHAR(20) CHECK (status IN ('Active', 'Monitoring', 'Resolved', 'Archived')) - ,category VARCHAR(50) - ,confidence_score DECIMAL(5,2) - ,detected_at TIMESTAMPTZ - ,updated_at TIMESTAMPTZ - DEFAULT CURRENT_TIMESTAMP - ,created_at TIMESTAMPTZ - DEFAULT CURRENT_TIMESTAMP + ,category 
VARCHAR + ,confidence_score DECIMAL(5,2) + ,detected_at TIMESTAMPTZ + ,updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP + ,created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP ,FOREIGN KEY (source_id) REFERENCES data_source(source_id) ); - -/*RISK_ASSESSMENT OR INTEGRATION TABLE - Fact table */ -CREATE TABLE risk_assessment ( - integration_event_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,related_hazard_event_id UUID - ,related_threat_id UUID - ,correlation_score REAL - ,linkage_reason TEXT - ,integration_confidence REAL - ,linked_event_type UUID - ,event_status UUID - ,event_time TIMESTAMPTZ - ,detected_at TIMESTAMPTZ - ,reported_at TIMESTAMPTZ - ,created_at TIMESTAMPTZ - DEFAULT CURRENT_TIMESTAMP - ,updated_at TIMESTAMPTZ - DEFAULT CURRENT_TIMESTAMP - ,FOREIGN KEY (related_hazard_event_id) REFERENCES hazard_event(hazard_event_id) - ,FOREIGN KEY (related_threat_id) REFERENCES cyber_threat(threat_id) - ,FOREIGN KEY (linked_event_type) REFERENCES linked_event_type(linked_event_type_id) - ,FOREIGN KEY (event_status) REFERENCES event_status(event_status_id) -); - - /*HAZARD_LOCATION TABLE Joining table */ CREATE TABLE hazard_location ( - hazard_event_id UUID - ,geo_location_id UUID + hazard_event_id UUID + ,geo_location_id UUID ,PRIMARY KEY (hazard_event_id, geo_location_id) ,FOREIGN KEY (hazard_event_id) REFERENCES hazard_event(hazard_event_id) ,FOREIGN KEY (geo_location_id) REFERENCES geo_location(geo_location_id) ); - - -/*THREAT_LOCATION TABLE + + /*THREAT_LOCATION TABLE Joining table */ CREATE TABLE threat_location ( - threat_id UUID - ,geo_location_id UUID + threat_id UUID + ,geo_location_id UUID ,PRIMARY KEY (threat_id, geo_location_id) ,FOREIGN KEY (threat_id) REFERENCES cyber_threat(threat_id) ,FOREIGN KEY (geo_location_id) REFERENCES geo_location(geo_location_id) ); +/*LINKED_EVENT_TYPE TABLE + Dimensional table */ + CREATE TABLE linked_event_type ( + linked_event_type_id SERIAL UNIQUE + ,linked_event_type_description VARCHAR + ); -/*USER TABLE - New 
addition to schema to support front end */ -CREATE TABLE user_account ( - user_id UUID - PRIMARY KEY - DEFAULT gen_random_uuid() - ,password_hashed VARCHAR(255) - ,role VARCHAR(50) -); - - -/*INDEXES - Based on schema documentation */ -CREATE INDEX idx_hazard_type - ON hazard_event(hazard_type); - -CREATE INDEX idx_hazard_severity_level - ON hazard_event(severity_level); - -CREATE INDEX idx_hazard_start_time - ON hazard_event(start_time); - -CREATE INDEX idx_hazard_geo_time - ON hazard_event(geo_location_id, start_time); - -CREATE INDEX idx_threat_type - ON cyber_threat(threat_type); -CREATE INDEX idx_threat_risk_level - ON cyber_threat(risk_level); +/*EVENT_STATUS TABLE + Dimensional table */ +CREATE TABLE event_status( + event_status_id SERIAL UNIQUE + ,event_status_dscription VARCHAR +); -CREATE INDEX idx_threat_detected_at - ON cyber_threat(detected_at); +/*RISK_ASSESSMENT OR INTEGRATION TABLE + Fact table*/ + CREATE TABLE tisk_assessment ( + integration_event_id UUID PRIMARY KEY DEFAULT gen_random_uuid() + ,related_hazard_event_id UUID + ,related_threat_id UUID + ,correlation_score DECIMAL(5,2) + ,linkage_reason TEXT + ,integration_confidence REAL + ,linked_event_type INTEGER + ,event_status INTEGER + --or should this be a simple text field with a CHECK CONSTRAINT as above? 
+ ,event_time TIMESTAMPTZ + ,detected_at TIMESTAMPTZ + ,reported_at TIMESTAMPTZ + ,created_at TIMESTAMPTZ + ,updated_at TIMESTAMPTZ + ,FOREIGN KEY (related_hazard_EVENT_id) REFERENCES hazard_event(hazard_event_id) + ,FOREIGN KEY (related_threat_id) REFERENCES cyber_threat(threat_id) + ,FOREIGN KEY (linked_event_type) REFERENCES linked_event_type(linked_event_type_id) + ,FOREIGN KEY (event_status) REFERENCES event_status(event_status_id) +); -CREATE INDEX idx_threat_status_detected_at - ON cyber_threat(status, detected_at); \ No newline at end of file +/*SEASON + Dimensional reference table */ + CREATE TABLE season( + season_id SERIAL UNIQUE + ,season_description VARCHAR(20) + ); + +/*REFERENCE_DAY TABLE + Reference table */ +CREATE TABLE reference_day( + ref_date DATE + ,locale_id INTEGER + ,dow VARCHAR(20) + ,is_weekend BOOLEAN + ,season INTEGER + ,is_holiday BOOLEAN + ,PRIMARY KEY (ref_date,locale_id) + ,FOREIGN KEY (season) REFERENCES season(season_id) +); + + /*REFERENCE_TIME + Reference table */ + CREATE TABLE reference_time( + ref_time TIME PRIMARY KEY + ,is_nighttime BOOLEAN + ,is_business_hours BOOLEAN +); + + /*USER TABLE + New addition to schema to support front end + Just build the shell, amend and add fields later */ +CREATE TABLE "user"( + userid UUID PRIMARY KEY DEFAULT gen_random_uuid() + ,password_hashed BYTEA + ,role VARCHAR + --add check constraint once values are established + ); \ No newline at end of file diff --git a/backend/database/BE003_Pheoneix_Database_Schema_Documentation_and_ER_Diagram 260416.docx b/backend/database/BE003_Pheoneix_Database_Schema_Documentation_and_ER_Diagram 260416.docx new file mode 100644 index 00000000..d5644eea Binary files /dev/null and b/backend/database/BE003_Pheoneix_Database_Schema_Documentation_and_ER_Diagram 260416.docx differ diff --git a/cyber/Rasanjana CY004 Attack_Scenarios.pdf b/cyber/Rasanjana CY004 Attack_Scenarios.pdf new file mode 100644 index 00000000..ff6bec27 Binary files /dev/null and 
b/cyber/Rasanjana CY004 Attack_Scenarios.pdf differ diff --git a/cyber/Rasanjana CY004 entry-points.pdf b/cyber/Rasanjana CY004 entry-points.pdf new file mode 100644 index 00000000..f81e3c1c Binary files /dev/null and b/cyber/Rasanjana CY004 entry-points.pdf differ diff --git a/cyber/Rasanjana CY005-Apply STRIDE.pdf b/cyber/Rasanjana CY005-Apply STRIDE.pdf new file mode 100644 index 00000000..e14d9890 Binary files /dev/null and b/cyber/Rasanjana CY005-Apply STRIDE.pdf differ diff --git a/cyber/Rasanjana CY005-Map Mitigations.pdf b/cyber/Rasanjana CY005-Map Mitigations.pdf new file mode 100644 index 00000000..d7965f13 Binary files /dev/null and b/cyber/Rasanjana CY005-Map Mitigations.pdf differ diff --git a/cyber/cyber/detection-response/detection_rules.py b/cyber/cyber/detection-response/detection_rules.py new file mode 100644 index 00000000..7a0cf934 --- /dev/null +++ b/cyber/cyber/detection-response/detection_rules.py @@ -0,0 +1,254 @@ +from typing import Optional, Dict, Any + + +def detect_login_attack( + failed_login_attempts: int, + is_unknown_location: bool, + is_unknown_device: bool +) -> Dict[str, Any]: + if failed_login_attempts > 10: + return { + "threat": "Login Attack", + "severity": "Critical", + "action": "lock_account" + } + if failed_login_attempts > 5 or is_unknown_location or is_unknown_device: + return { + "threat": "Login Attack", + "severity": "High", + "action": "trigger_alert" + } + return { + "threat": "Login Attack", + "severity": "Normal", + "action": None + } + + +def detect_api_abuse( + request_count: int, + unauthorized_access: bool, + threshold: int = 100 +) -> Dict[str, Any]: + if unauthorized_access: + return { + "threat": "API Abuse", + "severity": "Critical", + "action": "revoke_api_key" + } + if request_count > threshold: + return { + "threat": "API Abuse", + "severity": "High", + "action": "rate_limit" + } + return { + "threat": "API Abuse", + "severity": "Normal", + "action": None + } + + +def detect_data_breach( + 
records_accessed: int, + is_sensitive_data: bool, + unusual_access: bool, + threshold: int = 500 +) -> Dict[str, Any]: + if is_sensitive_data and unusual_access: + return { + "threat": "Data Breach", + "severity": "Critical", + "action": "restrict_access" + } + if records_accessed > threshold: + return { + "threat": "Data Breach", + "severity": "High", + "action": "trigger_alert" + } + return { + "threat": "Data Breach", + "severity": "Normal", + "action": None + } + + +def detect_phishing_scam( + has_suspicious_link: bool, + has_urgent_language: bool, + untrusted_sender: bool, + reported_by_users: int = 0 +) -> Dict[str, Any]: + if (has_suspicious_link and has_urgent_language) or reported_by_users >= 3: + return { + "threat": "Phishing/Scam", + "severity": "High", + "action": "quarantine_message" + } + if untrusted_sender: + return { + "threat": "Phishing/Scam", + "severity": "Medium", + "action": "flag_sender" + } + return { + "threat": "Phishing/Scam", + "severity": "Normal", + "action": None + } + + +def detect_malware_ransomware( + file_encryption_events: int, + suspicious_process: bool, + unusual_outbound_traffic: bool, + threshold: int = 10 +) -> Dict[str, Any]: + if file_encryption_events > threshold: + return { + "threat": "Ransomware", + "severity": "Critical", + "action": "isolate_system" + } + if suspicious_process and unusual_outbound_traffic: + return { + "threat": "Malware", + "severity": "High", + "action": "stop_process" + } + return { + "threat": "Malware/Ransomware", + "severity": "Normal", + "action": None + } + + +def detect_ddos( + request_rate: int, + response_time_ms: int, + rate_threshold: int = 1000, + response_threshold: int = 3000 +) -> Dict[str, Any]: + if request_rate > rate_threshold and response_time_ms > response_threshold: + return { + "threat": "DDoS", + "severity": "Critical", + "action": "block_ips_and_rate_limit" + } + if request_rate > rate_threshold: + return { + "threat": "DDoS", + "severity": "High", + "action": 
"rate_limit" + } + return { + "threat": "DDoS", + "severity": "Normal", + "action": None + } + + +def detect_iot_attack( + sensor_value: float, + min_expected: float, + max_expected: float, + inconsistent_with_peers: bool +) -> Dict[str, Any]: + if inconsistent_with_peers: + return { + "threat": "IoT Attack", + "severity": "High", + "action": "isolate_device" + } + if sensor_value < min_expected or sensor_value > max_expected: + return { + "threat": "IoT Attack", + "severity": "Medium", + "action": "flag_sensor" + } + return { + "threat": "IoT Attack", + "severity": "Normal", + "action": None + } + + +def detect_misinformation_deepfake( + unverified_source: bool, + conflicts_with_official_info: bool, + rapid_spread: bool, + deepfake_indicator: bool +) -> Dict[str, Any]: + if deepfake_indicator: + return { + "threat": "Deepfake/Misinformation", + "severity": "High", + "action": "restrict_content" + } + if unverified_source and conflicts_with_official_info and rapid_spread: + return { + "threat": "Misinformation", + "severity": "High", + "action": "flag_for_review" + } + if unverified_source: + return { + "threat": "Misinformation", + "severity": "Medium", + "action": "flag_content" + } + return { + "threat": "Misinformation", + "severity": "Normal", + "action": None + } + + +def detect_alert_system_compromise( + authorized_sender: bool, + approval_status: bool, + authentication_status: bool +) -> Dict[str, Any]: + if not authorized_sender or not approval_status or not authentication_status: + return { + "threat": "Alert System Compromise", + "severity": "Critical", + "action": "disable_alert_dispatch" + } + return { + "threat": "Alert System Compromise", + "severity": "Normal", + "action": None + } + + +def detect_fraudulent_claim( + duplicate_submission: bool, + fake_document_detected: bool, + identity_mismatch: bool +) -> Dict[str, Any]: + if identity_mismatch: + return { + "threat": "Fraudulent Claim", + "severity": "High", + "action": "reject_submission" + } + 
if duplicate_submission or fake_document_detected: + return { + "threat": "Fraudulent Claim", + "severity": "Medium", + "action": "send_for_manual_review" + } + return { + "threat": "Fraudulent Claim", + "severity": "Normal", + "action": None + } + + +if __name__ == "__main__": + print(detect_login_attack(12, False, False)) + print(detect_api_abuse(120, False)) + print(detect_iot_attack(98.0, 10.0, 60.0, False)) + print(detect_alert_system_compromise(False, True, True)) diff --git a/cyber/detection-response/Detection_Monitoring_Response_Framework_.pdf b/cyber/detection-response/Detection_Monitoring_Response_Framework_.pdf new file mode 100644 index 00000000..ed1c805a Binary files /dev/null and b/cyber/detection-response/Detection_Monitoring_Response_Framework_.pdf differ diff --git a/cyber/detection-response/README.md b/cyber/detection-response/README.md new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/cyber/detection-response/README.md @@ -0,0 +1 @@ + diff --git a/cyber/detection-response/implementation.md b/cyber/detection-response/implementation.md new file mode 100644 index 00000000..0c662e75 --- /dev/null +++ b/cyber/detection-response/implementation.md @@ -0,0 +1,471 @@ +# Detection and Response Implementation Rules for Project PHOENIX + +This document defines reusable implementation-level detection and response rules for major cyber threat categories identified in Project PHOENIX. These rules are designed to support backend integration by specifying the required inputs, detection logic, alert level, mitigation action, and communication flow. 
+ +## Rule 1: Login and Authentication Attack Detection + +### Covers +- Brute force attacks +- Credential stuffing +- Unauthorized access using stolen credentials +- Suspicious login activity + +### Required Inputs +- user_id +- ip_address +- failed_login_attempts +- login_timestamp +- geo_location +- device_id +- session_id + +### Detection Logic +- Detect repeated failed login attempts in a short time window +- Detect login from unknown or unusual location +- Detect login from unrecognized device +- Detect sudden abnormal session behaviour after login + +### Rule Logic +- If failed login attempts are greater than 5 within 1 minute, trigger a high alert +- If failed login attempts are greater than 10, lock the account +- If login occurs from an unknown location or device, trigger a medium or high alert depending on severity +- If suspicious session activity is detected after login, revoke session + +### Alert Level +- Medium to Critical + +### Mitigation +- Lock account temporarily +- Enforce MFA +- Revoke active session +- Reset credentials if needed + +### Communication +- Notify security team +- Notify system administrator +- Notify affected user if account action is taken + +--- + +## Rule 2: API Abuse and Unauthorised API Access Detection + +### Covers +- API abuse +- Unsecured API access +- Abnormal API request patterns +- Excessive request frequency + +### Required Inputs +- api_key +- user_id +- endpoint +- request_count +- request_timestamp +- response_code +- auth_status +- source_ip + +### Detection Logic +- Detect request spikes from same API key or IP +- Detect unauthorised access attempts +- Detect repeated access to restricted endpoints +- Detect abnormal use outside expected usage patterns + +### Rule Logic +- If request count exceeds threshold within fixed time window, trigger a high alert +- If authentication fails repeatedly, block or throttle the source +- If restricted endpoint is accessed without proper authorisation, trigger critical alert 
+- If abnormal API pattern continues, revoke API key + +### Alert Level +- High to Critical + +### Mitigation +- Revoke API key +- Rate limit requests +- Block suspicious IP +- Enforce authentication and validation + +### Communication +- Notify backend team +- Notify system administrator +- Notify security team + +--- + +## Rule 3: Data Breach and Unauthorised Data Access Detection + +### Covers +- Data breach +- Unauthorised access to database +- Sensitive data exposure +- Abnormal data extraction + +### Required Inputs +- user_id +- database_query +- records_accessed +- download_size +- timestamp +- source_ip +- role + +### Detection Logic +- Detect abnormal number of database queries +- Detect access to sensitive records outside expected role +- Detect large data extraction +- Detect access at unusual times or from unusual locations + +### Rule Logic +- If accessed records exceed threshold, trigger high alert +- If low-privilege user accesses sensitive dataset, trigger critical alert +- If large download size is detected unexpectedly, suspend session +- If repeated abnormal access occurs, restrict access immediately + +### Alert Level +- High to Critical + +### Mitigation +- Restrict access +- Suspend session +- Isolate affected account +- Secure database endpoint +- Begin incident response + +### Communication +- Notify security team +- Notify database administrator +- Notify relevant stakeholders if sensitive data is impacted + +--- + +## Rule 4: Phishing, Scam, and Social Engineering Detection + +### Covers +- Phishing emails +- Scam messages +- Fake donation requests +- Government impersonation scams +- Voice phishing +- Social engineering attempts + +### Required Inputs +- message_id +- sender +- receiver +- sender_domain +- message_content +- links +- channel_type +- user_reports +- timestamp + +### Detection Logic +- Detect suspicious links +- Detect urgent or manipulative language +- Detect impersonation of trusted organisations +- Detect unverified 
sender or domain +- Detect repeated scam patterns in messages + +### Rule Logic +- If suspicious link and urgent language are both present, trigger high alert +- If sender domain does not match trusted domain, flag as suspicious +- If multiple users report same sender or content, quarantine the message +- If message contains known scam patterns, block or isolate delivery + +### Alert Level +- Medium to High + +### Mitigation +- Quarantine email or message +- Block malicious URL +- Prevent further delivery +- Flag suspicious sender + +### Communication +- Notify user +- Notify moderation or security team +- Raise awareness alert if campaign is widespread + +--- + +## Rule 5: Malware and Ransomware Detection + +### Covers +- Malware infection +- Information stealer +- Ransomware +- Suspicious executable activity +- Malicious mobile app behaviour + +### Required Inputs +- host_id +- process_name +- file_modifications +- cpu_usage +- outbound_connections +- file_encryption_events +- app_behaviour +- timestamp + +### Detection Logic +- Detect abnormal process behaviour +- Detect file encryption activity +- Detect unusual outbound communication +- Detect suspicious application access to sensitive files +- Detect sudden system changes + +### Rule Logic +- If file encryption event count exceeds threshold, trigger critical alert +- If unknown process accesses multiple sensitive files, isolate host +- If suspicious outbound traffic is detected after execution, flag malware activity +- If malicious application behaviour is found on device, block or remove app + +### Alert Level +- High to Critical + +### Mitigation +- Isolate infected system +- Stop malicious processes +- Remove malicious application +- Recover from clean backup if needed + +### Communication +- Notify security team +- Notify incident response team +- Notify administrator immediately + +--- + +## Rule 6: DDoS and Traffic Flooding Detection + +### Covers +- DDoS attacks +- Resource exhaustion +- Traffic 
flooding +- Service availability attacks + +### Required Inputs +- source_ip +- request_count +- request_rate +- endpoint +- system_load +- response_time +- timestamp + +### Detection Logic +- Detect abnormal traffic spikes +- Detect repeated requests from same or distributed sources +- Detect degraded service availability +- Detect resource exhaustion patterns + +### Rule Logic +- If traffic rate exceeds threshold, trigger high alert +- If endpoint response time degrades rapidly under traffic spike, trigger critical alert +- If repeated malicious traffic detected, rate limit or block source +- If system load exceeds safe limits, activate protection controls + +### Alert Level +- High to Critical + +### Mitigation +- Rate limit traffic +- Block malicious IP addresses +- Scale infrastructure if possible +- Redirect or filter suspicious traffic + +### Communication +- Notify administrator +- Notify network/security team +- Notify operations team if service is affected + +--- + +## Rule 7: IoT Sensor and Device Manipulation Detection + +### Covers +- IoT attack +- Sensor manipulation +- False disaster readings +- Device compromise +- Abnormal sensor communication + +### Required Inputs +- device_id +- sensor_type +- sensor_value +- expected_range +- timestamp +- location +- communication_status +- peer_sensor_values + +### Detection Logic +- Detect sensor values outside normal range +- Detect inconsistency across multiple sensors +- Detect unusual device communication +- Detect repeated invalid readings from same device + +### Rule Logic +- If sensor value exceeds expected threshold, trigger medium alert +- If multiple sensors conflict significantly, trigger high alert +- If same device repeatedly sends abnormal readings, mark device as compromised +- If device communication pattern becomes abnormal, isolate device from system + +### Alert Level +- Medium to High + +### Mitigation +- Ignore corrupted readings +- Isolate affected device +- Mark data as untrusted +- 
Require manual verification + +### Communication +- Notify system operator +- Notify technical team +- Notify monitoring team for validation + +--- + +## Rule 8: Misinformation and Deepfake Detection + +### Covers +- Fake disaster updates +- Social media misinformation +- Deepfake media +- False evacuation information +- Unverified public communications + +### Required Inputs +- content_id +- source +- content_text +- media_type +- verification_status +- share_count +- user_reports +- timestamp + +### Detection Logic +- Detect unverified sources +- Detect conflicting information against trusted data +- Detect AI-generated or manipulated media indicators +- Detect rapid spread of false or suspicious content + +### Rule Logic +- If content is unverified and rapidly spreading, trigger medium alert +- If content conflicts with trusted official sources, flag for review +- If deepfake indicators are found in media, trigger high alert +- If repeated user reports are received, restrict content distribution + +### Alert Level +- Medium to High + +### Mitigation +- Flag content +- Restrict distribution +- Send for verification +- Prevent automated escalation of false information + +### Communication +- Notify moderation team +- Notify analysts or reviewers +- Inform users where appropriate + +--- + +## Rule 9: Emergency Alert System Compromise Detection + +### Covers +- Emergency alert compromise +- Fake official alerts +- Unauthorized alert generation +- Alert suppression or misuse + +### Required Inputs +- alert_id +- sender_id +- alert_content +- approval_status +- dispatch_time +- source_ip +- authentication_status + +### Detection Logic +- Detect alert dispatch from unauthorised source +- Detect alert without valid approval +- Detect unusual alert timing or abnormal volume +- Detect alert content inconsistent with official disaster context + +### Rule Logic +- If alert is sent without approval, trigger critical alert +- If sender does not match authorised source, block 
alert +- If multiple abnormal alerts are generated, disable dispatch temporarily +- If authentication failure occurs during alert dispatch, revoke session immediately + +### Alert Level +- Critical + +### Mitigation +- Block or disable alert dispatch +- Revoke user/session access +- Verify official alert content manually +- Escalate incident immediately + +### Communication +- Notify authorities +- Notify system administrator +- Notify emergency response management team + +--- + +## Rule 10: Fraudulent Claims and Application Abuse Detection + +### Covers +- Fraudulent grant claims +- Fake disaster recovery applications +- Duplicate claim submissions +- False supporting evidence + +### Required Inputs +- application_id +- applicant_id +- uploaded_documents +- submission_time +- source_ip +- identity_verification_status +- previous_submission_count + +### Detection Logic +- Detect duplicate or repeated applications +- Detect fake or suspicious supporting documents +- Detect mismatched identity information +- Detect large number of submissions from same source + +### Rule Logic +- If duplicate applications are submitted from same source, trigger medium alert +- If uploaded evidence appears reused or manipulated, flag for verification +- If identity mismatch is detected, reject submission +- If repeated abusive submission pattern continues, block source + +### Alert Level +- Medium to High + +### Mitigation +- Reject suspicious applications +- Send to manual verification +- Block abusive source +- Strengthen validation controls + +### Communication +- Notify verification team +- Notify administrators +- Notify fraud review team + +--- + +## Summary + +These implementation rules provide reusable detection and response patterns that can be integrated into backend services. Instead of duplicating logic for every threat in the dataset, the rules group similar threats into core categories that can be monitored, alerted, and mitigated systematically within Project PHOENIX. 
diff --git a/cyber/secure-design/input-validation.md b/cyber/secure-design/input-validation.md new file mode 100644 index 00000000..dcf35615 --- /dev/null +++ b/cyber/secure-design/input-validation.md @@ -0,0 +1,151 @@ +# Input Validation and Injection Prevention + +## Overview +The PHOENIX system must validate and sanitise all user and system inputs to prevent injection attacks. Injection prevention is important to ensure that alert data, API inputs, and external data feeds are treated only as data and not executed as commands, queries, or scripts. + +## Validation Enforcement + +- Validation is enforced at the API layer before request processing +- Invalid requests are rejected early to reduce system load and attack surface + +## Injection Risks +Potential injection risks in the PHOENIX system include: +- SQL injection through API input fields +- Script injection through alert messages or user-submitted content +- Command injection through system-level processing +- Log injection through malicious text added to audit records + + +## Injection Prevention Rules + +### 1. Validate All Inputs +All input fields must be checked before processing. +- Reject unexpected characters +- Reject malformed input +- Ensure required fields are present +- Enforce correct data types +- Repeated invalid inputs are tracked and flagged + +### 2. Use Allowed Input Formats +Each field should only accept expected formats. +- `title` → plain text, limited length +- `message` → plain text only +- `severity` → only predefined values (low, medium, high, critical) +- `disaster_type` → only predefined values (bushfire, flood, both) + +### 3. Sanitise User Input +Inputs must be sanitised to remove or neutralise harmful content. +- Remove script tags +- Escape special characters where required +- Strip unsupported HTML or code content + +### 4.
Use Parameterised Queries +Database operations must use parameterised queries or prepared statements instead of directly inserting user input into queries. + +### 5. Avoid Direct Command Execution +User input must never be directly passed into system commands, shell commands, or file operations. + +### 6. Validate External Data Sources +Data received from external systems such as BoM, Scamwatch, or other feeds must also be validated before being processed. + +### 7. Protect Logging and Error Messages +Input included in logs must be sanitised to prevent log injection. System error messages should not expose internal query structures or sensitive implementation details. + +### 8. Input Limits +- Maximum API request size: 2 KB – 10 KB (depending on endpoint) +- Prevents large payload attacks and system overload + +### 9. Field Length Constraints +- title - max 100 characters +- message - max 500 characters +- location - max 100 characters +- source - max 50 characters + +### 10. Logging for Suspicious Inputs + +- Suspicious or repeated invalid inputs are logged with: + - timestamp + - IP address + - action attempted +- Supports monitoring and security audits + + + +## Example Validation Rules + +| Field | Allowed Input | Rule | +|---|---|---| +| title | Text | Maximum 100 characters, no script tags | +| message | Text | Maximum 500 characters, plain text only | +| severity | Enum | Must be one of: low, medium, high, critical | +| disaster_type | Enum | Must be one of: bushfire, flood, both | +| location | Text | Letters, spaces, and approved punctuation only | + +## Format Validation Rules + +### Overview +The PHOENIX system enforces strict format validation to ensure that all inputs follow expected data types, structures, and allowed values. This prevents invalid data entry and reduces security risks.
+ + + +### Field Format Requirements + +| Field | Type | Allowed Format | Example | +|---|---|---|---| +| title | string | Plain text (max 100 characters) | "Flood Warning" | +| message | string | Plain text (max 500 characters) | "Heavy rainfall expected" | +| disaster_type | enum | bushfire, flood, both | "flood" | +| threat_type | enum | phishing, scam, ransomware, misinformation | "phishing" | +| severity | enum | low, medium, high, critical | "high" | +| location | string | Letters and spaces only | "Melbourne" | +| source | string | Valid source name | "Scamwatch" | + + +### Validation Rules + +- Inputs must match defined data types. +- Enum fields must only contain predefined values. +- Input length must not exceed specified limits. +- Invalid or unexpected formats must be rejected. +- Only safe characters are allowed in text fields. + +### Validation Workflow + +- Request received by API +- Required fields checked +- Data type validation applied +- Allowed values verified +- Input length checked +- Request accepted or rejected + + +### Security Impact + +- Reduces attack surface at early stages +- Prevents system overload from invalid requests +- Detects malicious patterns before deeper processing +- Improves system reliability and trust + + +### Error Handling for Invalid Format + +Invalid inputs should result in: +- `400 Bad Request` +- Clear error message indicating the incorrect field + +Repeated invalid requests may result in: + - `429 Too Many Requests` +Suspicious activity is flagged and logged. + +Example: + +```json +{ + "error_code": 400, + "error_message": "Invalid format", + "details": "Severity must be one of: low, medium, high, critical" +} +``` +## Conclusion +Injection prevention in PHOENIX is achieved by validating inputs, sanitising content, restricting allowed formats, using parameterised queries, and ensuring that untrusted input is never executed as code or commands.
+ diff --git a/cyber/secure-design/rate-limiting.md b/cyber/secure-design/rate-limiting.md new file mode 100644 index 00000000..49574929 --- /dev/null +++ b/cyber/secure-design/rate-limiting.md @@ -0,0 +1,147 @@ +# Rate Limiting and Abuse Prevention Design + +## Overview +The PHOENIX system applies rate limiting to prevent brute-force attacks, spam alerts, API abuse, and excessive requests that could affect system availability. In addition to rate limiting, the system includes behaviour-based abuse prevention mechanisms. These mechanisms detect repeated failures, suspicious activity patterns, and misuse of system features. + + +## Rate Limiting Rules + +### 1. Login Endpoint +- Limit: 5 requests per 15 minutes per user/IP +- Purpose: Prevent brute-force login attempts +- Failed login attempts are tracked +- Temporary lock applied after threshold (15 minutes) +- Repeated violations trigger extended lock duration +- Suspicious IPs may be blocked or monitored + +### 2. Create Alert Endpoint +- Limit: 10 requests per minute per authenticated user +- Purpose: Prevent spam alert creation and misuse of the TEAVS system +- Duplicate or repeated alerts are detected +- Similar alert content within a short time is flagged + +### 3. Get Alert / List Alerts +- Limit: 60 requests per minute per user +- Purpose: Prevent excessive API load while allowing normal system use + +### 4. Verify Alert Endpoint +- Limit: 30 requests per minute per IP/user +- Purpose: Prevent automated abuse while allowing public verification + +### 5. Update Alert Status +- Limit: 20 requests per minute per admin user +- Purpose: Prevent misuse of alert lifecycle management + + +## Abuse Prevention Controls + +### 1.
Brute-Force Protection +- Repeated failed login attempts trigger temporary blocking +- Account or IP may be locked for a limited period after too many failures + +### Protection Mechanisms + +- Limit failed login attempts per user/IP +- Temporary account lock after repeated failures +- IP blocking for suspicious activity + +### Failed Login Handling + +- After 5 failed attempts - temporary lock (15 minutes) +- Repeated violations - extended lock duration + +### Account Lock Strategy + +- Short-term lock applied for initial abuse +- Longer lock duration for repeated violations +- Admin override available if required + +### 2. Spam Alert Prevention +- Alert creation is restricted to authorised roles only +- Excessive alert creation attempts trigger throttling or review + +### Spam Prevention Controls + +- Only authorised roles can create alerts +- Duplicate or repeated alerts are flagged +- High-frequency alert creation is monitored + +### Behaviour Monitoring +The system detects: +- Multiple alerts with similar content +- Rapid alert submissions +- Unusual activity patterns + +### Response to Spam + +- Alert creation temporarily restricted +- Alerts flagged for review +- User activity logged for investigation + +### 3. API Throttling +- Requests above the allowed threshold are temporarily rejected +- The system should return a `429 Too Many Requests` response + +### 4. 
Monitoring and Logging +- Excessive requests must be logged +- Suspicious behaviour should be flagged for review + +### Logging and Audit +All suspicious activities are: + +- Logged securely +- Time-stamped +- Stored for audit and analysis + +Logs help: + +- Identify attackers +- Support investigations +- Improve system security + +## Example Rate Limit Table + +| Endpoint | Limit | Purpose | +|---|---|---| +| `/api/login` | 5 per 15 min | Prevent brute-force login | +| `/api/alerts` POST | 10 per min | Prevent spam alerts | +| `/api/alerts` GET | 60 per min | Control API usage | +| `/api/alerts/{id}/verify` | 30 per min | Prevent automated abuse | +| `/api/alerts/{id}/status` | 20 per min | Protect admin actions | + + +## Response for Exceeded Limits +If a rate limit is exceeded, the API should return: +- `429 Too Many Requests` +- a message explaining that the request limit has been exceeded +- a retry time if applicable + +## Abuse Detection Strategy +The system monitors: + +- High request frequency +- Repeated failed actions +- Suspicious usage patterns + +Actions taken: + +- Temporary blocking +- Activity logging +- Security alerts triggered + +## Security Response Actions +When abuse is detected: + +- Access may be temporarily restricted +- Requests may be rejected +- System administrators may be notified + +This design ensures: + +- Protection against brute-force attacks +- Prevention of spam alert misuse +- Detection of abnormal behaviour +- Improved system trust and reliability + +## Conclusion +Rate limiting in PHOENIX helps maintain system availability, prevents abuse, and protects critical alert functionality from brute-force, spam, and overload attacks. 
\ No newline at end of file diff --git a/cyber/secure-design/signing-workflow.md b/cyber/secure-design/signing-workflow.md new file mode 100644 index 00000000..be20ce56 --- /dev/null +++ b/cyber/secure-design/signing-workflow.md @@ -0,0 +1,22 @@ +# Signing Workflow + +## Overview +The signing workflow in the TEAVS system defines the process by which data is securely signed before being transmitted. This ensures that the data originates from a trusted source and remains unchanged during transmission. + +## Signing Process +Steps followed by the system to generate a digital signature: + +1. Data is generated by the Backend (e.g., alerts, notifications, API responses). +2. The data is passed through the SHA-256 hashing algorithm to produce a fixed-length hash value. +3. The generated hash value represents the integrity of the original data. +4. The Backend encrypts this hash using its private key. +5. The encrypted hash becomes the digital signature. +6. The original data and the digital signature are sent together to the receiver. + +## Key Characteristics +- The private key is kept secure within the Backend system. +- Any change in the original data results in a completely different hash value. +- The signature ensures both data integrity and source authenticity. + +## Flow Representation +Data → SHA-256 → Encrypt with Private Key → Digital Signature \ No newline at end of file diff --git a/cyber/secure-design/verification-process.md b/cyber/secure-design/verification-process.md new file mode 100644 index 00000000..25dce228 --- /dev/null +++ b/cyber/secure-design/verification-process.md @@ -0,0 +1,27 @@ +# Verification Process + +## Overview +The verification process ensures that the received data is authentic and has not been tampered with. It allows the receiver to validate both the integrity of the data and the identity of the sender. + +## Verification Steps +Steps followed by the receiver to verify the digital signature: + +1.
The receiver obtains the original data and the digital signature. +2. The digital signature is decrypted using the sender’s public key to retrieve the original hash value. +3. The receiver independently computes the SHA-256 hash of the received data. +4. The computed hash is compared with the decrypted hash. + +## Verification Outcome +- If both hash values match: + The data is authentic and has not been altered. + +- If the hash values do not match: + The data is considered tampered or invalid and is rejected. + +## Key Characteristics +- The public key is shared and used only for verification. +- No private key is exposed during this process. +- Ensures protection against tampering and spoofing. + +## Flow Representation +Signature → Decrypt with Public Key → Compare with SHA-256 Hash \ No newline at end of file diff --git a/docs/secure_storage.md b/docs/secure_storage.md new file mode 100644 index 00000000..7e995cb2 --- /dev/null +++ b/docs/secure_storage.md @@ -0,0 +1,27 @@ +# PHOENIX Secure Storage Strategy (CY011) + +## 1. Overview +This document defines the standards for protecting PHOENIX data at rest. The primary goal is to ensure the integrity of disaster hazard data and the confidentiality of security credentials, preventing unauthorized modification of emergency alerts. + +## 2. Data Security Matrix +| Data Category | Storage Location | Protection Mechanism | +| :--- | :--- | :--- | +| **Hazard & Alert Data** | PostgreSQL (Main DB) | **AES-256 Encryption** at rest; Row-Level Security (RLS). | +| **Token Blacklist** | Redis (Cache Layer) | **Time-To-Live (TTL)** expiration; prevents memory bloat. | +| **Audit & Abuse Logs** | Secure Log Server | **WORM** (Write Once, Read Many) to prevent deletion. | +| **Secrets & JWT Keys** | AWS Secrets Manager | **HSM** (Hardware Security Module) backing; no local `.env` storage. | + +## 3.
Data Integrity (Hashing) +To ensure that disaster alerts are not tampered with by external actors: +* Every alert record is assigned a unique **SHA-256 hash** upon creation. +* The system re-calculates and verifies this hash before any alert is broadcast via the TEAVS API. +* Any mismatch between the stored hash and the current content triggers an immediate "Integrity Violation" alert to the PHOENIX AI analysis engine. + +## 4. Abuse Prevention & Logging +In alignment with the PHOENIX rate-limiting framework: +* **Brute-Force Protection**: Repeated failed attempts to access storage-level data trigger an automatic temporary block on the originating IP or User ID. +* **Throttling**: Excessive queries that exceed defined thresholds trigger the **429 Too Many Requests** error response to maintain system availability. +* **Detailed Monitoring**: Every successful or failed administrative change to an alert status must be logged with a timestamp and user signature. + +## 5. Secret Management Policy +Private keys used for signing JWTs (as defined in CY007) are never stored within the source code or local environment files. The PHOENIX application retrieves these secrets at runtime through a secure, authenticated connection to the organization's secrets management provider. \ No newline at end of file