Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions a4d-python/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ A4D_DATASET=tracker
A4D_DOWNLOAD_BUCKET=a4dphase2_upload
A4D_UPLOAD_BUCKET=a4dphase2_output

# GCP Authentication (optional - uses Application Default Credentials if not set)
# For local development: run `gcloud auth application-default login`
# For CI/CD or VM: set path to service account key file
# GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account-key.json

# Paths
A4D_DATA_ROOT=/path/to/tracker/files
A4D_OUTPUT_DIR=output
Expand Down
3 changes: 2 additions & 1 deletion a4d-python/scripts/check_sheets.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#!/usr/bin/env python3
"""Check which sheets are being processed by R vs Python."""

import polars as pl
from pathlib import Path

import polars as pl


def check_sheets():
"""Compare which sheets were processed."""
Expand Down
39 changes: 23 additions & 16 deletions a4d-python/scripts/compare_r_vs_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,20 @@
R and Python pipelines to verify the migration produces equivalent results.

Usage:
uv run python scripts/compare_r_vs_python.py --file "2018_CDA A4D Tracker_patient_cleaned.parquet"
uv run python scripts/compare_r_vs_python.py -f "2018_CDA A4D Tracker_patient_cleaned.parquet"
uv run python scripts/compare_r_vs_python.py \
--file "2018_CDA A4D Tracker_patient_cleaned.parquet"
uv run python scripts/compare_r_vs_python.py \
-f "2018_CDA A4D Tracker_patient_cleaned.parquet"
"""

from pathlib import Path

import polars as pl
import typer
from pathlib import Path
from rich import box
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich import box
from rich.table import Table

console = Console()
app = typer.Typer()
Expand Down Expand Up @@ -169,7 +172,7 @@ def compare_metadata_fields(r_df: pl.DataFrame, py_df: pl.DataFrame):
sample = r_unique.head(3).to_list()
console.print(f" Sample: {sample}")
else:
console.print(f" [red]✗ Mismatch![/red]")
console.print(" [red]✗ Mismatch![/red]")
console.print(f" R has {len(r_unique):,} unique values")
console.print(f" Python has {len(py_unique):,} unique values")

Expand Down Expand Up @@ -268,7 +271,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
try:
joined = r_df.join(py_df, on=join_keys, how="inner", suffix="_py")
console.print(
f"[cyan]Analyzing {len(joined):,} common records (matched on {'+'.join(join_keys)})[/cyan]\n"
f"[cyan]Analyzing {len(joined):,} common records "
f"(matched on {'+'.join(join_keys)})[/cyan]\n"
)
except Exception as e:
console.print(f"[red]Error joining datasets: {e}[/red]\n")
Expand All @@ -281,8 +285,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):

# Tolerance for floating point comparisons
# Use a relative tolerance of 1e-9 (about 9 significant digits), with an
# absolute floor of 1e-12 so that values near zero can still compare equal
FLOAT_REL_TOL = 1e-9
FLOAT_ABS_TOL = 1e-12
float_rel_tol = 1e-9
float_abs_tol = 1e-12

for col in sorted(common_cols):
col_py = f"{col}_py"
Expand All @@ -305,7 +309,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):

if is_numeric:
# For numeric columns, use approximate comparison
# Two values are considered equal if |a - b| <= max(rel_tol * max(|a|, |b|), abs_tol)
# Two values are equal if:
# |a - b| <= max(rel_tol * max(|a|, |b|), abs_tol)

# Add columns for comparison logic
comparison_df = joined.with_columns(
Expand All @@ -315,9 +320,9 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
# Calculate tolerance threshold
pl.max_horizontal(
[
FLOAT_REL_TOL
float_rel_tol
* pl.max_horizontal([pl.col(col).abs(), pl.col(col_py).abs()]),
pl.lit(FLOAT_ABS_TOL),
pl.lit(float_abs_tol),
]
).alias("_tolerance"),
# Check null status
Expand All @@ -327,7 +332,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
)

# Find mismatches
# Mismatch if: (1) null status differs OR (2) both not null and differ by more than tolerance
# Mismatch if: (1) null status differs OR
# (2) both not null and differ by more than tolerance
mismatched_rows = comparison_df.filter(
(pl.col("_col_null") != pl.col("_col_py_null")) # Null mismatch
| (
Expand Down Expand Up @@ -394,7 +400,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
mismatches.items(), key=lambda x: x[1]["percentage"], reverse=True
):
console.print(
f"\n[bold cyan]{col}:[/bold cyan] {stats['count']} mismatches ({stats['percentage']:.1f}%)"
f"\n[bold cyan]{col}:[/bold cyan] "
f"{stats['count']} mismatches ({stats['percentage']:.1f}%)"
)
# Include patient_id and sheet_name in examples
examples_with_ids = stats["examples_with_ids"]
Expand Down Expand Up @@ -496,14 +503,14 @@ def compare(
console.print(f" ✓ R output: {len(r_df):,} records, {len(r_df.columns)} columns")
except Exception as e:
console.print(f"[red] ✗ Failed to read R parquet: {e}[/red]")
raise typer.Exit(1)
raise typer.Exit(1) from e

try:
py_df = pl.read_parquet(python_parquet)
console.print(f" ✓ Python output: {len(py_df):,} records, {len(py_df.columns)} columns")
except Exception as e:
console.print(f"[red] ✗ Failed to read Python parquet: {e}[/red]")
raise typer.Exit(1)
raise typer.Exit(1) from e

console.print()

Expand Down
6 changes: 4 additions & 2 deletions a4d-python/scripts/export_single_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
uv run python scripts/export_single_tracker.py <tracker_file> <output_dir>

Example:
uv run python scripts/export_single_tracker.py \
"/Volumes/USB SanDisk 3.2Gen1 Media/A4D/data/a4dphase2_upload/Malaysia/SBU/2024_Sibu Hospital A4D Tracker.xlsx" \
uv run python scripts/export_single_tracker.py \\
"/Volumes/USB SanDisk 3.2Gen1 Media/A4D/data/\\
a4dphase2_upload/Malaysia/SBU/\\
2024_Sibu Hospital A4D Tracker.xlsx" \\
output/patient_data_raw
"""

Expand Down
3 changes: 2 additions & 1 deletion a4d-python/scripts/reprocess_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
"""Quick script to re-process a single tracker."""

from pathlib import Path

from a4d.pipeline.tracker import process_tracker_patient

tracker_file = Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Cambodia/CDA/2025_06_CDA A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Cambodia/CDA/2025_06_CDA A4D Tracker.xlsx" # noqa: E501
)
output_root = Path("/Volumes/USB SanDisk 3.2Gen1 Media/a4d/output_python")

Expand Down
15 changes: 8 additions & 7 deletions a4d-python/scripts/test_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""Test cleaning pipeline on Sibu Hospital 2024 tracker."""

from pathlib import Path

import polars as pl

from a4d.clean.patient import clean_patient_data
Expand All @@ -27,7 +28,7 @@ def test_cleaning():

# Read raw data
df_raw = pl.read_parquet(raw_path)
print(f"\n📥 Raw data loaded:")
print("\n📥 Raw data loaded:")
print(f" Rows: {len(df_raw)}")
print(f" Columns: {len(df_raw.columns)}")
print(f" Columns: {df_raw.columns[:10]}...")
Expand All @@ -36,15 +37,15 @@ def test_cleaning():
collector = ErrorCollector()

# Clean data
print(f"\n🧹 Cleaning data...")
print("\n🧹 Cleaning data...")
df_clean = clean_patient_data(df_raw, collector)

print(f"\n📤 Cleaned data:")
print("\n📤 Cleaned data:")
print(f" Rows: {len(df_clean)}")
print(f" Columns: {len(df_clean.columns)}")

# Show schema
print(f"\n📋 Schema (first 20 columns):")
print("\n📋 Schema (first 20 columns):")
for i, (col, dtype) in enumerate(df_clean.schema.items()):
if i < 20:
null_count = df_clean[col].null_count()
Expand All @@ -55,12 +56,12 @@ def test_cleaning():
print(f"\n⚠️ Errors collected: {len(collector)}")
if len(collector) > 0:
errors_df = collector.to_dataframe()
print(f"\n Error breakdown by column:")
print("\n Error breakdown by column:")
error_counts = errors_df.group_by("column").count().sort("count", descending=True)
for row in error_counts.iter_rows(named=True):
print(f" {row['column']:40s}: {row['count']:3d} errors")

print(f"\n First 5 errors:")
print("\n First 5 errors:")
print(errors_df.head(5))

# Write output
Expand All @@ -72,7 +73,7 @@ def test_cleaning():
print(f"\n✅ Cleaned data written to: {output_path}")

# Sample data check
print(f"\n🔍 Sample row (first non-null patient):")
print("\n🔍 Sample row (first non-null patient):")
sample = df_clean.filter(pl.col("patient_id").is_not_null()).head(1)
for col in sample.columns[:15]:
print(f" {col:40s}: {sample[col][0]}")
Expand Down
28 changes: 15 additions & 13 deletions a4d-python/scripts/test_extended_trackers.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,52 @@
#!/usr/bin/env python3
"""Extended end-to-end tests on older tracker files (2018-2021)."""

# Disable logging for clean output
import logging
import sys
from pathlib import Path
from a4d.extract.patient import read_all_patient_sheets

from a4d.clean.patient import clean_patient_data
from a4d.errors import ErrorCollector
import sys

# Disable logging for clean output
import logging
from a4d.extract.patient import read_all_patient_sheets

logging.disable(logging.CRITICAL)

test_files = [
(
"2021_Siriraj_Thailand",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/SRJ/2021_Siriraj Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/SRJ/2021_Siriraj Hospital A4D Tracker.xlsx" # noqa: E501
),
),
(
"2021_UdonThani_Thailand",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/UTH/2021_Udon Thani Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/UTH/2021_Udon Thani Hospital A4D Tracker.xlsx" # noqa: E501
),
),
(
"2020_VNC_Vietnam",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Vietnam/VNC/2020_Vietnam National Children's Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Vietnam/VNC/2020_Vietnam National Children's Hospital A4D Tracker.xlsx" # noqa: E501
),
),
(
"2019_Penang_Malaysia",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2019_Penang General Hospital A4D Tracker_DC.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2019_Penang General Hospital A4D Tracker_DC.xlsx" # noqa: E501
),
),
(
"2019_Mandalay_Myanmar",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/MCH/2019_Mandalay Children's Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/MCH/2019_Mandalay Children's Hospital A4D Tracker.xlsx" # noqa: E501
),
),
(
"2018_Yangon_Myanmar",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/YCH/2018_Yangon Children's Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/YCH/2018_Yangon Children's Hospital A4D Tracker.xlsx" # noqa: E501
),
),
]
Expand Down Expand Up @@ -83,7 +83,8 @@
)

print(
f" ✅ EXTRACTION: {len(df_raw)} rows, {len(df_raw.columns)} cols, year={year}, months={months}"
f" ✅ EXTRACTION: {len(df_raw)} rows, "
f"{len(df_raw.columns)} cols, year={year}, months={months}"
)

# Clean
Expand All @@ -105,7 +106,8 @@
}

print(
f" ✅ CLEANING: {len(df_clean)} rows, {len(df_clean.columns)} cols, {len(collector)} errors"
f" ✅ CLEANING: {len(df_clean)} rows, "
f"{len(df_clean.columns)} cols, {len(collector)} errors"
)
print(
f" Key columns: insulin_type={stats['insulin_type']}/{len(df_clean)}, "
Expand Down
21 changes: 11 additions & 10 deletions a4d-python/scripts/test_multiple_trackers.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
#!/usr/bin/env python3
"""Test extraction + cleaning on multiple trackers for end-to-end validation."""

# Disable logging for clean output
import logging
import sys
from pathlib import Path
from a4d.extract.patient import read_all_patient_sheets

from a4d.clean.patient import clean_patient_data
from a4d.errors import ErrorCollector
import sys

# Disable logging for clean output
import logging
from a4d.extract.patient import read_all_patient_sheets

logging.disable(logging.CRITICAL)

test_files = [
(
"2024_ISDFI",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Philippines/ISD/2024_ISDFI A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Philippines/ISD/2024_ISDFI A4D Tracker.xlsx" # noqa: E501
),
),
(
"2024_Penang",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2024_Penang General Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2024_Penang General Hospital A4D Tracker.xlsx" # noqa: E501
),
),
(
"2023_Sibu",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/SBU/2023_Sibu Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/SBU/2023_Sibu Hospital A4D Tracker.xlsx" # noqa: E501
),
),
(
"2022_Penang",
Path(
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2022_Penang General Hospital A4D Tracker.xlsx"
"/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2022_Penang General Hospital A4D Tracker.xlsx" # noqa: E501
),
),
]
Expand Down Expand Up @@ -72,7 +72,8 @@
)

print(
f" ✅ EXTRACTION: {len(df_raw)} rows, {len(df_raw.columns)} cols, year={year}, months={months}"
f" ✅ EXTRACTION: {len(df_raw)} rows, "
f"{len(df_raw.columns)} cols, year={year}, months={months}"
)

# Clean
Expand Down
3 changes: 2 additions & 1 deletion a4d-python/scripts/verify_fixes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#!/usr/bin/env python3
"""Verify that the Python fixes are working correctly by analyzing the output."""

import polars as pl
from pathlib import Path

import polars as pl


def verify_python_output():
"""Verify Python output has correct types and column ordering."""
Expand Down
8 changes: 5 additions & 3 deletions a4d-python/src/a4d/clean/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,11 @@ def parse_date_column(
df = df.with_columns(pl.col(column).alias(f"_orig_{column}"))

# Apply parse_date_flexible to each value
# NOTE: Using list-based approach instead of map_elements() because map_elements()
# with return_dtype=pl.Date fails when ALL values are None (all-NA columns like hospitalisation_date).
# Explicit Series creation with dtype=pl.Date works because it doesn't require non-null values.
# NOTE: Using list-based approach instead of map_elements() because
# map_elements() with return_dtype=pl.Date fails when ALL values are None
# (all-NA columns like hospitalisation_date).
# Explicit Series creation with dtype=pl.Date works because it doesn't
# require non-null values.
column_values = df[column].cast(pl.Utf8).to_list()
parsed_dates = [
parse_date_flexible(val, error_val=settings.error_val_date) for val in column_values
Expand Down
Loading