CorrelAid · pmayd · Feb 24, 2026 · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026
diff --git a/a4d-python/.env.example b/a4d-python/.env.example
@@ -7,6 +7,11 @@ A4D_DATASET=tracker
 A4D_DOWNLOAD_BUCKET=a4dphase2_upload
 A4D_UPLOAD_BUCKET=a4dphase2_output
 
+# GCP Authentication (optional - uses Application Default Credentials if not set)
+# For local development: run `gcloud auth application-default login`
+# For CI/CD or VM: set path to service account key file
+# GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account-key.json
+
 # Paths
 A4D_DATA_ROOT=/path/to/tracker/files
 A4D_OUTPUT_DIR=output

diff --git a/a4d-python/scripts/check_sheets.py b/a4d-python/scripts/check_sheets.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 """Check which sheets are being processed by R vs Python."""
 
-import polars as pl
 from pathlib import Path
 
+import polars as pl
+
 
 def check_sheets():
     """Compare which sheets were processed."""

diff --git a/a4d-python/scripts/compare_r_vs_python.py b/a4d-python/scripts/compare_r_vs_python.py
@@ -5,17 +5,20 @@
 R and Python pipelines to verify the migration produces equivalent results.
 
 Usage:
-    uv run python scripts/compare_r_vs_python.py --file "2018_CDA A4D Tracker_patient_cleaned.parquet"
-    uv run python scripts/compare_r_vs_python.py -f "2018_CDA A4D Tracker_patient_cleaned.parquet"
+    uv run python scripts/compare_r_vs_python.py \
+        --file "2018_CDA A4D Tracker_patient_cleaned.parquet"
+    uv run python scripts/compare_r_vs_python.py \
+        -f "2018_CDA A4D Tracker_patient_cleaned.parquet"
 """
 
+from pathlib import Path
+
 import polars as pl
 import typer
-from pathlib import Path
+from rich import box
 from rich.console import Console
-from rich.table import Table
 from rich.panel import Panel
-from rich import box
+from rich.table import Table
 
 console = Console()
 app = typer.Typer()
@@ -169,7 +172,7 @@ def compare_metadata_fields(r_df: pl.DataFrame, py_df: pl.DataFrame):
             sample = r_unique.head(3).to_list()
             console.print(f"    Sample: {sample}")
         else:
-            console.print(f"  [red]✗ Mismatch![/red]")
+            console.print("  [red]✗ Mismatch![/red]")
             console.print(f"    R has {len(r_unique):,} unique values")
             console.print(f"    Python has {len(py_unique):,} unique values")
 
@@ -268,7 +271,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
     try:
         joined = r_df.join(py_df, on=join_keys, how="inner", suffix="_py")
         console.print(
-            f"[cyan]Analyzing {len(joined):,} common records (matched on {'+'.join(join_keys)})[/cyan]\n"
+            f"[cyan]Analyzing {len(joined):,} common records "
+            f"(matched on {'+'.join(join_keys)})[/cyan]\n"
         )
     except Exception as e:
         console.print(f"[red]Error joining datasets: {e}[/red]\n")
@@ -281,8 +285,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
 
     # Tolerance for floating point comparisons
     # Use relative tolerance of 1e-9 (about 9 decimal places)
-    FLOAT_REL_TOL = 1e-9
-    FLOAT_ABS_TOL = 1e-12
+    float_rel_tol = 1e-9
+    float_abs_tol = 1e-12
 
     for col in sorted(common_cols):
         col_py = f"{col}_py"
@@ -305,7 +309,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
 
                 if is_numeric:
                     # For numeric columns, use approximate comparison
-                    # Two values are considered equal if |a - b| <= max(rel_tol * max(|a|, |b|), abs_tol)
+                    # Two values are equal if:
+                    # |a - b| <= max(rel_tol * max(|a|, |b|), abs_tol)
 
                     # Add columns for comparison logic
                     comparison_df = joined.with_columns(
@@ -315,9 +320,9 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
                             # Calculate tolerance threshold
                             pl.max_horizontal(
                                 [
-                                    FLOAT_REL_TOL
+                                    float_rel_tol
                                     * pl.max_horizontal([pl.col(col).abs(), pl.col(col_py).abs()]),
-                                    pl.lit(FLOAT_ABS_TOL),
+                                    pl.lit(float_abs_tol),
                                 ]
                             ).alias("_tolerance"),
                             # Check null status
@@ -327,7 +332,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
                     )
 
                     # Find mismatches
-                    # Mismatch if: (1) null status differs OR (2) both not null and differ by more than tolerance
+                    # Mismatch if: (1) null status differs OR
+                    # (2) both not null and differ by more than tolerance
                     mismatched_rows = comparison_df.filter(
                         (pl.col("_col_null") != pl.col("_col_py_null"))  # Null mismatch
                         | (
@@ -394,7 +400,8 @@ def find_value_mismatches(r_df: pl.DataFrame, py_df: pl.DataFrame):
             mismatches.items(), key=lambda x: x[1]["percentage"], reverse=True
         ):
             console.print(
-                f"\n[bold cyan]{col}:[/bold cyan] {stats['count']} mismatches ({stats['percentage']:.1f}%)"
+                f"\n[bold cyan]{col}:[/bold cyan] "
+                f"{stats['count']} mismatches ({stats['percentage']:.1f}%)"
             )
             # Include patient_id and sheet_name in examples
             examples_with_ids = stats["examples_with_ids"]
@@ -496,14 +503,14 @@ def compare(
         console.print(f"  ✓ R output: {len(r_df):,} records, {len(r_df.columns)} columns")
     except Exception as e:
         console.print(f"[red]  ✗ Failed to read R parquet: {e}[/red]")
-        raise typer.Exit(1)
+        raise typer.Exit(1) from e
 
     try:
         py_df = pl.read_parquet(python_parquet)
         console.print(f"  ✓ Python output: {len(py_df):,} records, {len(py_df.columns)} columns")
     except Exception as e:
         console.print(f"[red]  ✗ Failed to read Python parquet: {e}[/red]")
-        raise typer.Exit(1)
+        raise typer.Exit(1) from e
 
     console.print()
 

diff --git a/a4d-python/scripts/export_single_tracker.py b/a4d-python/scripts/export_single_tracker.py
@@ -5,8 +5,10 @@
     uv run python scripts/export_single_tracker.py <tracker_file> <output_dir>
 
 Example:
-    uv run python scripts/export_single_tracker.py \
-        "/Volumes/USB SanDisk 3.2Gen1 Media/A4D/data/a4dphase2_upload/Malaysia/SBU/2024_Sibu Hospital A4D Tracker.xlsx" \
+    TRACKER_DIR="/Volumes/USB SanDisk 3.2Gen1 Media/A4D/data/a4dphase2_upload/Malaysia/SBU"
+    TRACKER_FILE="$TRACKER_DIR/2024_Sibu Hospital A4D Tracker.xlsx"
+    uv run python scripts/export_single_tracker.py \\
+        "$TRACKER_FILE" \\
         output/patient_data_raw
 """
 

diff --git a/a4d-python/scripts/reprocess_tracker.py b/a4d-python/scripts/reprocess_tracker.py
@@ -2,10 +2,11 @@
 """Quick script to re-process a single tracker."""
 
 from pathlib import Path
+
 from a4d.pipeline.tracker import process_tracker_patient
 
 tracker_file = Path(
-    "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Cambodia/CDA/2025_06_CDA A4D Tracker.xlsx"
+    "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Cambodia/CDA/2025_06_CDA A4D Tracker.xlsx"  # noqa: E501
 )
 output_root = Path("/Volumes/USB SanDisk 3.2Gen1 Media/a4d/output_python")
 

diff --git a/a4d-python/scripts/test_cleaning.py b/a4d-python/scripts/test_cleaning.py
@@ -2,6 +2,7 @@
 """Test cleaning pipeline on Sibu Hospital 2024 tracker."""
 
 from pathlib import Path
+
 import polars as pl
 
 from a4d.clean.patient import clean_patient_data
@@ -27,7 +28,7 @@ def test_cleaning():
 
     # Read raw data
     df_raw = pl.read_parquet(raw_path)
-    print(f"\n📥 Raw data loaded:")
+    print("\n📥 Raw data loaded:")
     print(f"   Rows: {len(df_raw)}")
     print(f"   Columns: {len(df_raw.columns)}")
     print(f"   Columns: {df_raw.columns[:10]}...")
@@ -36,15 +37,15 @@ def test_cleaning():
     collector = ErrorCollector()
 
     # Clean data
-    print(f"\n🧹 Cleaning data...")
+    print("\n🧹 Cleaning data...")
     df_clean = clean_patient_data(df_raw, collector)
 
-    print(f"\n📤 Cleaned data:")
+    print("\n📤 Cleaned data:")
     print(f"   Rows: {len(df_clean)}")
     print(f"   Columns: {len(df_clean.columns)}")
 
     # Show schema
-    print(f"\n📋 Schema (first 20 columns):")
+    print("\n📋 Schema (first 20 columns):")
     for i, (col, dtype) in enumerate(df_clean.schema.items()):
         if i < 20:
             null_count = df_clean[col].null_count()
@@ -55,12 +56,12 @@ def test_cleaning():
     print(f"\n⚠️  Errors collected: {len(collector)}")
     if len(collector) > 0:
         errors_df = collector.to_dataframe()
-        print(f"\n   Error breakdown by column:")
+        print("\n   Error breakdown by column:")
         error_counts = errors_df.group_by("column").count().sort("count", descending=True)
         for row in error_counts.iter_rows(named=True):
             print(f"      {row['column']:40s}: {row['count']:3d} errors")
 
-        print(f"\n   First 5 errors:")
+        print("\n   First 5 errors:")
         print(errors_df.head(5))
 
     # Write output
@@ -72,7 +73,7 @@ def test_cleaning():
     print(f"\n✅ Cleaned data written to: {output_path}")
 
     # Sample data check
-    print(f"\n🔍 Sample row (first non-null patient):")
+    print("\n🔍 Sample row (first non-null patient):")
     sample = df_clean.filter(pl.col("patient_id").is_not_null()).head(1)
     for col in sample.columns[:15]:
         print(f"   {col:40s}: {sample[col][0]}")

diff --git a/a4d-python/scripts/test_extended_trackers.py b/a4d-python/scripts/test_extended_trackers.py
@@ -1,52 +1,52 @@
 #!/usr/bin/env python3
 """Extended end-to-end tests on older tracker files (2018-2021)."""
 
+# logging is imported so it can be disabled below, keeping script output clean
+import logging
+import sys
 from pathlib import Path
-from a4d.extract.patient import read_all_patient_sheets
+
 from a4d.clean.patient import clean_patient_data
 from a4d.errors import ErrorCollector
-import sys
-
-# Disable logging for clean output
-import logging
+from a4d.extract.patient import read_all_patient_sheets
 
 logging.disable(logging.CRITICAL)
 
 test_files = [
     (
         "2021_Siriraj_Thailand",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/SRJ/2021_Siriraj Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/SRJ/2021_Siriraj Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
     (
         "2021_UdonThani_Thailand",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/UTH/2021_Udon Thani Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Thailand/UTH/2021_Udon Thani Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
     (
         "2020_VNC_Vietnam",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Vietnam/VNC/2020_Vietnam National Children's Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Vietnam/VNC/2020_Vietnam National Children's Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
     (
         "2019_Penang_Malaysia",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2019_Penang General Hospital A4D Tracker_DC.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2019_Penang General Hospital A4D Tracker_DC.xlsx"  # noqa: E501
         ),
     ),
     (
         "2019_Mandalay_Myanmar",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/MCH/2019_Mandalay Children's Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/MCH/2019_Mandalay Children's Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
     (
         "2018_Yangon_Myanmar",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/YCH/2018_Yangon Children's Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Myanmar/YCH/2018_Yangon Children's Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
 ]
@@ -83,7 +83,8 @@
         )
 
         print(
-            f"  ✅ EXTRACTION: {len(df_raw)} rows, {len(df_raw.columns)} cols, year={year}, months={months}"
+            f"  ✅ EXTRACTION: {len(df_raw)} rows, "
+            f"{len(df_raw.columns)} cols, year={year}, months={months}"
         )
 
         # Clean
@@ -105,7 +106,8 @@
         }
 
         print(
-            f"  ✅ CLEANING: {len(df_clean)} rows, {len(df_clean.columns)} cols, {len(collector)} errors"
+            f"  ✅ CLEANING: {len(df_clean)} rows, "
+            f"{len(df_clean.columns)} cols, {len(collector)} errors"
         )
         print(
             f"     Key columns: insulin_type={stats['insulin_type']}/{len(df_clean)}, "

diff --git a/a4d-python/scripts/test_multiple_trackers.py b/a4d-python/scripts/test_multiple_trackers.py
@@ -1,40 +1,40 @@
 #!/usr/bin/env python3
 """Test extraction + cleaning on multiple trackers for end-to-end validation."""
 
+# logging is imported so it can be disabled below, keeping script output clean
+import logging
+import sys
 from pathlib import Path
-from a4d.extract.patient import read_all_patient_sheets
+
 from a4d.clean.patient import clean_patient_data
 from a4d.errors import ErrorCollector
-import sys
-
-# Disable logging for clean output
-import logging
+from a4d.extract.patient import read_all_patient_sheets
 
 logging.disable(logging.CRITICAL)
 
 test_files = [
     (
         "2024_ISDFI",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Philippines/ISD/2024_ISDFI A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Philippines/ISD/2024_ISDFI A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
     (
         "2024_Penang",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2024_Penang General Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2024_Penang General Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
     (
         "2023_Sibu",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/SBU/2023_Sibu Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/SBU/2023_Sibu Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
     (
         "2022_Penang",
         Path(
-            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2022_Penang General Hospital A4D Tracker.xlsx"
+            "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/Malaysia/PNG/2022_Penang General Hospital A4D Tracker.xlsx"  # noqa: E501
         ),
     ),
 ]
@@ -72,7 +72,8 @@
         )
 
         print(
-            f"  ✅ EXTRACTION: {len(df_raw)} rows, {len(df_raw.columns)} cols, year={year}, months={months}"
+            f"  ✅ EXTRACTION: {len(df_raw)} rows, "
+            f"{len(df_raw.columns)} cols, year={year}, months={months}"
         )
 
         # Clean

diff --git a/a4d-python/scripts/verify_fixes.py b/a4d-python/scripts/verify_fixes.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 """Verify that the Python fixes are working correctly by analyzing the output."""
 
-import polars as pl
 from pathlib import Path
 
+import polars as pl
+
 
 def verify_python_output():
     """Verify Python output has correct types and column ordering."""

diff --git a/a4d-python/src/a4d/clean/converters.py b/a4d-python/src/a4d/clean/converters.py
@@ -165,9 +165,11 @@ def parse_date_column(
     df = df.with_columns(pl.col(column).alias(f"_orig_{column}"))
 
     # Apply parse_date_flexible to each value
-    # NOTE: Using list-based approach instead of map_elements() because map_elements()
-    # with return_dtype=pl.Date fails when ALL values are None (all-NA columns like hospitalisation_date).
-    # Explicit Series creation with dtype=pl.Date works because it doesn't require non-null values.
+    # NOTE: Using list-based approach instead of map_elements() because
+    # map_elements() with return_dtype=pl.Date fails when ALL values are None
+    # (all-NA columns like hospitalisation_date).
+    # Explicit Series creation with dtype=pl.Date works because it doesn't
+    # require non-null values.
     column_values = df[column].cast(pl.Utf8).to_list()
     parsed_dates = [
         parse_date_flexible(val, error_val=settings.error_val_date) for val in column_values