From 38595b4133dd6c5c47aa81898bff7c96a1d48e5f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:30:19 +0000 Subject: [PATCH 1/3] Initial plan From f5db33c40457e986d2441d405679b1754c402862 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:35:15 +0000 Subject: [PATCH 2/3] Fix patient pipeline: stale column names, Python version, product ingestion skip Co-authored-by: pmayd <9614291+pmayd@users.noreply.github.com> --- ...ript3_create_table_patient_data_changes_only.R | 3 +-- scripts/R/run_pipeline.R | 15 ++++++++------- scripts/python/pyproject.toml | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/R/script3_create_table_patient_data_changes_only.R b/R/script3_create_table_patient_data_changes_only.R index 92a2dcc..99300a3 100644 --- a/R/script3_create_table_patient_data_changes_only.R +++ b/R/script3_create_table_patient_data_changes_only.R @@ -43,12 +43,11 @@ create_table_longitudinal_data <- "patient_id", "sheet_name", "status", - "support_from_a4d", + "support_level", "testing_frequency", "tracker_date", "tracker_month", "tracker_year", - "updated_2022_date", "weight" ) diff --git a/scripts/R/run_pipeline.R b/scripts/R/run_pipeline.R index 5c161da..cb879c5 100644 --- a/scripts/R/run_pipeline.R +++ b/scripts/R/run_pipeline.R @@ -109,13 +109,14 @@ ingest_data( table = "patient_data_hba1c", source = file.path(table_dir, "longitudinal_data_hba1c.parquet") ) -ingest_data( - project_id = config$project_id, - cluster_fields = "clinic_id,product_released_to,product_table_year,product_table_month", - dataset = config$dataset, - table = "product_data", - source = file.path(table_dir, "product_data.parquet") -) +# NOTE: product data ingestion is deliberately skipped until the product pipeline is finalized +# ingest_data( +# project_id = config$project_id, +# cluster_fields = "clinic_id,product_released_to,product_table_year,product_table_month", +# dataset = config$dataset, +# table = "product_data", +# source = file.path(table_dir, "product_data.parquet") +# ) ingest_data( project_id = config$project_id, cluster_fields = "clinic_id", diff --git a/scripts/python/pyproject.toml b/scripts/python/pyproject.toml index a21275c..67b264f 100644 --- a/scripts/python/pyproject.toml +++ b/scripts/python/pyproject.toml @@ -7,7 +7,7 @@ readme = "README.md" package-mode = false [tool.poetry.dependencies] -python = ">=3.10,<3.13" +python = ">=3.10,<3.14" pandas = "^2.2.1" openpyxl = "^3.1.5" click = "^8.1.7" From 72c7be9f01666838f3516cc9b87a67d34ae22733 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:43:04 +0000 Subject: [PATCH 3/3] Remove longitudinal data table creation and all related code Co-authored-by: pmayd <9614291+pmayd@users.noreply.github.com> --- ...3_create_table_patient_data_changes_only.R | 89 ------------------- scripts/R/run_pipeline.R | 7 -- scripts/R/run_script_3_create_tables.R | 42 --------- 3 files changed, 138 deletions(-) delete mode 100644 R/script3_create_table_patient_data_changes_only.R diff --git a/R/script3_create_table_patient_data_changes_only.R b/R/script3_create_table_patient_data_changes_only.R deleted file mode 100644 index 99300a3..0000000 --- a/R/script3_create_table_patient_data_changes_only.R +++ /dev/null @@ -1,89 +0,0 @@ -#' @title Create CSV with longitudinal patient data for a single variable. -#' -#' @description -#' Read in all cleaned patient data CSV and create a single data.frame. -#' Group this data by id and take only the months when there is a change in the medical data. -#' -#' -#' @param patient_data_files list of CSV files with cleaned patient data from step 2. -#' @param input_root root directory of the input CSV files. -#' @param output_root root directory of the output folder. -#' @param variable name of the column that should be exported. -#' @param name name used to create the export file name. -create_table_longitudinal_data <- - function(patient_data_files, - input_root, - output_root, - variable, - name) { - dynamic_patient_columns <- - c( - "blood_pressure_dias_mmhg", - "blood_pressure_sys_mmhg", - "bmi", - "bmi_date", - "clinic_id", - "fbg_updated_date", - "fbg_updated_mg", - "fbg_updated_mmol", - "file_name", - "hba1c_updated", - "hba1c_updated_exceeds", - "hba1c_updated_date", - "height", - "hospitalisation_cause", - "hospitalisation_date", - "insulin_regimen", - "insulin_type", - "insulin_subtype", - "last_clinic_visit_date", - "last_remote_followup_date", - "observations", - "observations_category", - "patient_id", - "sheet_name", - "status", - "support_level", - "testing_frequency", - "tracker_date", - "tracker_month", - "tracker_year", - "weight" - ) - - patient_data <- read_cleaned_patient_data(input_root, patient_data_files) %>% - dplyr::select(tidyselect::all_of(dynamic_patient_columns)) - - # get latest static patient data overall - variable_lag <- paste0(variable, "_lag") - longitudinal_data <- patient_data %>% - tidyr::drop_na(!!variable) %>% - dplyr::filter(get(variable) != ERROR_VAL_NUMERIC) %>% - dplyr::group_by(patient_id) %>% - dplyr::arrange(tracker_year, tracker_month) %>% - dplyr::filter( - get(variable) != tidyr::replace_na( - dplyr::lag(get(variable), default = NULL), - ERROR_VAL_NUMERIC - ) - ) %>% - dplyr::ungroup() %>% - dplyr::arrange(patient_id, tracker_year, tracker_month) - - logInfo( - log_to_json( - message = "longitudinal_data dim: {values['dim']}.", - values = list(dim = dim(longitudinal_data)), - script = "script3", - file = "create_table_patient_data_changes_only.log", - functionName = "create_table_longitudinal_data" - ) - ) - - export_data_as_parquet( - data = longitudinal_data, - filename = paste0("longitudinal_data_", name), - output_root = output_root, - suffix = "" - ) - } diff --git a/scripts/R/run_pipeline.R b/scripts/R/run_pipeline.R index cb879c5..d81a906 100644 --- a/scripts/R/run_pipeline.R +++ b/scripts/R/run_pipeline.R @@ -102,13 +102,6 @@ ingest_data( table = "patient_data_static", source = file.path(table_dir, "patient_data_static.parquet") ) -ingest_data( - project_id = config$project_id, - cluster_fields = "clinic_id,patient_id,tracker_date", - dataset = config$dataset, - table = "patient_data_hba1c", - source = file.path(table_dir, "longitudinal_data_hba1c.parquet") -) # NOTE: product data ingestion is deliberately skipped until the product pipeline is finalized # ingest_data( # project_id = config$project_id, diff --git a/scripts/R/run_script_3_create_tables.R b/scripts/R/run_script_3_create_tables.R index 8a27014..9b86568 100644 --- a/scripts/R/run_script_3_create_tables.R +++ b/scripts/R/run_script_3_create_tables.R @@ -100,48 +100,6 @@ main <- function() { output_root = paths$output_root ) - logfile <- "table_longitudinal_data_hba1c" - with_file_logger(logfile, - { - tryCatch( - { - create_table_longitudinal_data( - patient_data_files, - file.path(paths$output_root, "patient_data_cleaned"), - paths$tables, - "hba1c_updated", - "hba1c" - ) - }, - error = function(e) { - logError( - log_to_json( - "Could not create table for longitudinal patient data. Error = {values['e']}.", - values = list(e = e$message), - script = "script3", - file = "run_script_3_create_tables.R", - errorCode = "critical_abort", - functionName = "create_table_longitudinal_data" - ) - ) - }, - warning = function(w) { - logWarn( - log_to_json( - "Could not create table for longitudinal patient data. Warning = {values['w']}.", - values = list(w = w$message), - script = "script3", - file = "run_script_3_create_tables.R", - warningCode = "critical_abort", - functionName = "create_table_longitudinal_data" - ) - ) - } - ) - }, - output_root = paths$output_root - ) - logfile <- "table_patient_data_annual" with_file_logger(logfile, {