diff --git a/analysis/0_utility_functions.R b/analysis/0_utility_functions.R new file mode 100644 index 0000000..5502b9d --- /dev/null +++ b/analysis/0_utility_functions.R @@ -0,0 +1,282 @@ +# ################################################# +# +# Purpose: +# define useful functions used in the codebase +# this script should be sourced (using `source(here("analysis", "0_utility_functions.R"))`) at the start of each R script +# +################################################### + +# --------------------------------------------------------- +# Study period +# --------------------------------------------------------- + +# study_start_date <- as.Date("2009-01-01") +# study_end_date <- as.Date("2026-03-06") + +# --------------------------------------------------------- +# Rounding function (sdc) +# --------------------------------------------------------- +# - values <= 7 suppressed (NA) +rounding <- function(vars) { + case_when( + vars == 0 ~ 0, + vars > 7 ~ round(vars / 5) * 5 + ) +} + + +# --------------------------------------------------------- +# Implausible death date flag +# --------------------------------------------------------- +# TODO: parameterize start and end dates ? +# Flags implausible death dates +# Returns a categorical status +cat_implausible_death_date <- function(death_date, date_of_birth) { + case_when( + is.na(death_date) ~ "missing", + death_date < date_of_birth ~ "death_before_birth", + death_date < as.Date("2009-01-01") ~ "before_study", + death_date > as.Date("2026-03-06") ~ "after_study", + TRUE ~ "ok" + )|> + factor( + levels = c( + "ok", + "missing", + "death_before_birth", + "before_study", + "after_study" + ), + ordered = TRUE + ) +} + +# --------------------------------------------------------- +# Classify source of death +# --------------------------------------------------------- +# Categories: ONS_only, TPP_only, Both +cat_death_source <- function(ons_death_date, tpp_death_info) { + case_when( + !is.na(ons_death_date) & !is.na(tpp_death_info) ~ "Both", + !is.na(ons_death_date) & is.na(tpp_death_info) ~ "ONS_only", + is.na(ons_death_date) & !is.na(tpp_death_info) ~ "TPP_only", + TRUE ~ NA_character_ + ) |> + factor( + levels = c("ONS_only", "TPP_only", "Both") + ) +} + +# --------------------------------------------------------- +# Categorise timing: registration start and death +# --------------------------------------------------------- +# Input: days = death_date - registration_start_date +cat_last_reg_start_to_death <- function(days) { + case_when( + is.na(days) ~ "missing_registration_start", # should be no cases + days < 0 ~ "death_before_registration_start", + days == 0 ~ "same_day_as_registration_start", + days > 0 ~ "death_after_registration_start" + ) |> + factor( + levels = c( + "death_before_registration_start", + "same_day_as_registration_start", + "death_after_registration_start", + "missing_registration_start" + ), + ordered = TRUE + ) +} + + +# --------------------------------------------------------- +# Categorise timing: registration end - death +# --------------------------------------------------------- +# Input: days = registration_end_date - death_date +# Interpretation: +# negative deregistration before death +# positive deregistration after death +# 0 same day +# NA missing deregistration date +cat_last_reg_end_minus_death <- function(days) { + case_when( + is.na(days) ~ "missing_registration_end", + days <= -31 ~ "-31+", + days >= -30 & days <= -8 ~ "-30 to -8", + days >= -7 & days <= -1 ~ "-7 to -1", + days == 0 ~ "0", + days >= 1 & days <= 7 ~ "1 to 7", + days >= 8 & days <= 30 ~ "8 to 30", + days >= 31 ~ "31+" + ) |> + factor( + levels = c("-31+", "-8 to -30", "-1 to -7", "0", "1 to 7", "8 to 30", "31+", + "missing_registration_end"), + ordered = TRUE + ) +} + + + +# --------------------------------------------------------- +# Registration status at death +# --------------------------------------------------------- +# Temporal relationship between: +# - death date +# - registration start +# - registration end +# +# Includes grace period (default = 30 days) +cat_registration_status <- function( + death_date, + reg_start, + reg_end, + grace_days = 30 +) { + case_when( + is.na(death_date) ~ "no_death_date", + is.na(reg_start) ~ "no_registration", + + death_date < reg_start ~ "death_before_last_registration_start", + + is.na(reg_end) & death_date >= reg_start ~ "death_during_registration_open_end", + death_date <= reg_end ~ "death_during_registration", + + death_date > reg_end & death_date <= reg_end + grace_days ~ "death_after_deregistration_within_grace", + death_date > reg_end + grace_days ~ "death_after_deregistration_outside_grace" + ) |> + factor( + levels = c( + "death_before_last_registration_start", + "death_during_registration", + "death_during_registration_open_end", + "death_after_deregistration_within_grace", + "death_after_deregistration_outside_grace", + "no_registration", + "no_death_date" + ), + ordered = TRUE + ) +} + + +#------------------------------------ +# TPP death date vs SNOMED codes +#------------------------------------ +# IMP: individuals with a death SNOMED code only (and no ONS or TPP death date) are not +# currently included in the dataset definition. Coded deaths are only added as a variable among +# those already in the dataset (with ONS or TPP death date). +# +# - year_pref_ONS_TPP_plus_codes - prioritises ONS date, then TPP date, then coded date +# - TPP_date_or_coded - classifies TPP death information availability (date, SNOMED code, both) +# - death_source_TPP_date_or_coded - clasif source including any TPP death evidence (dated and/or coded) + +add_tpp_date_or_coded_vars <- function(data) { + data |> + mutate( + # Preferred death year: ONS > TPP dated > TPP coded + death_date_ref_year_w_tpp_codes = case_when( + !is.na(ons_death_date) ~ year(ons_death_date), + !is.na(tpp_death_date) ~ year(tpp_death_date), + !is.na(tpp_coded_death_date) ~ year(tpp_coded_death_date), + TRUE ~ NA_real_ + ), + + # TPP death information availability + tpp_date_or_coded = case_when( + !is.na(tpp_death_date) & is.na(tpp_coded_death_date) ~ "tpp_dated_only", + is.na(tpp_death_date) & !is.na(tpp_coded_death_date) ~ "tpp_coded_only", + !is.na(tpp_death_date) & !is.na(tpp_coded_death_date) ~ "tpp_dated_and_coded", + is.na(tpp_death_date) & is.na(tpp_coded_death_date) ~ NA_character_ + ) |> + factor( + levels = c("tpp_dated_only","tpp_coded_only","tpp_dated_and_coded") + ), + + # Sensitivity source: any TPP death evidence (dated or coded) + death_source_tpp_date_or_coded = cat_death_source( + ons_death_date, + coalesce(tpp_death_date, tpp_coded_death_date) + ) + ) +} + + +# --------------------------------------------------------- +# Add date-of-death difference variables (TPP vs ONS) +# --------------------------------------------------------- +# - diff_dod: difference in days (TPP - ONS) +# > positive values: TPP death date after ONS +# > negative values: TPP death date before ONS +# - dod_diff_groups: categorised difference in days (ordered factor) for tables/plots + +add_dod_diff_vars <- function(data) { + + data |> + mutate( + # difference in days between TPP and ONS death dates + diff_dod = as.numeric(tpp_death_date - ons_death_date), + + # grouped differences + dod_diff_groups = case_when( + is.na(diff_dod) ~ NA_character_, + + diff_dod == 0 ~ "0", + + diff_dod >= 1 & diff_dod <= 7 ~ "1 to 7", + diff_dod >= 8 & diff_dod <= 30 ~ "8 to 30", + diff_dod >= 31 ~ "31+", + + diff_dod <= -1 & diff_dod >= -7 ~ "-1 to -7", + diff_dod <= -8 & diff_dod >= -30 ~ "-8 to -30", + diff_dod <= -31 ~ "-31+" + ), + + # convert to ordered factor for consistent display + dod_diff_groups = factor( + dod_diff_groups, + levels = c("-31+", "-8 to -30", "-1 to -7", "0", "1 to 7", "8 to 30", "31+"), + ordered = TRUE + ) + ) +} + +# --------------------------------------------------------- +# Demographic vars +# --------------------------------------------------------- +add_demographic_vars <- function(data) { + data |> + mutate( + age_band = cut( + age, + breaks = c(-Inf, 45, 55, 65, 75, 85, Inf), + labels = c("0-44", "45-54", "55-64", "65-74", "75-84", "85+"), + right = FALSE + ), + + imd_quintile = cut( + imd, + breaks = c(0, 32844 * (1:5) / 5), + labels = c("1 (most deprived)", "2", "3", "4", "5 (least deprived)"), + include.lowest = TRUE, + right = FALSE + ), + + rural_urban = factor( + rural_urban, + levels = 1:8, + labels = c( + "Urban major conurbation", + "Urban minor conurbation", + "Urban city and town", + "Urban city and town in a sparse setting", + "Rural town and fringe", + "Rural town and fringe in a sparse setting", + "Rural village and dispersed", + "Rural village and dispersed in a sparse setting" + ), + ordered = TRUE + ) + ) +} \ No newline at end of file diff --git a/analysis/1_derive_key_variables.R b/analysis/1_derive_key_variables.R new file mode 100644 index 0000000..e4ba004 --- /dev/null +++ b/analysis/1_derive_key_variables.R @@ -0,0 +1,141 @@ +################################################### +# Author: Martina Pesce / Andrea Schaffer +# Bennett Institute for Applied Data Science +# University of Oxford, 2025 +# +# This script describe the relationship between death date and GP registration history +# +# Steps: +# 1) classify deaths by source: ONS only / TPP only / both +# 2) define whether death occurred during registration +# - main analysis: includes 30-day grace period after registration end +# - sensitivity analysis: no grace period +# 3) among registered deaths, describe timing between: +# - registration start and death +# - death and registration end +# +# Outputs: +################################################### + +# Libraries ---- + +library(tidyverse) +library(lubridate) +library(here) +library(fs) + +# Create output directories ---- + +output_dir_hs <- here("output", "highly_sensitive") +dir_create(output_dir_hs) + +output_dir_analysis_tables <- here("output", "analysis_tables") +dir_create(output_dir_analysis_tables) + +# import custom functions ---- +source(here("analysis", "0_utility_functions.R")) + +# Import data ---- + +dataset_death_raw <- read_csv( + here("output", "highly_sensitive", "dataset_death_TPP_ONS.csv.gz") +) |> + mutate( + tpp_death_date = as.Date(tpp_death_date), + ons_death_date = as.Date(ons_death_date), + date_of_birth = as.Date(date_of_birth), + last_registration_start_date = as.Date(last_registration_start_date), + last_registration_end_date = as.Date(last_registration_end_date), + region = as.factor(region), + ethnicity = as.factor(ethnicity), + sex = as.factor(sex) + ) + + +# Derive core variables ---- +death_registration_processed <- dataset_death_raw |> + mutate( + # ONS or TPP death date + has_ons_date_death = !is.na(ons_death_date), + has_tpp_death_date = !is.na(tpp_death_date), + flag_any_date_death = has_ons_date_death | has_tpp_death_date, + + # Plausibility of death dates + cat_ons_death_date = cat_implausible_death_date(ons_death_date, date_of_birth), + cat_tpp_death_date = cat_implausible_death_date(tpp_death_date, date_of_birth), + flag_any_date_death_implausible = + !(cat_ons_death_date %in% c("ok", "missing")) | + !(cat_tpp_death_date %in% c("ok", "missing")), + + # Death source + death_source = cat_death_source(ons_death_date, tpp_death_date), + + # Use ONS death date where available, otherwise TPP + death_date_ref = as.Date(if_else(!is.na(ons_death_date), ons_death_date, tpp_death_date)), + death_date_ref_year = year(death_date_ref), + + # Registration timing + death_minus_reg_start = as.numeric(death_date_ref - last_registration_start_date), + reg_end_minus_death = as.numeric(last_registration_end_date - death_date_ref), + + registration_status = cat_registration_status( + death_date = death_date_ref, + reg_start = last_registration_start_date, + reg_end = last_registration_end_date, + grace_days = 30 + ), + + # Registered flags + flag_is_registered = registration_status %in% c( + "death_during_registration", + "death_during_registration_open_end" + ), + is_registered_within_grace= registration_status %in% c( + "death_during_registration", + "death_during_registration_open_end", + "death_after_deregistration_within_grace" + ), + + # Timing groups + reg_start_timing_group = cat_last_reg_start_to_death(death_minus_reg_start), + reg_end_timing_group = cat_last_reg_end_minus_death(reg_end_minus_death) + ) |> + add_tpp_date_or_coded_vars() |> + add_dod_diff_vars() |> + add_demographic_vars() + + +write_csv(death_registration_processed, here(output_dir_hs, "death_registration_processed.csv.gz")) + + +# print details about dataset +capture.output( + skimr::skim_without_charts(death_registration_processed), + file = fs::path(output_dir_analysis_tables, "death_registration_processed_skim.txt"), + split = FALSE +) + +# raw table source by year (no filter) +table_source_raw <- death_registration_processed |> + + # Keep individuals with any recorded death date + # (exclude patients with death only recorded in TPP codes) + filter(flag_any_date_death == TRUE) |> + + group_by(death_date_ref_year, death_source) |> + summarise( + total = n(), + .groups = "drop" + )|> + group_by(death_date_ref_year) |> + mutate( + total_year = rounding(sum(total, na.rm = TRUE)), + total = rounding(total), + perc = total / total_year * 100 + ) |> + ungroup() + +write_csv(table_source_raw, here(output_dir_analysis_tables, "table_source_raw.csv")) + +# end------ + diff --git a/analysis/1_summary_stats.R b/analysis/1_summary_stats.R deleted file mode 100644 index 96f850f..0000000 --- a/analysis/1_summary_stats.R +++ /dev/null @@ -1,162 +0,0 @@ -# Preliminaries ---- - -# Import libraries -library("tidyverse") -library("dtplyr") -library("lubridate") -library("glue") -library("here") -library("skimr") - -## Create output directory -output_dir <- here("output", "summary_stats") -fs::dir_create(output_dir) - -# Import processed data ---- -dataset_death_TPP_ONS <- read_csv("output/highly_sensitive/dataset_death_TPP_ONS.csv.gz") %>% - mutate( - region = as.factor(region), - age_band = as.factor(age_band), - rural_urban = as.factor(rural_urban), - ons_death_place = as.factor(ons_death_place), - region = as.factor(region), - IMD_q10 = as.factor(IMD_q10), - ethnicity = as.factor(ethnicity), - min_DoD = pmin(TPP_death_date, ons_death_date, na.rm = TRUE) - ) %>% - mutate( - death_dereg_diff = case_when( - !is.na(last_registration_end_date) ~ as.Date(last_registration_end_date) - min_DoD, - TRUE ~ as.difftime(NA_real_, units = "days" - ) - ) - ) - -dataset_death_TPP_ONS_plus_inc_crit <- dataset_death_TPP_ONS %>% - filter( - has_registration == TRUE & # was registered at the beginning of the year the person died - any_death_during_study == TRUE # died between "2009-01-01" - "2025-06-06" + (deregistration date + 30 days) is after one date of death - ) - -dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death <- dataset_death_TPP_ONS_plus_inc_crit %>% - filter( - death_dereg_diff >= 0) - - - -# Summary -## Gral -summary_dataset_death_TPP_ONS <- skim(dataset_death_TPP_ONS) - -summary_dataset_death_TPP_ONS_plus_inc_crit <- skim(dataset_death_TPP_ONS_plus_inc_crit) - -summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death <- skim(dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death) - -write.csv(summary_dataset_death_TPP_ONS, file = here::here("output", "summary_stats","summary_stats_dataset_death_TPP_ONS.csv"), - row.names = FALSE) - -write.csv(summary_dataset_death_TPP_ONS_plus_inc_crit, file = here::here("output", "summary_stats","summary_dataset_death_TPP_ONS_plus_inc_crit.csv"), - row.names = FALSE) - -write.csv(summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death, file = here::here("output", "summary_stats","summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death.csv"), - row.names = FALSE) - -## Categorical variables - -table_freq <- dataset_death_TPP_ONS %>% - pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>% - group_by(year(min_DoD), subgroup, category) %>% - summarise( - n=n() - ) - -table_freq_plus_inc_crit <- dataset_death_TPP_ONS_plus_inc_crit %>% - pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>% - group_by(year(min_DoD), subgroup, category) %>% - summarise( - n=n() - ) - -table_freq_plus_inc_crit_0_dereg_death <- dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death %>% - pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>% - group_by(year(min_DoD), subgroup, category) %>% - summarise( - n=n() - ) - -write.csv(table_freq, file = here::here("output", "summary_stats","table_freq_DoD.csv"), - row.names = FALSE) - -write.csv(table_freq_plus_inc_crit, file = here::here("output", "summary_stats","table_freq_plus_inc_crit.csv"), - row.names = FALSE) - -write.csv(table_freq_plus_inc_crit_0_dereg_death, file = here::here("output", "summary_stats","table_freq_plus_inc_crit_0_dereg_death.csv"), - row.names = FALSE) - - -# Impossible dates of death -impossible_dod <- dataset_death_TPP_ONS %>% - mutate( - ons_DoD_impossible = case_when( - ons_death_date < date_of_birth ~ "death_before_birth", - ons_death_date < as.Date("2009-01-01") ~ "before_study", - ons_death_date > as.Date("2025-06-06") ~ "after_study", - is.na(ons_death_date) ~ "is empty", - TRUE ~ "ok" - ), - TPP_DoD_impossible = case_when( - TPP_death_date < date_of_birth ~ "death_before_birth", - TPP_death_date < as.Date("2009-01-01") ~ "before_study", - TPP_death_date > as.Date("2025-06-06") ~ "after_study", - is.na(TPP_death_date) ~ "is empty", - TRUE ~ "ok" - ), - year_month_min_dod = format(min_DoD, "%Y-%m"), - year_min_dod = format(min_DoD, "%Y") - ) %>% - pivot_longer( - cols = c(ons_DoD_impossible, TPP_DoD_impossible), - names_to = "source", - values_to = "DoD_impossible" - ) %>% - mutate( - source = case_when( - source == "ons_DoD_impossible" ~ "ONS", - source == "TPP_DoD_impossible" ~ "TPP", - TRUE ~ source - ) - ) - -# by month -impossible_dod_month <- impossible_dod %>% - group_by(year_month_min_dod, source, DoD_impossible) %>% - summarise( - n = n(), - .groups = "drop" - ) - -write.csv( - impossible_dod_month, - here::here("output", "summary_stats", "impossible_dod_month.csv"), - row.names = FALSE -) - - -# by year -impossible_dod_year <- impossible_dod %>% -group_by(year_min_dod, source, DoD_impossible) %>% - summarise( - n = n(), - .groups = "drop" - ) - -write.csv( - impossible_dod_year, - here::here("output", "summary_stats", "impossible_dod_year.csv"), - row.names = FALSE -) - - - - - diff --git a/analysis/2_implausible_death_dates.R b/analysis/2_implausible_death_dates.R new file mode 100644 index 0000000..d20d9d0 --- /dev/null +++ b/analysis/2_implausible_death_dates.R @@ -0,0 +1,90 @@ +################################################### +# Author: Martina Pesce / Andrea Schaffer +# Bennett Institute for Applied Data Science +# University of Oxford, 2025 +# +# Aim: +# Identify implausible death dates separately for ONS and TPP, +# including: before date of birth & outside study period +# +# Inclusion criteria: +# - valid age +# - non-disclosive sex +# - at least one recorded death date (ONS and/or TPP) +# +################################################### + +# Libraries ---- + +library(tidyverse) +library(here) +library(fs) + +# Create output directory ---- + +output_dir_analysis_tables <- here("output", "analysis_tables") +dir_create(output_dir_analysis_tables) + +# Import custom functions ---- +source(here("analysis", "0_utility_functions.R")) + +# Import data ---- + +death_registration_processed <- read_csv( + here("output", "highly_sensitive", "death_registration_processed.csv.gz") +) + +#-------------------------------------------------------- +# Implausible death dates by source and year +# +# Filter: Patients with at least one death date (ONS or TPP) +# +# Output: +# Counts of plausibility categories by: +# - year (reference death year) +# - source (ONS vs TPP) +# - plausibility category +#--------------------------------------------------------- + +death_implausible_source <- death_registration_processed |> + + # Keep individuals with any recorded death date + # (exclude patients with death only recorded in TPP codes) + filter(flag_any_date_death == TRUE) |> + + select( + patient_id, + death_date_ref_year, + cat_ons_death_date, + cat_tpp_death_date + ) |> + + pivot_longer( + cols = c(cat_ons_death_date, cat_tpp_death_date), + names_to = "source", + values_to = "cat_death_date_plausi" + ) |> + + mutate( + source = case_when( + source == "cat_ons_death_date" ~ "ons", + source == "cat_tpp_death_date" ~ "tpp" + ) + ) |> + + # Aggregate counts by year, source, and plausibility category + group_by(death_date_ref_year, source, cat_death_date_plausi) |> + + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) |> + + arrange(death_date_ref_year, source, cat_death_date_plausi) + +# Export results ---- + +write_csv( + death_implausible_source, + here(output_dir_analysis_tables, "death_implausible_source.csv") +) \ No newline at end of file diff --git a/analysis/3_registration_at_death.R b/analysis/3_registration_at_death.R new file mode 100644 index 0000000..145ba48 --- /dev/null +++ b/analysis/3_registration_at_death.R @@ -0,0 +1,95 @@ +################################################### +# Author: Martina Pesce / Andrea Schaffer +# Bennett Institute for Applied Data Science +# University of Oxford, 2025 +# +# 2) Registration at time of death +# +# Describe registration status at the reference date of death +# and the timing of registration start/end relative to death, +# by death source (ONS only / TPP only / Both) and year. +# +# Inclusion criteria: +# - valid age +# - non-disclosive sex +# - death recorded in ONS and/or TPP +# +# Exclusion criteria: +# - implausible death date in either source +# +# Reference date of death: ONS death date, where available; otherwise TPP death date +################################################### + +# Libraries ---- +library(tidyverse) +library(here) +library(fs) + +# Create output directory ---- +output_dir_analysis_tables <- here("output", "analysis_tables") +dir_create(output_dir_analysis_tables) + +# Import utility functions ---- +source(here("analysis", "0_utility_functions.R")) + +# Import data ---- +death_registration_processed <- read_csv( + here("output", "highly_sensitive", "death_registration_processed.csv.gz") +) + +# Restrict to patients with any death date and no implausible death dates ---- +death_registration_clean <- death_registration_processed |> + filter( + flag_any_date_death == TRUE, + flag_any_date_death_implausible == FALSE + ) + +# Registration status at death, by year and death source ---- +registration_status_source <- death_registration_clean |> + group_by(death_date_ref_year, death_source, registration_status) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) |> + arrange(death_date_ref_year, death_source, registration_status) + +write_csv( + registration_status_source, + here(output_dir_analysis_tables, "registration_status_source.csv") +) + +# Timing of last registration start relative to death, by year and death source ---- +# "death_before_registration_start" indicates last registration started after death +reg_start_timing_source <- death_registration_clean |> + group_by(death_date_ref_year, death_source, reg_start_timing_group) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) |> + arrange(death_date_ref_year, death_source, reg_start_timing_group) + +write_csv( + reg_start_timing_source, + here(output_dir_analysis_tables, "reg_start_timing_source.csv") +) + +# Timing of last registration end relative to death, by year and death source ---- +# Exclude people whose registration started after death +reg_end_timing_source <- death_registration_clean |> + filter( + reg_start_timing_group %in% c( + "same_day_as_registration_start", + "death_after_registration_start" + ) + ) |> + group_by(death_date_ref_year, death_source, reg_end_timing_group) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) |> + arrange(death_date_ref_year, death_source, reg_end_timing_group) + +write_csv( + reg_end_timing_source, + here(output_dir_analysis_tables, "reg_end_timing_source.csv") +) \ No newline at end of file diff --git a/analysis/4_death_source_comparison.R b/analysis/4_death_source_comparison.R new file mode 100644 index 0000000..e0308ba --- /dev/null +++ b/analysis/4_death_source_comparison.R @@ -0,0 +1,196 @@ +################################################### +# Author: Martina Pesce / Andrea Schaffer +# Bennett Institute for Applied Data Science +# University of Oxford, 2025 +# +# 3) Death source comparison +# +# +# Create descriptive tables of death source classifications: +# - by reference year and death source +# - overall and by subgroup +# - for deaths occurring in 2025 onwards, by month +# +# Analysis: +# 1. Counts by reference year and death source +# (overall and by subgroup: age band, sex, ethnicity, +# IMD quintile, rural/urban classification, and region) +# 2. Monthly counts by death source for deaths in 2025 onwards +# 3. Descriptive summaries including TPP-coded deaths +######################################################################### + +# Libraries ---- +library(tidyverse) +library(here) +library(fs) +library(lubridate) + +# Create output directory ---- +output_dir_analysis_tables <- here("output", "analysis_tables") +dir_create(output_dir_analysis_tables) + +# Import utility functions ---- +source(here("analysis", "0_utility_functions.R")) + +# Import data ---- +death_registration_processed <- read_csv( + here("output", "highly_sensitive", "death_registration_processed.csv.gz") +) + +# ================================================== +# Main analysis: dated deaths only +# ================================================== + +# Restrict to the main analysis population ---- +death_registration_analysis <- death_registration_processed |> + filter( + flag_any_date_death == TRUE, + flag_any_date_death_implausible == FALSE, + flag_is_registered == TRUE + ) + +# Overall counts by year and death source ---- +table_death_source_overall <- death_registration_analysis |> + group_by(death_date_ref_year, death_source) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) |> + mutate( + subgroup = "overall", + subgroup_value = "All" + ) + +# Counts by subgroup ---- +table_death_source_subgroups <- death_registration_analysis |> + select( + death_date_ref_year, + death_source, + age_band, + sex, + ethnicity, + imd_quintile, + rural_urban, + region + ) |> + pivot_longer( + cols = c(age_band, sex, ethnicity, imd_quintile, rural_urban, region), + names_to = "subgroup", + values_to = "subgroup_value" + ) |> + group_by(death_date_ref_year, death_source, subgroup, subgroup_value) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) + +# Combine overall and subgroup counts ---- +table_death_source <- bind_rows( + table_death_source_overall, + table_death_source_subgroups +) |> + mutate( + subgroup = factor( + subgroup, + levels = c( + "overall", + "age_band", + "sex", + "ethnicity", + "imd_quintile", + "rural_urban", + "region" + ) + ) + ) |> + group_by(death_date_ref_year, subgroup, subgroup_value) |> + mutate( + total_subgroup_value = sum(total, na.rm = TRUE), # denominator + perc = total / total_subgroup_value * 100 # proportion + ) |> + ungroup() |> + arrange(death_date_ref_year, subgroup, subgroup_value, death_source) |> + select( + death_date_ref_year, + subgroup, + subgroup_value, + death_source, + total, + total_subgroup_value, + perc + ) + +# Export main analysis table ---- +write_csv( + table_death_source, + here(output_dir_analysis_tables, "table_death_source.csv") +) + +# ================================================== +# 2025-2026 sources by month +# ================================================== +table_death_source_25_26 <- death_registration_analysis |> + filter(death_date_ref_year > 2024) |> + group_by(month = floor_date(death_date_ref, unit = "month"), death_source) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) + +# Export lasts months analysis table ---- +write_csv( + table_death_source_25_26, + here(output_dir_analysis_tables, "table_death_source_25_26.csv") +) + +# ================================================== +# Sensitivity analysis: include TPP-coded deaths +# ================================================== + +# Descriptive table: +# classify TPP evidence as date, code, or neither +# across all records with a reference year including TPP-coded deaths +# Note: tpp_date_or_coded = NA indicates no TPP code / date , only ONS +tpp_death_code_or_date <- death_registration_processed |> + group_by(death_date_ref_year_w_tpp_codes, tpp_date_or_coded) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) |> + arrange(death_date_ref_year_w_tpp_codes, tpp_date_or_coded) + +write_csv( + tpp_death_code_or_date, + here(output_dir_analysis_tables, "tpp_death_code_or_date.csv") +) + +# Restrict to the main analysis population ---- + +# Overall counts by year and death source, including TPP-coded deaths ---- +table_death_source_overall_any_tpp <- death_registration_analysis |> + group_by(death_date_ref_year_w_tpp_codes, death_source_tpp_date_or_coded) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + )|> + group_by(death_date_ref_year_w_tpp_codes) |> + mutate( + total_year = sum(total, na.rm = TRUE), + perc = total / total_year * 100 + ) |> + ungroup() |> + arrange(death_date_ref_year_w_tpp_codes, death_source_tpp_date_or_coded) |> + select( + death_date_ref_year_w_tpp_codes, + death_source_tpp_date_or_coded, + total, + total_year, + perc + ) + +# Export sensitivity analysis table ---- +write_csv( + table_death_source_overall_any_tpp, + here(output_dir_analysis_tables, "table_death_source_overall_any_tpp.csv") +) + diff --git a/analysis/5_sources_date_agreement.R b/analysis/5_sources_date_agreement.R new file mode 100644 index 0000000..837a5e6 --- /dev/null +++ b/analysis/5_sources_date_agreement.R @@ -0,0 +1,126 @@ +################################################### +# Author: Martina Pesce / Andrea Schaffer +# Bennett Institute for Applied Data Science +# University of Oxford, 2025 +# +# 4) Agreement between ONS and TPP death dates +# Assess agreement between date of death recorded in ONS and TPP +# among individuals with a death date available in both sources. +# +# Inclusion criteria: +# - valid age +# - non-disclosive sex +# - registered at death +# - death date recorded in both ONS and TPP +# +# Exclusion criteria: +# - implausible death date in either source +################################################### + +# Libraries ---- +library(tidyverse) +library(here) +library(fs) +library(lubridate) + +# Create output directory ---- +output_dir_analysis_tables <- here("output", "analysis_tables") +dir_create(output_dir_analysis_tables) + +# Import utility functions ---- +source(here("analysis", "0_utility_functions.R")) + +# Import data ---- +death_registration_processed <- read_csv( + here("output", "highly_sensitive", "death_registration_processed.csv.gz") +) + +# ================================================== +# Analysis +# ================================================== + +# Restrict to the main analysis population ---- +ons_tpp_dates_diff_analysis <- death_registration_processed |> + filter( + has_ons_date_death == TRUE, + has_tpp_death_date == TRUE, + flag_any_date_death_implausible == FALSE, + flag_is_registered == TRUE + ) + +# table dates difference bw ons - tpp +table_ons_tpp_dates_diff_overall <- ons_tpp_dates_diff_analysis |> + group_by(death_date_ref_year, dod_diff_groups) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) |> + mutate( + subgroup = "overall", + subgroup_value = "All" + ) + +# Counts by subgroup ---- +table_ons_tpp_dates_diff_subgroups <- ons_tpp_dates_diff_analysis |> + select( + death_date_ref_year, + dod_diff_groups, + age_band, + sex, + ethnicity, + imd_quintile, + rural_urban, + region + ) |> + pivot_longer( + cols = c(age_band, sex, ethnicity, imd_quintile, rural_urban, region), + names_to = "subgroup", + values_to = "subgroup_value" + ) |> + group_by(death_date_ref_year, dod_diff_groups, subgroup, subgroup_value) |> + summarise( + total = rounding(n()), # SDC rounding + .groups = "drop" + ) + +# Combine overall and subgroup counts ---- +table_ons_tpp_dates_diff <- bind_rows( + table_ons_tpp_dates_diff_overall, + table_ons_tpp_dates_diff_subgroups +) |> + mutate( + subgroup = factor( + subgroup, + levels = c( + "overall", + "age_band", + "sex", + "ethnicity", + "imd_quintile", + "rural_urban", + "region" + ) + ) + ) |> + group_by(death_date_ref_year, subgroup, subgroup_value) |> + mutate( + total_subgroup_value = sum(total, na.rm = TRUE), # denominator + perc = total / total_subgroup_value * 100 # proportion + ) |> + ungroup() |> + arrange(death_date_ref_year, subgroup, subgroup_value, dod_diff_groups) |> + select( + death_date_ref_year, + subgroup, + subgroup_value, + dod_diff_groups, + total, + total_subgroup_value, + perc + ) + +# Export main analysis table ---- +write_csv( + table_ons_tpp_dates_diff, + here(output_dir_analysis_tables, "table_ons_tpp_dates_diff.csv") +) diff --git a/analysis/6_variation_source_by_practice.R b/analysis/6_variation_source_by_practice.R new file mode 100644 index 0000000..5e0e772 --- /dev/null +++ b/analysis/6_variation_source_by_practice.R @@ -0,0 +1,101 @@ +################################################### +# Author: Martina Pesce / Andrea Schaffer +# Bennett Institute for Applied Data Science +# University of Oxford, 2025 +# +# 5) +# +# Variation across practices in the proportion +# of deaths recorded only in ONS using yearly percentiles +# +######################################################################### + +# Libraries ---- +library(tidyverse) +library(here) +library(fs) +library(lubridate) + +# Create output directory ---- +output_dir_analysis_tables <- here("output", "analysis_tables") +dir_create(output_dir_analysis_tables) + +# Import utility functions ---- +source(here("analysis", "0_utility_functions.R")) + +# Import data ---- +death_registration_processed <- read_csv( + here("output", "highly_sensitive", "death_registration_processed.csv.gz") +) + +# ================================================== +# Main analysis: dated deaths only +# ================================================== + +# Restrict to the main analysis population +death_registration_analysis <- death_registration_processed |> + filter( + flag_any_date_death == TRUE, + flag_any_date_death_implausible == FALSE, + flag_is_registered == TRUE + ) + +# ================================================== +# Practice-level % of deaths recorded only in ONS +# Includes practices with 0% ONS only +# ================================================== + +practice_ons_only <- death_registration_analysis |> + group_by(death_date_ref_year, practice) |> + summarise( + total_practice_year = n(), + ons_only_n = sum(death_source == "ONS_only", na.rm = TRUE), + perc_ons_only = 100 * ons_only_n / total_practice_year, + .groups = "drop" + ) |> + filter(total_practice_year > 9) |> + rename(year = death_date_ref_year) + +# ================================================== +# Practice-level percentiles of % ONS-only deaths +# Long format for plotting +# ================================================== + +probs <- seq(0.1, 0.9, by = 0.1) +percentiles <- probs * 100 + +n_by_year <- practice_ons_only |> + group_by(year) |> + summarise( + n_practices = rounding(n_distinct(practice)), + .groups = "drop" + ) + +table_practice_percentiles <- practice_ons_only |> + group_by(year) |> + summarise( + value = list( + round(as.numeric(quantile( + perc_ons_only, + probs = probs, + na.rm = TRUE, + type = 3)), 1) + ), + percentile = list(percentiles), + .groups = "drop" + ) |> + unnest(c(percentile, value)) |> + left_join(n_by_year, by = "year") |> + mutate( + line_group = if_else(percentile == 50, "median", "decile") + ) |> + select(year, n_practices, percentile, value, line_group) |> + arrange(year, percentile) + +table_practice_percentiles + +# Export main analysis table ---- +write_csv( + table_practice_percentiles, + here(output_dir_analysis_tables, "table_practice_percentiles.csv") +) \ No newline at end of file diff --git a/analysis/dataset_def/dataset_definition.py b/analysis/dataset_def/dataset_definition.py index 42abaec..91fb106 100644 --- a/analysis/dataset_def/dataset_definition.py +++ b/analysis/dataset_def/dataset_definition.py @@ -9,110 +9,119 @@ # University of Oxford, 2025 ################################################### -from ehrql import when, create_dataset, case, minimum_of, codelist_from_csv, days -from ehrql.tables.tpp import patients, practice_registrations, ons_deaths, addresses, clinical_events +from ehrql import create_dataset, case, when, codelist_from_csv +from ehrql.tables.tpp import ( + patients, + ons_deaths, + clinical_events, + practice_registrations, + addresses, +) -# initialise dataset dataset = create_dataset() -dataset.configure_dummy_data(population_size=1000) -#Dates -start_date = "2009-01-01" -#End date (day before last ONS deaths update) -end_date = "2025-06-06" -earliest_DoD = minimum_of( - patients.date_of_death, - ons_deaths.date, +# ----------------------------------------------------------------------------- +# Population +# ----------------------------------------------------------------------------- + +# Death definitions ------------------ + +# ONS death date recorded +has_ons_death_date = ons_deaths.date.is_not_null() + +# TPP structured death date recorded +has_tpp_death_date = patients.date_of_death.is_not_null() + +# TPP coded death recorded +tpp_coded_death_codes = codelist_from_csv( + "codelists/nhsd-primary-care-domain-refsets-death_cod.csv", + column="code", ) -year_start_DoD = earliest_DoD.to_first_of_year() +tpp_coded_death_event = ( + clinical_events + .where(clinical_events.snomedct_code.is_in(tpp_coded_death_codes)) + .sort_by(clinical_events.date) + .first_for_patient() +) -# Inclusion/exclusion criteria: +has_tpp_coded_death = tpp_coded_death_event.date.is_not_null() -## Include people alive -#was_alive = patients.date_of_death.is_after(year_start_DoD) | patients.date_of_death.is_null() | ons_deaths.date.is_after(year_start_DoD) | ons_deaths.date.is_null() +# Death recorded in any source +has_any_death = ( + has_ons_death_date + | has_tpp_death_date + | has_tpp_coded_death +) +# Reference death date: +# 1) ONS death date +# 2) TPP structured death date +# 3) TPP coded death date -## Include people registered with a TPP practice -# has_registration = practice_registrations.for_patient_on(year_start_DoD).exists_for_patient() | ((patients.date_of_birth.year == year_start_DoD.year) & practice_registrations.for_patient_on(earliest_DoD).exists_for_patient()) +ref_death_date = case( + when(has_ons_death_date).then(ons_deaths.date), + when(has_tpp_death_date).then(patients.date_of_death), + when(has_tpp_coded_death).then(tpp_coded_death_event.date), +) -## Exclude people >110 years due to risk of incorrectly recorded age; -has_possible_age= ((patients.age_on(year_start_DoD) < 110) & (patients.age_on(year_start_DoD) > 0)) | (patients.date_of_birth.year == year_start_DoD.year) -## Exclude people with non-male or female sex due to disclosure risk; -non_disclosive_sex= (patients.sex == "male") | (patients.sex == "female") +# Age at reference death date --------------- +age_at_death = patients.age_on(ref_death_date) -# TPP death -tpp_death_anytime = patients.date_of_death.is_not_null() -# ONS death -ons_death_anytime = ons_deaths.date.is_not_null() +# Keep plausible ages only. +# Also retain infants aged <1 year, whose integer age may be recorded as 0. +has_possible_age = ( + ((age_at_death >= 0) & (age_at_death < 110)) + | (patients.date_of_birth.year == ref_death_date.year) +) -# Combine both ONS/TPP -died_anytime = tpp_death_anytime | ons_death_anytime +# Restrict to male/female categories for disclosure control ----------------------- +has_non_disclosive_sex = ( + (patients.sex == "male") + | (patients.sex == "female") +) -# define dataset poppulation +# Define population dataset.define_population( - #was_alive & - #has_registration & - has_possible_age & - non_disclosive_sex & - died_anytime - ) + has_any_death + & has_possible_age + & has_non_disclosive_sex +) +# ----------------------------------------------------------------------------- # Variables +# ----------------------------------------------------------------------------- -## Death related variables - -### Date of death -dataset.TPP_death_date = patients.date_of_death +# Death ------ dataset.ons_death_date = ons_deaths.date +dataset.tpp_death_date = patients.date_of_death +dataset.tpp_coded_death_date = tpp_coded_death_event.date +# dataset.ref_death_date = ref_death_date -## Sub-analysis variables - -### Age band -dataset.date_of_birth = patients.date_of_birth -age = patients.age_on(earliest_DoD) - -dataset.age_band = case( - when((age < 45)).then("0-44"), - when((age >= 45) & (age < 65)).then("45-64"), - when((age >= 65) & (age < 75)).then("65-74"), - when((age >= 75) & (age < 85)).then("75-84"), - when(age >= 85).then("85+"), +## Last registration ----- +last_registration = ( + practice_registrations + .sort_by( + practice_registrations.start_date, + practice_registrations.end_date, ) + .last_for_patient() +) -### Practice (anonymous) -dataset.practice = practice_registrations.for_patient_on(earliest_DoD).practice_pseudo_id - -### Place of death -dataset.ons_death_place = ons_deaths.place - -### Practice region -dataset.region = practice_registrations.for_patient_on(earliest_DoD).practice_nuts1_region_name - -### Rurality -dataset.rural_urban = addresses.for_patient_on(earliest_DoD).rural_urban_classification +dataset.last_registration_start_date = last_registration.start_date +dataset.last_registration_end_date = last_registration.end_date +# Demographic -------- -#IMD -imd = addresses.for_patient_on(earliest_DoD).imd_rounded - -dataset.IMD_q10 = case( - when((imd >= 0) & (imd < int(32844 * 1 / 10))).then("1 (most deprived)"), - when(imd < int(32844 * 2 / 10)).then("2"), - when(imd < int(32844 * 3 / 10)).then("3"), - when(imd < int(32844 * 4 / 10)).then("4"), - when(imd < int(32844 * 5 / 10)).then("5"), - when(imd < int(32844 * 6 / 10)).then("6"), - when(imd < int(32844 * 7 / 10)).then("7"), - when(imd < int(32844 * 8 / 10)).then("8"), - when(imd < int(32844 * 9 / 10)).then("9"), - when(imd >= int(32844 * 9 / 10)).then("10 (least deprived)"), - otherwise="unknown" -) +### Age at ref date of death +dataset.date_of_birth = patients.date_of_birth +dataset.age = patients.age_on(ref_death_date) +# Sex +dataset.sex = patients.sex #Ethnicity ethnicity5 = codelist_from_csv( @@ -121,80 +130,104 @@ category_column="Label_6", # it's 6 because there is an additional "6 - Not stated" but this is not represented in SNOMED, instead corresponding to no ethnicity code ) -dataset.ethnicity = clinical_events.where( - clinical_events.snomedct_code.is_in(ethnicity5) - ).sort_by( - clinical_events.date - ).last_for_patient().snomedct_code.to_category(ethnicity5) - -# Sex -dataset.sex = patients.sex - - -## Last registration date per patient -last_registration = ( - practice_registrations - .sort_by( - practice_registrations.start_date, - practice_registrations.end_date - ) +dataset.ethnicity = ( + clinical_events + .where(clinical_events.snomedct_code.is_in(ethnicity5)) + .sort_by(clinical_events.date) .last_for_patient() + .snomedct_code + .to_category(ethnicity5) ) -# last registration start date -dataset.last_registration_start_date = last_registration.start_date +#IMD +dataset.imd = addresses.for_patient_on(ref_death_date).imd_rounded -# last registration end date -dataset.last_registration_end_date = last_registration.end_date +# Rurality +dataset.rural_urban = addresses.for_patient_on(ref_death_date).rural_urban_classification +# Practice region +dataset.region = practice_registrations.for_patient_on(ref_death_date).practice_nuts1_region_name -# Died during study and while registered -## TPP death during study and while registered -tpp_death_during_study = ( - patients.date_of_death.is_on_or_after(start_date) & - patients.date_of_death.is_on_or_before(end_date) & - ( - patients.date_of_death.is_on_or_before(last_registration.end_date + days(30)) | - last_registration.end_date.is_null() - ) +### Practice (anonymous) +dataset.practice = practice_registrations.for_patient_on(ref_death_date).practice_pseudo_id + +# ----------------------------------------------------------------------------- +# Dummy data +# ----------------------------------------------------------------------------- + +dataset.configure_dummy_data( + population_size=10000, + timeout=180, + additional_population_constraint=( + (dataset.tpp_death_date.is_on_or_between("2008-01-01", "2026-05-01") + | dataset.tpp_death_date.is_null() + ) & + (dataset.tpp_coded_death_date.is_on_or_between("2008-01-01", "2026-05-01") + | dataset.tpp_coded_death_date.is_null() + ) & + (dataset.ons_death_date.is_on_or_between("2015-01-01", "2026-05-01") + | dataset.ons_death_date.is_null() + ) & + (dataset.last_registration_start_date.is_on_or_between("2008-01-01", "2026-05-01") + | dataset.last_registration_start_date.is_null() + ) + ), ) -dataset.tpp_death_during_study = tpp_death_during_study +# # Died during study and while registered +# ## TPP death during study and while registered +# tpp_death_during_study = ( +# patients.date_of_death.is_on_or_after(start_date) & +# patients.date_of_death.is_on_or_before(end_date) & +# ( +# patients.date_of_death.is_on_or_before(last_registration.end_date + days(30)) | +# last_registration.end_date.is_null() +# ) +# ) -# ONS death during study and while registered, allowing for missing end_date -ons_death_during_study = ( - ons_deaths.date.is_on_or_after(start_date) & - ons_deaths.date.is_on_or_before(end_date) & - ( - ons_deaths.date.is_on_or_before(last_registration.end_date + days(30)) | - last_registration.end_date.is_null() - ) -) +# dataset.tpp_death_during_study = tpp_death_during_study -dataset.ons_death_during_study = ons_death_during_study +# # ONS death during study and while registered, allowing for missing end_date +# ons_death_during_study = ( +# ons_deaths.date.is_on_or_after(start_date) & +# ons_deaths.date.is_on_or_before(end_date) & +# ( +# ons_deaths.date.is_on_or_before(last_registration.end_date + days(30)) | +# last_registration.end_date.is_null() +# ) +# ) -# died during study any (ONS or TPP) +# dataset.ons_death_during_study = ons_death_during_study -dataset.any_death_during_study = ons_death_during_study | tpp_death_during_study +# # died during study any (ONS or TPP) +# dataset.any_death_during_study = ons_death_during_study | tpp_death_during_study -## Include people registered with a TPP practice -dataset.has_registration = practice_registrations.for_patient_on(year_start_DoD).exists_for_patient() | ((patients.date_of_birth.year == year_start_DoD.year) & practice_registrations.for_patient_on(earliest_DoD).exists_for_patient()) +# ## Include people registered with a TPP practice +# dataset.has_registration = practice_registrations.for_patient_on(year_start_DoD).exists_for_patient() | ((patients.date_of_birth.year == year_start_DoD.year) & practice_registrations.for_patient_on(earliest_DoD).exists_for_patient()) +### Practice (anonymous) +# dataset.practice = practice_registrations.for_patient_on(earliest_DoD).practice_pseudo_id +# dataset.IMD_q10 = case( +# when((imd >= 0) & (imd < int(32844 * 1 / 10))).then("1 (most deprived)"), +# when(imd < int(32844 * 2 / 10)).then("2"), +# when(imd < int(32844 * 3 / 10)).then("3"), +# when(imd < int(32844 * 4 / 10)).then("4"), +# when(imd < int(32844 * 5 / 10)).then("5"), +# when(imd < int(32844 * 6 / 10)).then("6"), +# when(imd < int(32844 * 7 / 10)).then("7"), +# when(imd < int(32844 * 8 / 10)).then("8"), +# when(imd < int(32844 * 9 / 10)).then("9"), +# when(imd >= int(32844 * 9 / 10)).then("10 (least deprived)"), +# otherwise="unknown" +# ) +# dataset.age_band = case( +# when((age < 45)).then("0-44"), +# when((age >= 45) & (age < 65)).then("45-64"), +# when((age >= 65) & (age < 75)).then("65-74"), +# when((age >= 75) & (age < 85)).then("75-84"), +# when(age >= 85).then("85+"), +# ) -# Coded date of death -death_coded = codelist_from_csv( - "codelists/nhsd-primary-care-domain-refsets-death_cod.csv", - column="code") -dataset.death_coded_date = clinical_events.where( - clinical_events.snomedct_code.is_in(death_coded) - ).sort_by(clinical_events.date).last_for_patient().date -# Dummy data configuration -dataset.configure_dummy_data(population_size=10000, timeout=180, - additional_population_constraint=( - dataset.TPP_death_date.is_on_or_between("2009-01-01","2025-05-01") & - dataset.death_coded_date.is_on_or_between("2019-01-01","2025-05-01") - ) - ) diff --git a/analysis/2_tables_ONS_TPP_reg_dereg.R b/analysis/old/2_tables_ONS_TPP_reg_dereg.R similarity index 100% rename from analysis/2_tables_ONS_TPP_reg_dereg.R rename to analysis/old/2_tables_ONS_TPP_reg_dereg.R diff --git a/analysis/3_dataset_DoD_TPP_ONS_with_grace_period.R b/analysis/old/3_dataset_DoD_TPP_ONS_with_grace_period.R similarity index 100% rename from analysis/3_dataset_DoD_TPP_ONS_with_grace_period.R rename to analysis/old/3_dataset_DoD_TPP_ONS_with_grace_period.R diff --git a/analysis/4_source_prop.R b/analysis/old/4_source_prop.R similarity index 100% rename from analysis/4_source_prop.R rename to analysis/old/4_source_prop.R diff --git a/analysis/5_DoD_diff.R b/analysis/old/5_DoD_diff.R similarity index 100% rename from analysis/5_DoD_diff.R rename to analysis/old/5_DoD_diff.R diff --git a/analysis/6_practice_by_source.R b/analysis/old/6_practice_by_source.R similarity index 100% rename from analysis/6_practice_by_source.R rename to analysis/old/6_practice_by_source.R diff --git a/analysis/7_Tables-visual.Rmd b/analysis/old/7_Tables-visual.Rmd similarity index 100% rename from analysis/7_Tables-visual.Rmd rename to analysis/old/7_Tables-visual.Rmd diff --git a/analysis/post_release/Tables_figures_paper.Rmd b/analysis/post_release/Tables_figures_paper.Rmd new file mode 100644 index 0000000..de13150 --- /dev/null +++ b/analysis/post_release/Tables_figures_paper.Rmd @@ -0,0 +1,675 @@ +--- +title: "Tables and figures death manuscript" +author: "Martina Pesce" +date: "2026-02-02" +output: html_document +editor_options: + chunk_output_type: console +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +```{r} +# Import libraries +library(tidyverse) +library(lubridate) +library(glue) +library(here) +library(patchwork) +library(scales) + +# Create output directory +output_dir <- here("output", "final_visualization") +fs::dir_create(output_dir) +``` + + +```{r} +# ===================================================== +# Import analysis tables +# ===================================================== + +# Context +table_source_raw <- read_csv( + here("output", "analysis_tables", "table_source_raw.csv") +) + +# Implausible death dates +death_implausible_source <- read_csv( + here("output", "analysis_tables", "death_implausible_source.csv") +) + +# Registration and deregistration around death +registration_status_source <- read_csv( + here("output", "analysis_tables", "registration_status_source.csv") +) + +reg_start_timing_source <- read_csv( + here("output", "analysis_tables", "reg_start_timing_source.csv") +) + +reg_end_timing_source <- read_csv( + here("output", "analysis_tables", "reg_end_timing_source.csv") +) + +# Death source comparison +table_death_source <- read_csv( + here("output", "analysis_tables", "table_death_source.csv") +) + +table_death_source_25_26 <- read_csv( + here("output", "analysis_tables", "table_death_source_25_26.csv") +) + +tpp_death_code_or_date <- read_csv( + here("output", "analysis_tables", "tpp_death_code_or_date.csv") +) + +table_death_source_overall_any_tpp <- read_csv( + here("output", "analysis_tables", "table_death_source_overall_any_tpp.csv") +) + +table_practice_percentiles <- read_csv( + here("output", "analysis_tables", "table_practice_percentiles.csv") +) + +# Date agreement between ONS and TPP-pcEHR +table_ons_tpp_dates_diff <- read_csv( + here("output", "analysis_tables", "table_ons_tpp_dates_diff.csv") +) + +``` + + +```{r} +# ===================================================== +# Overall summary numbers +# ===================================================== + +# Any dated deaths in the dataset, 2009-2025 +pcEHR_2009_2025 <- table_death_source |> + filter( + death_date_ref_year >= 2009, + death_date_ref_year <= 2025 + ) |> + summarise( + pcEHR_total = sum(total, na.rm = TRUE), + mean_pcEHR_per_year = pcEHR_total / 17 + ) + +# Deaths with an ONS date of death, 2020-2025 +pcEHR_2020_2025 <- table_death_source |> + filter( + death_date_ref_year >= 2020, + death_date_ref_year <= 2025 + ) |> + summarise( + ons_total = sum(total[death_source %in% c("ONS_only", "Both")], na.rm = TRUE), + both_total = sum(total[death_source == "Both"], na.rm = TRUE), + tpp_only_total = sum(total[death_source == "TPP_only"], na.rm = TRUE), + n_years = n_distinct(death_date_ref_year), + ons_mean = ons_total / n_years, + both_perc = both_total / ons_total * 100 + ) + +pcEHR_2009_2025 +pcEHR_2020_2025 + +``` + + +```{r} +# ===================================================== +# Table-ready summaries for 2025 +# ===================================================== + +# Registration status at death +tbl_1_2025_reg_status_by_source <- registration_status_source |> + filter(death_date_ref_year == 2025) |> + pivot_wider( + names_from = death_source, + values_from = total + ) |> + mutate( + any = rowSums(across(c(ONS_only, TPP_only, Both)), na.rm = TRUE), + ONS_all = rowSums(across(c(ONS_only, Both)), na.rm = TRUE), + TPP_all = rowSums(across(c(TPP_only, Both)), na.rm = TRUE) + ) + +# Time from death to deregistration +tbl_2_2025_time_to_dereg_by_source <- reg_end_timing_source |> + filter(death_date_ref_year == 2025) |> + pivot_wider( + names_from = death_source, + values_from = total + ) |> + mutate( + any = rowSums(across(c(ONS_only, TPP_only, Both)), na.rm = TRUE), + ONS_all = rowSums(across(c(ONS_only, Both)), na.rm = TRUE), + TPP_all = rowSums(across(c(TPP_only, Both)), na.rm = TRUE) + ) +``` + + + + +```{r} +# ===================================================== +# Figure 1: source of death over time +# ===================================================== + +fig_1_data <- table_death_source |> + filter( + subgroup == "overall", + subgroup_value == "All", + death_date_ref_year < 2026 + ) + +theme_fig_1 <- theme_minimal() + + theme( + axis.text.x = element_text(angle = 45, hjust = 1), + legend.position = "bottom" + ) + +# Panel A: deaths recorded in both sources vs single source +fig_1_all_stacked <- fig_1_data |> + mutate( + concordance = case_when( + death_source == "Both" ~ "ONS & TPP-pcEHR", + death_source %in% c("ONS_only", "TPP_only") ~ "Single source" + ), + concordance = factor( + concordance, + levels = c("ONS & TPP-pcEHR", "Single source") + ) + ) |> + group_by(death_date_ref_year, concordance) |> + summarise( + total = sum(total, na.rm = TRUE), + .groups = "drop" + ) + +p_fig_1_all <- ggplot( + fig_1_all_stacked, + aes(x = death_date_ref_year, y = total, fill = concordance) +) + + geom_col( + position = position_stack(reverse = TRUE), + color = "white", + linewidth = 0.2 + ) + + geom_vline( + xintercept = 2018.5, + linetype = "dotted", + linewidth = 0.8 + ) + + scale_fill_manual( + values = c( + "ONS & TPP-pcEHR" = "#785EF0", + "Single source" = "#BBBBBB" + ), + name = NULL + ) + + labs( + x = "Year of death", + y = "Number of deaths", + title = "(A)" + ) + + theme_fig_1 + +# Panel B: deaths recorded in only one source +fig_1_discordant <- fig_1_data |> + filter( + death_source %in% c("ONS_only", "TPP_only"), + death_date_ref_year > 2018 + ) |> + mutate( + death_source = factor( + death_source, + levels = c("ONS_only", "TPP_only") + ) + ) + +p_fig_1_discordant <- ggplot( + fig_1_discordant, + aes(x = death_date_ref_year, y = total, fill = death_source) +) + + geom_col( + position = "dodge", + color = "white", + linewidth = 0.2 + ) + + scale_fill_manual( + values = c( + "ONS_only" = "#648FFF", + "TPP_only" = "#DC267F" + ), + labels = c( + "ONS_only" = "ONS only", + "TPP_only" = "TPP-pcEHR dated only" + ), + name = NULL + ) + + labs( + x = "Year of death", + y = "Number of Deaths", + title = "(B)" + ) + + theme_fig_1 + +fig_1 <- p_fig_1_all / p_fig_1_discordant +fig_1 +``` + + +```{r} +# ===================================================== +# Monthly source breakdown for 2025-2026 +# ===================================================== + +table_paper_3 <- table_death_source_25_26 |> + mutate(year = year(month)) |> + group_by(year, month, death_source) |> + summarise( + n = sum(total, na.rm = TRUE), + .groups = "drop" + ) |> + group_by(year, month) |> + mutate( + N = sum(n), + perc = round(100 * n / N, 1), + value = sprintf("%.1f%% (%s/%s)", perc, n, N) + ) |> + ungroup() |> + select(year, month, death_source, value) |> + pivot_wider( + names_from = death_source, + values_from = value + ) |> + rename( + `ONS & TPP-pcEHR` = Both, + `Only ONS` = ONS_only, + `Only TPP-pcEHR` = TPP_only + ) |> + mutate( + month = format(month, "%b") + ) |> + arrange(year, month) |> + select(year, month, `ONS & TPP-pcEHR`, `Only ONS`, `Only TPP-pcEHR`) + +table_paper_3 +``` + + +```{r} +# ===================================================== +# TPP-pcEHR deaths: coded vs dated +# ===================================================== + +fig_tpp_codes <- tpp_death_code_or_date |> + filter( + !is.na(tpp_date_or_coded), + death_date_ref_year_w_tpp_codes < 2026 + ) |> + mutate( + tpp_date_or_coded = factor( + tpp_date_or_coded, + levels = c( + "tpp_dated_and_coded", + "tpp_dated_only", + "tpp_coded_only" + ), + labels = c( + "TPP-pcEHR dated & coded", + "TPP-pcEHR dated only", + "TPP-pcEHR coded only" + ) + ) + ) + +p_tpp_codes <- ggplot( + fig_tpp_codes, + aes( + x = death_date_ref_year_w_tpp_codes, + y = total, + fill = tpp_date_or_coded + ) +) + + geom_col( + position = "stack", + color = "white", + linewidth = 0.2 + ) + + scale_fill_manual( + values = c( + "TPP-pcEHR dated & coded" = "#F28E2B", + "TPP-pcEHR dated only" = "#D81B60", + "TPP-pcEHR coded only" = "#FDD835" + ), + name = NULL + ) + + labs( + x = "Year of death", + y = "Number of deaths", + title = "(A)" + ) + + theme_minimal() + + theme( + legend.position = "bottom", + axis.text.x = element_text(angle = 45, hjust = 1) + ) + +p_tpp_codes +``` + +```{r} +# ===================================================== +# Table: source classification with and without TPP coded deaths +# ===================================================== + +table_source_classification <- bind_rows( + table_death_source_overall_any_tpp |> + filter( + death_date_ref_year_w_tpp_codes >= 2019, + death_date_ref_year_w_tpp_codes <= 2025 + ) |> + transmute( + year = death_date_ref_year_w_tpp_codes, + classification = "Including TPP dated or coded deaths", + source = case_when( + death_source_tpp_date_or_coded == "Both" ~ "ONS & TPP-pcEHR", + death_source_tpp_date_or_coded == "ONS_only" ~ "ONS only", + death_source_tpp_date_or_coded == "TPP_only" ~ "TPP-pcEHR only" + ), + n = total, + N = total_year + ), + fig_1_data |> + filter( + death_date_ref_year >= 2019, + death_date_ref_year <= 2025 + ) |> + transmute( + year = death_date_ref_year, + classification = "Using TPP date only", + source = case_when( + death_source == "Both" ~ "ONS & TPP-pcEHR", + death_source == "ONS_only" ~ "ONS only", + death_source == "TPP_only" ~ "TPP-pcEHR only" + ), + n = total, + N = total_subgroup_value + ) +) |> + mutate( + source = factor( + source, + levels = c("ONS & TPP-pcEHR", "ONS only", "TPP-pcEHR only") + ), + perc = round(100 * n / N, 1), + value = sprintf("%.1f%% (%s/%s)", perc, n, N) + ) |> + select(classification, year, source, value) |> + pivot_wider( + names_from = source, + values_from = value + ) |> + arrange(classification, year) + +table_source_classification +``` + + +```{r} +# ===================================================== +# Deaths recorded only in ONS, by subgroup +# ===================================================== + +fig_ons_only_subgroups <- table_death_source |> + filter( + death_source == "ONS_only", + death_date_ref_year >= 2020, + death_date_ref_year <= 2025, + subgroup != "overall", + !is.na(perc) + ) + +theme_subgroup <- theme_minimal() + + theme( + axis.text.x = element_text(angle = 0, hjust = 0.5) + ) + +scale_x_subgroup <- scale_x_continuous( + breaks = 2020:2025 +) + +scale_y_subgroup <- scale_y_continuous( + limits = c(0, 20), + breaks = seq(0, 20, by = 5), + labels = \(x) paste0(x, "%") +) + +# Age band +p_age_band <- fig_ons_only_subgroups |> + filter(subgroup == "age_band") |> + ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) + + geom_line(linewidth = 0.8) + + geom_point(size = 1.8) + + scale_x_subgroup + + scale_y_subgroup + + labs( + x = "Year of death", + y = "Deaths recorded only in ONS (%)", + color = "Age band" + ) + + theme_subgroup + +# Sex +p_sex <- fig_ons_only_subgroups |> + filter(subgroup == "sex") |> + ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) + + geom_line(linewidth = 0.8) + + geom_point(size = 1.8) + + scale_x_subgroup + + scale_y_subgroup + + labs( + x = "Year of death", + y = "Deaths recorded only in ONS (%)", + color = "Sex" + ) + + theme_subgroup + +# Ethnicity +p_ethnicity <- fig_ons_only_subgroups |> + filter(subgroup == "ethnicity") |> + ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) + + geom_line(linewidth = 0.8) + + geom_point(size = 1.8) + + scale_x_subgroup + + scale_y_subgroup + + labs( + x = "Year of death", + y = "Deaths recorded only in ONS (%)", + color = "Ethnicity" + ) + + theme_subgroup + +# IMD quintile +p_imd <- fig_ons_only_subgroups |> + filter(subgroup == "imd_quintile") |> + ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) + + geom_line(linewidth = 0.8) + + geom_point(size = 1.8) + + scale_x_subgroup + + scale_y_subgroup + + labs( + x = "Year of death", + y = "Deaths recorded only in ONS (%)", + color = "IMD quintile" + ) + + theme_subgroup + +# Rural/urban +p_rural_urban <- fig_ons_only_subgroups |> + filter(subgroup == "rural_urban") |> + ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) + + geom_line(linewidth = 0.8) + + geom_point(size = 1.8) + + scale_x_subgroup + + scale_y_subgroup + + labs( + x = "Year of death", + y = "Deaths recorded only in ONS (%)", + color = "Rural/urban" + ) + + theme_subgroup + +# Region +p_region <- fig_ons_only_subgroups |> + filter(subgroup == "region") |> + ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) + + geom_line(linewidth = 0.8) + + geom_point(size = 1.8) + + scale_x_subgroup + + scale_y_subgroup + + labs( + x = "Year of death", + y = "Deaths recorded only in ONS (%)", + color = "Region" + ) + + theme_subgroup +``` + + +```{r} +# ===================================================== +# Difference in date of death between ONS and TPP-pcEHR +# ===================================================== + +table_dod_diff_3cats <- table_ons_tpp_dates_diff |> + filter( + subgroup == "overall", + subgroup_value == "All", + death_date_ref_year %in% 2020:2025 + ) |> + mutate( + dod_diff_3cat = case_when( + dod_diff_groups == "0" ~ "Same day", + dod_diff_groups %in% c("-1 to -7", "1-7") ~ "Within 7 days", + dod_diff_groups %in% c("-8 to -30", "-31+", "8 to 30", "31+") ~ "More than 7 days", + TRUE ~ NA_character_ + ) + ) |> + filter(!is.na(dod_diff_3cat)) |> + group_by(death_date_ref_year, dod_diff_3cat) |> + summarise( + total = sum(total, na.rm = TRUE), + .groups = "drop" + ) |> + group_by(death_date_ref_year) |> + mutate( + total_year = sum(total, na.rm = TRUE), + perc = total / total_year * 100, + dod_diff_3cat = factor( + dod_diff_3cat, + levels = c("Same day", "Within 7 days", "More than 7 days") + ) + ) |> + ungroup() + +ggplot( + table_dod_diff_3cats, + aes( + x = factor(death_date_ref_year), + y = perc, + fill = dod_diff_3cat + ) +) + + geom_col(width = 0.8) + + scale_y_continuous( + labels = \(x) paste0(x, "%") + ) + + scale_fill_manual( + values = c( + "Same day" = "#44aa99", + "Within 7 days" = "#88ccee", + "More than 7 days" = "#ddcc77" + ), + name = NULL + ) + + labs( + x = "Year of death", + y = "Percentage" + ) + + theme_minimal() + +``` + +```{r} +# ===================================================== +# Distribution of ONS-only deaths across practices (deciles) +# ===================================================== + +practice_deciles_ons_perc <- ggplot( + table_practice_percentiles, + aes( + x = year, + y = value, + group = percentile, + linetype = line_group, + color = line_group, + linewidth = line_group + ) +) + + geom_line() + + + # Aesthetics: median vs deciles + scale_color_manual( + values = c(decile = "grey60", median = "black") + ) + + scale_linetype_manual( + values = c(decile = "dashed", median = "solid") + ) + + scale_linewidth_manual( + values = c(decile = 0.3, median = 0.9) + ) + + + # Axes + scale_x_continuous( + breaks = sort(unique(table_practice_percentiles$year)) + ) + + scale_y_continuous( + labels = \(x) paste0(x, "%") + ) + + + labs( + title = "% of deaths recorded only in ONS across practices", + subtitle = paste0( + "Practices per year (≥10 deaths): ", + paste( + with( + distinct(table_practice_percentiles, year, n_practices), + paste0(year, "=", n_practices) + ), + collapse = " · " + ) + ), + x = "Year", + y = "ONS-only deaths (%)", + color = NULL, + linetype = NULL, + linewidth = NULL + ) + + + theme_minimal(base_size = 12) + + theme( + legend.position = "bottom", + axis.text.x = element_text(angle = 45, hjust = 1) + ) + +practice_deciles_ons_perc +``` + diff --git a/project.yaml b/project.yaml index 889f15c..8c7e387 100644 --- a/project.yaml +++ b/project.yaml @@ -1,91 +1,63 @@ -version: '4.0' +version: '5.0' actions: -#Dataset for diff time - dataset_death_TPP_ONS: - run: ehrql:v1 generate-dataset analysis/dataset_def/dataset_definition.py --output output/highly_sensitive/dataset_death_TPP_ONS.csv.gz + + dataset_death_raw: + run: ehrql:v1 generate-dataset analysis/dataset_def/dataset_definition.py + --output output/highly_sensitive/dataset_death_TPP_ONS.csv.gz outputs: highly_sensitive: dataset: output/highly_sensitive/dataset_death_TPP_ONS.csv.gz - - summary_stat: - run: r:v2 analysis/1_summary_stats.R - --output output/summary_stats/*.csv - needs: [dataset_death_TPP_ONS] + + dataset_death_processed: + run: r:v2 analysis/1_derive_key_variables.R + needs: [dataset_death_raw] outputs: + highly_sensitive: + csv: output/highly_sensitive/death_registration_processed.csv.gz moderately_sensitive: - tables: output/summary_stats/*.csv + txt: output/analysis_tables/death_registration_processed_skim.txt + csv: output/analysis_tables/table_source_raw.csv - reg_dereg_ONS_TPP: - run: r:v2 analysis/2_tables_ONS_TPP_reg_dereg.R - --output output/report/reg_dereg_ONS_TPP/*.csv - needs: [dataset_death_TPP_ONS] + + report_implausible_death_dates: + run: r:v2 analysis/2_implausible_death_dates.R + needs: [dataset_death_processed] outputs: moderately_sensitive: - csv: output/report/reg_dereg_ONS_TPP/*.csv + death_implausible_source: output/analysis_tables/death_implausible_source.csv - Dataset_source_dod_diff_processed: - run: r:v2 analysis/3_dataset_DoD_TPP_ONS_with_grace_period.R - --output output/highly_sensitive/DoD_TPP_ONS_with_grace_period.csv - needs: [dataset_death_TPP_ONS] + report_registration_at_death: + run: r:v2 analysis/3_registration_at_death.R + needs: [dataset_death_processed] outputs: - highly_sensitive: - csv: output/highly_sensitive/DoD_TPP_ONS_with_grace_period.csv + moderately_sensitive: + registration_status_source: output/analysis_tables/registration_status_source.csv + reg_start_timing_source: output/analysis_tables/reg_start_timing_source.csv + reg_end_timing_source: output/analysis_tables/reg_end_timing_source.csv - death_source: - run: r:v2 analysis/4_source_prop.R - --output output/report/by_source/*.csv - needs: [Dataset_source_dod_diff_processed] + report_death_source_comparison: + run: r:v2 analysis/4_death_source_comparison.R + needs: [dataset_death_processed] outputs: moderately_sensitive: - csv: output/report/by_source/*.csv + table_death_source: output/analysis_tables/table_death_source.csv + table_death_source_25_26: output/analysis_tables/table_death_source_25_26.csv + tpp_death_code_or_date: output/analysis_tables/tpp_death_code_or_date.csv + table_death_source_overall_any_tpp: output/analysis_tables/table_death_source_overall_any_tpp.csv - DoD_difference: - run: r:v2 analysis/5_DoD_diff.R - --output output/report/DoD_diff/*.csv - needs: [Dataset_source_dod_diff_processed] + report_sources_date_agreement: + run: r:v2 analysis/5_sources_date_agreement.R + needs: [dataset_death_processed] outputs: moderately_sensitive: - csv: output/report/DoD_diff/*.csv - -# Measures - # measures_overall: - # run: ehrql:v1 generate-measures analysis/measure_def.py - # --output output/measures/measures_overall.csv - # outputs: - # moderately_sensitive: - # measure_csv: output/measures/measures_overall.csv + table_ons_tpp_dates_diff: output/analysis_tables/table_ons_tpp_dates_diff.csv - # table_measure_rate: - # run: r:v2 analysis/Table_measure_rate.R - # --output output/report/*.csv - # needs: [measures_overall] - # outputs: - # moderately_sensitive: - # categorical: output/report/collate_measures_rate_table.csv - - measures_by_practice: - run: ehrql:v1 generate-measures analysis/dataset_def/measure_practice.py - --output output/highly_sensitive/measures_by_practice.csv - outputs: - highly_sensitive: - measure_csv: output/highly_sensitive/measures_by_practice.csv - - decile_table_figure: - run: r:v2 analysis/6_practice_by_source.R - needs: [measures_by_practice] + report_variation_source_by_practice: + run: r:v2 analysis/6_variation_source_by_practice.R + needs: [dataset_death_processed] outputs: moderately_sensitive: - csv: output/report/source_by_practice/*.csv - png: output/report/source_by_practice/*.png - - - # visualization: - # run: r:v2 Final-visualizations.R - # needs: [table_DoD] - # outputs: - # moderately_sensitive: - # csv: output/final_visualization/*.csv - # png: output/final_visualization/*.png - -# \ No newline at end of file + table_practice_percentiles: output/analysis_tables/table_practice_percentiles.csv + +# end----