diff --git a/analysis/0_utility_functions.R b/analysis/0_utility_functions.R
new file mode 100644
index 0000000..5502b9d
--- /dev/null
+++ b/analysis/0_utility_functions.R
@@ -0,0 +1,282 @@
+# #################################################
+# 
+# Purpose:
+# define useful functions used in the codebase
+# this script should be sourced (using `source(here("analysis", "0_utility_functions.R"))`) at the start of each R script
+#
+###################################################
+
+# ---------------------------------------------------------
+# Study period
+# ---------------------------------------------------------
+
+# study_start_date <- as.Date("2009-01-01")
+# study_end_date   <- as.Date("2026-03-06")
+
+# ---------------------------------------------------------
+# Rounding function (sdc)
+# ---------------------------------------------------------
+# - values <= 7 suppressed (NA)
+rounding <- function(vars) {
+  case_when(
+    vars == 0 ~ 0,
+    vars > 7 ~ round(vars / 5) * 5
+  )
+}
+
+
+# ---------------------------------------------------------
+# Implausible death date flag
+# ---------------------------------------------------------
+# TODO: parameterize start and end dates ?
+# Flags implausible death dates
+# Returns a categorical status
+cat_implausible_death_date <- function(death_date, date_of_birth) {
+  case_when(
+    is.na(death_date) ~ "missing",
+    death_date < date_of_birth ~ "death_before_birth",
+    death_date < as.Date("2009-01-01") ~ "before_study",
+    death_date > as.Date("2026-03-06") ~ "after_study",
+    TRUE ~ "ok"
+  )|>
+    factor(
+      levels = c(
+        "ok",
+        "missing",
+        "death_before_birth",
+        "before_study",
+        "after_study"
+      ),
+      ordered = TRUE
+    )
+}
+
+# ---------------------------------------------------------
+# Classify source of death
+# ---------------------------------------------------------
+# Categories: ONS_only, TPP_only, Both
+cat_death_source <- function(ons_death_date, tpp_death_info) {
+  case_when(
+    !is.na(ons_death_date) & !is.na(tpp_death_info) ~ "Both",
+    !is.na(ons_death_date) &  is.na(tpp_death_info) ~ "ONS_only",
+     is.na(ons_death_date) & !is.na(tpp_death_info) ~ "TPP_only",
+    TRUE ~ NA_character_
+  )  |>
+    factor(
+      levels = c("ONS_only", "TPP_only", "Both")
+    )
+}
+
+# ---------------------------------------------------------
+# Categorise timing: registration start and death
+# ---------------------------------------------------------
+# Input: days = death_date - registration_start_date
+cat_last_reg_start_to_death <- function(days) {
+  case_when(
+    is.na(days) ~ "missing_registration_start", # should be no cases
+    days < 0 ~ "death_before_registration_start",
+    days == 0 ~ "same_day_as_registration_start",
+    days > 0 ~ "death_after_registration_start"
+  ) |>
+    factor(
+      levels = c(
+        "death_before_registration_start",
+        "same_day_as_registration_start",
+        "death_after_registration_start",
+        "missing_registration_start"
+      ),
+      ordered = TRUE
+    )
+}
+
+
+# ---------------------------------------------------------
+# Categorise timing: registration end - death
+# ---------------------------------------------------------
+# Input: days = registration_end_date - death_date
+# Interpretation:
+#   negative  deregistration before death
+#   positive  deregistration after death
+#   0         same day
+#   NA        missing deregistration date
+cat_last_reg_end_minus_death <- function(days) {
+  case_when(
+    is.na(days) ~ "missing_registration_end",
+    days <= -31 ~ "-31+",
+    days >= -30 & days <= -8 ~ "-30 to -8",
+    days >= -7 & days <= -1 ~ "-7 to -1",
+    days == 0 ~ "0",
+    days >= 1 & days <= 7 ~ "1 to 7",
+    days >= 8 & days <= 30 ~ "8 to 30",
+    days >= 31 ~ "31+"
+  ) |>
+    factor(
+      levels = c("-31+", "-8 to -30", "-1 to -7", "0", "1 to 7", "8 to 30", "31+", 
+      "missing_registration_end"),
+      ordered = TRUE
+    )
+}
+
+
+
+# ---------------------------------------------------------
+# Registration status at death
+# ---------------------------------------------------------
+# Temporal relationship between:
+# - death date
+# - registration start
+# - registration end
+#
+# Includes grace period (default = 30 days)
+cat_registration_status <- function(
+    death_date,
+    reg_start,
+    reg_end,
+    grace_days = 30 
+) {
+  case_when(
+    is.na(death_date) ~ "no_death_date",
+    is.na(reg_start) ~ "no_registration",
+    
+    death_date < reg_start ~ "death_before_last_registration_start",
+    
+    is.na(reg_end) & death_date >= reg_start ~ "death_during_registration_open_end",
+    death_date <= reg_end ~ "death_during_registration",
+    
+    death_date > reg_end & death_date <= reg_end + grace_days ~ "death_after_deregistration_within_grace",
+    death_date > reg_end + grace_days ~ "death_after_deregistration_outside_grace"
+  ) |>
+    factor(
+      levels = c(
+        "death_before_last_registration_start",
+        "death_during_registration",
+        "death_during_registration_open_end",
+        "death_after_deregistration_within_grace",
+        "death_after_deregistration_outside_grace",
+        "no_registration",
+        "no_death_date"
+      ),
+      ordered = TRUE
+    )
+}
+
+
+#------------------------------------
+# TPP death date vs SNOMED codes 
+#------------------------------------
+# IMP: individuals with a death SNOMED code only (and no ONS or TPP death date) are not
+# currently included in the dataset definition. Coded deaths are only added as a variable among 
+# those already in the dataset (with ONS or TPP death date).
+#
+# - year_pref_ONS_TPP_plus_codes - prioritises ONS date, then TPP date, then coded date
+# - TPP_date_or_coded - classifies TPP death information availability (date, SNOMED code, both)
+# - death_source_TPP_date_or_coded - clasif source including any TPP death evidence (dated and/or coded)
+
+add_tpp_date_or_coded_vars <- function(data) {
+  data |>
+    mutate(
+      # Preferred death year: ONS > TPP dated > TPP coded
+      death_date_ref_year_w_tpp_codes = case_when(
+        !is.na(ons_death_date)        ~ year(ons_death_date),
+        !is.na(tpp_death_date)        ~ year(tpp_death_date),
+        !is.na(tpp_coded_death_date)  ~ year(tpp_coded_death_date),
+        TRUE ~ NA_real_
+      ),
+      
+      # TPP death information availability
+      tpp_date_or_coded = case_when(
+        !is.na(tpp_death_date)       &  is.na(tpp_coded_death_date) ~ "tpp_dated_only",
+        is.na(tpp_death_date)       & !is.na(tpp_coded_death_date) ~ "tpp_coded_only",
+        !is.na(tpp_death_date)       & !is.na(tpp_coded_death_date) ~ "tpp_dated_and_coded",
+        is.na(tpp_death_date)       &  is.na(tpp_coded_death_date) ~ NA_character_
+      ) |>
+        factor(
+          levels = c("tpp_dated_only","tpp_coded_only","tpp_dated_and_coded")
+          ),
+      
+      # Sensitivity source: any TPP death evidence (dated or coded)
+      death_source_tpp_date_or_coded = cat_death_source(
+        ons_death_date,
+        coalesce(tpp_death_date, tpp_coded_death_date)
+      )
+    )
+}
+
+
+# ---------------------------------------------------------
+# Add date-of-death difference variables (TPP vs ONS)
+# ---------------------------------------------------------
+# - diff_dod: difference in days (TPP - ONS)
+#   > positive values: TPP death date after ONS
+#   > negative values: TPP death date before ONS
+# - dod_diff_groups: categorised difference in days (ordered factor) for tables/plots
+
+add_dod_diff_vars <- function(data) {
+
+  data |>
+    mutate(
+      # difference in days between TPP and ONS death dates
+      diff_dod = as.numeric(tpp_death_date - ons_death_date),
+
+      # grouped differences
+      dod_diff_groups = case_when(
+        is.na(diff_dod) ~ NA_character_,
+
+        diff_dod == 0 ~ "0",
+
+        diff_dod >= 1  & diff_dod <= 7  ~ "1 to 7",
+        diff_dod >= 8  & diff_dod <= 30 ~ "8 to 30",
+        diff_dod >= 31                  ~ "31+",
+
+        diff_dod <= -1 & diff_dod >= -7  ~ "-1 to -7",
+        diff_dod <= -8 & diff_dod >= -30 ~ "-8 to -30",
+        diff_dod <= -31                  ~ "-31+"
+      ),
+
+      # convert to ordered factor for consistent display
+      dod_diff_groups = factor(
+        dod_diff_groups,
+        levels = c("-31+", "-8 to -30", "-1 to -7", "0", "1 to 7", "8 to 30", "31+"),
+        ordered = TRUE
+      )
+    )
+}
+
+# ---------------------------------------------------------
+# Demographic vars
+# ---------------------------------------------------------
+add_demographic_vars <- function(data) {
+  data |>
+    mutate(
+      age_band = cut(
+        age,
+        breaks = c(-Inf, 45, 55, 65, 75, 85, Inf),
+        labels = c("0-44", "45-54", "55-64", "65-74", "75-84", "85+"),
+        right = FALSE
+      ),
+      
+      imd_quintile = cut(
+        imd,
+        breaks = c(0, 32844 * (1:5) / 5),
+        labels = c("1 (most deprived)", "2", "3", "4", "5 (least deprived)"),
+        include.lowest = TRUE,
+        right = FALSE
+      ),
+      
+      rural_urban = factor(
+        rural_urban,
+        levels = 1:8,
+        labels = c(
+          "Urban major conurbation",
+          "Urban minor conurbation",
+          "Urban city and town",
+          "Urban city and town in a sparse setting",
+          "Rural town and fringe",
+          "Rural town and fringe in a sparse setting",
+          "Rural village and dispersed",
+          "Rural village and dispersed in a sparse setting"
+        ),
+        ordered = TRUE
+      )
+    )
+}
\ No newline at end of file
diff --git a/analysis/1_derive_key_variables.R b/analysis/1_derive_key_variables.R
new file mode 100644
index 0000000..e4ba004
--- /dev/null
+++ b/analysis/1_derive_key_variables.R
@@ -0,0 +1,141 @@
+###################################################
+# Author: Martina Pesce / Andrea Schaffer
+#   Bennett Institute for Applied Data Science
+#   University of Oxford, 2025
+#
+# This script describe the relationship between death date and GP registration history
+#
+# Steps:
+# 1) classify deaths by source: ONS only / TPP only / both
+# 2) define whether death occurred during registration
+#    - main analysis: includes 30-day grace period after registration end
+#    - sensitivity analysis: no grace period
+# 3) among registered deaths, describe timing between:
+#    - registration start and death
+#    - death and registration end
+#
+# Outputs:
+###################################################
+
+# Libraries ----
+
+library(tidyverse)
+library(lubridate)
+library(here)
+library(fs)
+
+# Create output directories ----
+
+output_dir_hs <- here("output", "highly_sensitive")
+dir_create(output_dir_hs)
+
+output_dir_analysis_tables <- here("output", "analysis_tables")
+dir_create(output_dir_analysis_tables)
+
+# import custom functions ----
+source(here("analysis", "0_utility_functions.R"))
+
+# Import data ----
+
+dataset_death_raw <- read_csv(
+  here("output", "highly_sensitive", "dataset_death_TPP_ONS.csv.gz")
+) |>
+  mutate(
+    tpp_death_date = as.Date(tpp_death_date),
+    ons_death_date = as.Date(ons_death_date),
+    date_of_birth = as.Date(date_of_birth),
+    last_registration_start_date = as.Date(last_registration_start_date),
+    last_registration_end_date = as.Date(last_registration_end_date),
+    region = as.factor(region),
+    ethnicity = as.factor(ethnicity),
+    sex = as.factor(sex)
+  )
+
+
+# Derive core variables ----
+death_registration_processed <- dataset_death_raw |>
+  mutate(
+    # ONS or TPP death date
+    has_ons_date_death = !is.na(ons_death_date),
+    has_tpp_death_date = !is.na(tpp_death_date),
+    flag_any_date_death = has_ons_date_death | has_tpp_death_date, 
+    
+    # Plausibility of death dates
+    cat_ons_death_date = cat_implausible_death_date(ons_death_date, date_of_birth),
+    cat_tpp_death_date = cat_implausible_death_date(tpp_death_date, date_of_birth),
+    flag_any_date_death_implausible = 
+      !(cat_ons_death_date %in% c("ok", "missing")) |
+      !(cat_tpp_death_date %in% c("ok", "missing")),
+    
+    # Death source
+    death_source = cat_death_source(ons_death_date, tpp_death_date),
+    
+    # Use ONS death date where available, otherwise TPP
+    death_date_ref = as.Date(if_else(!is.na(ons_death_date), ons_death_date, tpp_death_date)),
+    death_date_ref_year = year(death_date_ref),
+    
+    # Registration timing
+    death_minus_reg_start = as.numeric(death_date_ref - last_registration_start_date),
+    reg_end_minus_death = as.numeric(last_registration_end_date - death_date_ref),
+    
+    registration_status = cat_registration_status(
+      death_date = death_date_ref,
+      reg_start = last_registration_start_date,
+      reg_end = last_registration_end_date,
+      grace_days = 30
+    ),
+    
+    # Registered flags
+    flag_is_registered = registration_status %in% c( 
+      "death_during_registration",
+      "death_during_registration_open_end"
+    ),
+    is_registered_within_grace= registration_status %in% c(
+      "death_during_registration",
+      "death_during_registration_open_end",
+      "death_after_deregistration_within_grace"
+    ),
+    
+    # Timing groups
+    reg_start_timing_group = cat_last_reg_start_to_death(death_minus_reg_start),
+    reg_end_timing_group = cat_last_reg_end_minus_death(reg_end_minus_death)
+  ) |>
+  add_tpp_date_or_coded_vars() |>
+  add_dod_diff_vars() |>
+  add_demographic_vars()
+  
+
+write_csv(death_registration_processed, here(output_dir_hs, "death_registration_processed.csv.gz"))
+
+
+# print details about dataset
+capture.output(
+  skimr::skim_without_charts(death_registration_processed),
+  file = fs::path(output_dir_analysis_tables, "death_registration_processed_skim.txt"),
+  split = FALSE
+)
+
+# raw table source by year (no filter)
+table_source_raw <- death_registration_processed |>
+  
+  # Keep individuals with any recorded death date 
+  # (exclude patients with death only recorded in TPP codes)
+  filter(flag_any_date_death == TRUE) |>
+  
+  group_by(death_date_ref_year, death_source) |>
+  summarise(
+    total = n(), 
+    .groups = "drop"
+  )|>
+  group_by(death_date_ref_year) |>
+  mutate(
+    total_year = rounding(sum(total, na.rm = TRUE)),    
+    total = rounding(total),
+    perc = total / total_year * 100          
+  ) |>
+  ungroup()
+
+write_csv(table_source_raw, here(output_dir_analysis_tables, "table_source_raw.csv"))
+
+#  end------
+
diff --git a/analysis/1_summary_stats.R b/analysis/1_summary_stats.R
deleted file mode 100644
index 96f850f..0000000
--- a/analysis/1_summary_stats.R
+++ /dev/null
@@ -1,162 +0,0 @@
-# Preliminaries ----
-
-# Import libraries
-library("tidyverse")
-library("dtplyr")
-library("lubridate")
-library("glue")
-library("here")
-library("skimr")
-
-## Create output directory
-output_dir <- here("output", "summary_stats")
-fs::dir_create(output_dir)
-
-# Import processed data ----
-dataset_death_TPP_ONS <- read_csv("output/highly_sensitive/dataset_death_TPP_ONS.csv.gz") %>%
-  mutate(
-    region = as.factor(region),
-    age_band = as.factor(age_band),
-    rural_urban = as.factor(rural_urban),
-    ons_death_place = as.factor(ons_death_place),
-    region = as.factor(region),
-    IMD_q10 = as.factor(IMD_q10),
-    ethnicity = as.factor(ethnicity),
-    min_DoD = pmin(TPP_death_date, ons_death_date, na.rm = TRUE)
-  ) %>%
-  mutate(
-    death_dereg_diff = case_when(
-      !is.na(last_registration_end_date) ~ as.Date(last_registration_end_date) - min_DoD,
-      TRUE ~ as.difftime(NA_real_, units = "days"
-      )
-    )
-  )
-                                                      
-dataset_death_TPP_ONS_plus_inc_crit <- dataset_death_TPP_ONS %>% 
-  filter(
-    has_registration == TRUE & # was registered at the beginning of the year the person died
-      any_death_during_study == TRUE # died between "2009-01-01" - "2025-06-06" + (deregistration date + 30 days) is after one date of death
-  )
-
-dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death <- dataset_death_TPP_ONS_plus_inc_crit %>% 
-  filter(
-    death_dereg_diff >= 0)
-    
-  
-
-# Summary
-## Gral
-summary_dataset_death_TPP_ONS <- skim(dataset_death_TPP_ONS)
-
-summary_dataset_death_TPP_ONS_plus_inc_crit <- skim(dataset_death_TPP_ONS_plus_inc_crit)
-
-summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death <- skim(dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death)
-
-write.csv(summary_dataset_death_TPP_ONS, file = here::here("output", "summary_stats","summary_stats_dataset_death_TPP_ONS.csv"),
-          row.names = FALSE)
-
-write.csv(summary_dataset_death_TPP_ONS_plus_inc_crit, file = here::here("output", "summary_stats","summary_dataset_death_TPP_ONS_plus_inc_crit.csv"),
-          row.names = FALSE)
-
-write.csv(summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death, file = here::here("output", "summary_stats","summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death.csv"),
-          row.names = FALSE)
-
-## Categorical variables
-
-table_freq <- dataset_death_TPP_ONS %>% 
-  pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>% 
-  group_by(year(min_DoD), subgroup, category) %>% 
-  summarise(
-    n=n()
-  )
-
-table_freq_plus_inc_crit <- dataset_death_TPP_ONS_plus_inc_crit %>% 
-  pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>% 
-  group_by(year(min_DoD), subgroup, category) %>% 
-  summarise(
-    n=n()
-  )
-
-table_freq_plus_inc_crit_0_dereg_death <- dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death %>% 
-  pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>% 
-  group_by(year(min_DoD), subgroup, category) %>% 
-  summarise(
-    n=n()
-  )
-
-write.csv(table_freq, file = here::here("output", "summary_stats","table_freq_DoD.csv"),
-          row.names = FALSE)
-
-write.csv(table_freq_plus_inc_crit, file = here::here("output", "summary_stats","table_freq_plus_inc_crit.csv"),
-          row.names = FALSE)
-
-write.csv(table_freq_plus_inc_crit_0_dereg_death, file = here::here("output", "summary_stats","table_freq_plus_inc_crit_0_dereg_death.csv"),
-          row.names = FALSE)
-
-
-# Impossible dates of death
-impossible_dod <- dataset_death_TPP_ONS %>% 
-  mutate(
-    ons_DoD_impossible = case_when(
-      ons_death_date < date_of_birth                ~ "death_before_birth",
-      ons_death_date < as.Date("2009-01-01")        ~ "before_study",
-      ons_death_date > as.Date("2025-06-06")        ~ "after_study",
-      is.na(ons_death_date)                         ~ "is empty",   
-      TRUE                                          ~ "ok"
-    ),
-    TPP_DoD_impossible = case_when(
-      TPP_death_date < date_of_birth                ~ "death_before_birth",
-      TPP_death_date < as.Date("2009-01-01")        ~ "before_study",
-      TPP_death_date > as.Date("2025-06-06")        ~ "after_study",
-      is.na(TPP_death_date)                         ~ "is empty",
-      TRUE                                          ~ "ok"
-    ),
-    year_month_min_dod = format(min_DoD, "%Y-%m"),
-    year_min_dod = format(min_DoD, "%Y")
-  ) %>% 
-  pivot_longer(
-    cols = c(ons_DoD_impossible, TPP_DoD_impossible),
-    names_to = "source",
-    values_to = "DoD_impossible"
-  ) %>% 
-  mutate(
-    source = case_when(
-      source == "ons_DoD_impossible" ~ "ONS",
-      source == "TPP_DoD_impossible" ~ "TPP",
-      TRUE                           ~ source
-    )
-  ) 
-
-# by month
-impossible_dod_month <- impossible_dod %>% 
-  group_by(year_month_min_dod, source, DoD_impossible) %>% 
-  summarise(
-    n = n(),
-    .groups = "drop"
-  )
-
-write.csv(
-  impossible_dod_month,
-  here::here("output", "summary_stats", "impossible_dod_month.csv"),
-  row.names = FALSE
-)
-
-
-# by year
-impossible_dod_year <- impossible_dod %>% 
-group_by(year_min_dod, source, DoD_impossible) %>% 
-  summarise(
-    n = n(),
-    .groups = "drop"
-  )
-
-write.csv(
-  impossible_dod_year,
-  here::here("output", "summary_stats", "impossible_dod_year.csv"),
-  row.names = FALSE
-)
-  
-
-
-
-
diff --git a/analysis/2_implausible_death_dates.R b/analysis/2_implausible_death_dates.R
new file mode 100644
index 0000000..d20d9d0
--- /dev/null
+++ b/analysis/2_implausible_death_dates.R
@@ -0,0 +1,90 @@
+###################################################
+# Author: Martina Pesce / Andrea Schaffer
+# Bennett Institute for Applied Data Science
+# University of Oxford, 2025
+#
+# Aim:
+# Identify implausible death dates separately for ONS and TPP,
+# including: before date of birth & outside study period
+#
+# Inclusion criteria:
+#   - valid age
+#   - non-disclosive sex
+#   - at least one recorded death date (ONS and/or TPP)
+#
+###################################################
+
+# Libraries ----
+
+library(tidyverse)
+library(here)
+library(fs)
+
+# Create output directory ----
+
+output_dir_analysis_tables <- here("output", "analysis_tables")
+dir_create(output_dir_analysis_tables)
+
+# Import custom functions ----
+source(here("analysis", "0_utility_functions.R"))
+
+# Import data ----
+
+death_registration_processed <- read_csv(
+  here("output", "highly_sensitive", "death_registration_processed.csv.gz")
+)
+
+#--------------------------------------------------------
+# Implausible death dates by source and year
+#
+# Filter: Patients with at least one death date (ONS or TPP)
+#
+# Output:
+# Counts of plausibility categories by:
+#   - year (reference death year)
+#   - source (ONS vs TPP)
+#   - plausibility category
+#---------------------------------------------------------
+
+death_implausible_source <- death_registration_processed |>
+  
+  # Keep individuals with any recorded death date 
+  # (exclude patients with death only recorded in TPP codes)
+  filter(flag_any_date_death == TRUE) |>
+  
+  select(
+    patient_id,
+    death_date_ref_year,
+    cat_ons_death_date,
+    cat_tpp_death_date
+  ) |>
+  
+  pivot_longer(
+    cols = c(cat_ons_death_date, cat_tpp_death_date),
+    names_to = "source",
+    values_to = "cat_death_date_plausi"
+  ) |>
+  
+  mutate(
+    source = case_when(
+      source == "cat_ons_death_date" ~ "ons",
+      source == "cat_tpp_death_date" ~ "tpp"
+    )
+  ) |>
+  
+  # Aggregate counts by year, source, and plausibility category
+  group_by(death_date_ref_year, source, cat_death_date_plausi) |>
+  
+  summarise(
+    total = rounding(n()), # SDC rounding
+    .groups = "drop"
+  ) |>
+  
+  arrange(death_date_ref_year, source, cat_death_date_plausi)
+
+# Export results ----
+
+write_csv(
+  death_implausible_source,
+  here(output_dir_analysis_tables, "death_implausible_source.csv")
+)
\ No newline at end of file
diff --git a/analysis/3_registration_at_death.R b/analysis/3_registration_at_death.R
new file mode 100644
index 0000000..145ba48
--- /dev/null
+++ b/analysis/3_registration_at_death.R
@@ -0,0 +1,95 @@
+###################################################
+# Author: Martina Pesce / Andrea Schaffer
+# Bennett Institute for Applied Data Science
+# University of Oxford, 2025
+#
+# 2) Registration at time of death
+#
+# Describe registration status at the reference date of death
+# and the timing of registration start/end relative to death,
+# by death source (ONS only / TPP only / Both) and year.
+#
+# Inclusion criteria:
+# - valid age
+# - non-disclosive sex
+# - death recorded in ONS and/or TPP
+#
+# Exclusion criteria:
+# - implausible death date in either source
+#
+# Reference date of death: ONS death date, where available; otherwise TPP death date
+###################################################
+
+# Libraries ----
+library(tidyverse)
+library(here)
+library(fs)
+
+# Create output directory ----
+output_dir_analysis_tables <- here("output", "analysis_tables")
+dir_create(output_dir_analysis_tables)
+
+# Import utility functions ----
+source(here("analysis", "0_utility_functions.R"))
+
+# Import data ----
+death_registration_processed <- read_csv(
+  here("output", "highly_sensitive", "death_registration_processed.csv.gz")
+)
+
+# Restrict to patients with any death date and no implausible death dates ----
+death_registration_clean <- death_registration_processed |>
+  filter(
+    flag_any_date_death == TRUE,
+    flag_any_date_death_implausible == FALSE
+  )
+
+# Registration status at death, by year and death source ----
+registration_status_source <- death_registration_clean |>
+  group_by(death_date_ref_year, death_source, registration_status) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  ) |>
+  arrange(death_date_ref_year, death_source, registration_status)
+
+write_csv(
+  registration_status_source,
+  here(output_dir_analysis_tables, "registration_status_source.csv")
+)
+
+# Timing of last registration start relative to death, by year and death source ----
+# "death_before_registration_start" indicates last registration started after death
+reg_start_timing_source <- death_registration_clean |>
+  group_by(death_date_ref_year, death_source, reg_start_timing_group) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  ) |>
+  arrange(death_date_ref_year, death_source, reg_start_timing_group)
+
+write_csv(
+  reg_start_timing_source,
+  here(output_dir_analysis_tables, "reg_start_timing_source.csv")
+)
+
+# Timing of last registration end relative to death, by year and death source ----
+# Exclude people whose registration started after death
+reg_end_timing_source <- death_registration_clean |>
+  filter(
+    reg_start_timing_group %in% c(
+      "same_day_as_registration_start",
+      "death_after_registration_start"
+    )
+  ) |>
+  group_by(death_date_ref_year, death_source, reg_end_timing_group) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  ) |>
+  arrange(death_date_ref_year, death_source, reg_end_timing_group)
+
+write_csv(
+  reg_end_timing_source,
+  here(output_dir_analysis_tables, "reg_end_timing_source.csv")
+)
\ No newline at end of file
diff --git a/analysis/4_death_source_comparison.R b/analysis/4_death_source_comparison.R
new file mode 100644
index 0000000..e0308ba
--- /dev/null
+++ b/analysis/4_death_source_comparison.R
@@ -0,0 +1,196 @@
+###################################################
+# Author: Martina Pesce / Andrea Schaffer
+# Bennett Institute for Applied Data Science
+# University of Oxford, 2025
+#
+# 3) Death source comparison
+#
+#
+# Create descriptive tables of death source classifications:
+# - by reference year and death source
+# - overall and by subgroup
+# - for deaths occurring in 2025 onwards, by month
+#
+# Analysis:
+# 1. Counts by reference year and death source
+#    (overall and by subgroup: age band, sex, ethnicity,
+#    IMD quintile, rural/urban classification, and region)
+# 2. Monthly counts by death source for deaths in 2025 onwards
+# 3. Descriptive summaries including TPP-coded deaths
+#########################################################################
+
+# Libraries ----
+library(tidyverse)
+library(here)
+library(fs)
+library(lubridate)
+
+# Create output directory ----
+output_dir_analysis_tables <- here("output", "analysis_tables")
+dir_create(output_dir_analysis_tables)
+
+# Import utility functions ----
+source(here("analysis", "0_utility_functions.R"))
+
+# Import data ----
+death_registration_processed <- read_csv(
+  here("output", "highly_sensitive", "death_registration_processed.csv.gz")
+)
+
+# ==================================================
+# Main analysis: dated deaths only
+# ==================================================
+
+# Restrict to the main analysis population ----
+death_registration_analysis <- death_registration_processed |>
+  filter(
+    flag_any_date_death == TRUE,
+    flag_any_date_death_implausible == FALSE,
+    flag_is_registered == TRUE
+  )
+
+# Overall counts by year and death source ----
+table_death_source_overall <- death_registration_analysis |>
+  group_by(death_date_ref_year, death_source) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  ) |>
+  mutate(
+    subgroup = "overall",
+    subgroup_value = "All"
+  )
+
+# Counts by subgroup ----
+table_death_source_subgroups <- death_registration_analysis |>
+  select(
+    death_date_ref_year,
+    death_source,
+    age_band,
+    sex,
+    ethnicity,
+    imd_quintile,
+    rural_urban,
+    region
+  ) |>
+  pivot_longer(
+    cols = c(age_band, sex, ethnicity, imd_quintile, rural_urban, region),
+    names_to = "subgroup",
+    values_to = "subgroup_value"
+  ) |>
+  group_by(death_date_ref_year, death_source, subgroup, subgroup_value) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  )
+
+# Combine overall and subgroup counts ----
+table_death_source <- bind_rows(
+  table_death_source_overall,
+  table_death_source_subgroups
+) |>
+  mutate(
+    subgroup = factor(
+      subgroup,
+      levels = c(
+        "overall",
+        "age_band",
+        "sex",
+        "ethnicity",
+        "imd_quintile",
+        "rural_urban",
+        "region"
+      )
+    )
+  ) |>
+  group_by(death_date_ref_year, subgroup, subgroup_value) |>
+  mutate(
+    total_subgroup_value = sum(total, na.rm = TRUE),             # denominator
+    perc = total / total_subgroup_value * 100           # proportion
+  ) |>
+  ungroup() |>
+  arrange(death_date_ref_year, subgroup, subgroup_value, death_source) |>
+  select(
+    death_date_ref_year,
+    subgroup,
+    subgroup_value,
+    death_source,
+    total,
+    total_subgroup_value,
+    perc
+  )
+
+# Export main analysis table ----
+write_csv(
+  table_death_source,
+  here(output_dir_analysis_tables, "table_death_source.csv")
+)
+
+# ==================================================
+# 2025-2026 sources by month
+# ==================================================
+table_death_source_25_26 <- death_registration_analysis |>
+  filter(death_date_ref_year > 2024) |>
+  group_by(month = floor_date(death_date_ref, unit = "month"), death_source) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  ) 
+
+# Export lasts months analysis table ----
+write_csv(
+  table_death_source_25_26,
+  here(output_dir_analysis_tables, "table_death_source_25_26.csv")
+)
+
+# ==================================================
+# Sensitivity analysis: include TPP-coded deaths
+# ==================================================
+
+# Descriptive table:
+# classify TPP evidence as date, code, or neither
+# across all records with a reference year including TPP-coded deaths
+# Note: tpp_date_or_coded = NA indicates no TPP code / date , only ONS
+tpp_death_code_or_date <- death_registration_processed |>
+  group_by(death_date_ref_year_w_tpp_codes, tpp_date_or_coded) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  ) |>
+  arrange(death_date_ref_year_w_tpp_codes, tpp_date_or_coded)
+
+write_csv(
+  tpp_death_code_or_date,
+  here(output_dir_analysis_tables, "tpp_death_code_or_date.csv")
+)
+
+# Restrict to the main analysis population ----
+
+# Overall counts by year and death source, including TPP-coded deaths ----
+table_death_source_overall_any_tpp <- death_registration_analysis |>
+  group_by(death_date_ref_year_w_tpp_codes, death_source_tpp_date_or_coded) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  )|>
+  group_by(death_date_ref_year_w_tpp_codes) |>
+  mutate(
+    total_year = sum(total, na.rm = TRUE),            
+    perc = total / total_year * 100          
+  ) |>
+  ungroup() |>
+  arrange(death_date_ref_year_w_tpp_codes, death_source_tpp_date_or_coded) |>
+  select(
+    death_date_ref_year_w_tpp_codes,
+    death_source_tpp_date_or_coded,
+    total,
+    total_year,
+    perc
+  )
+
+# Export sensitivity analysis table ----
+write_csv(
+  table_death_source_overall_any_tpp,
+  here(output_dir_analysis_tables, "table_death_source_overall_any_tpp.csv")
+)
+
diff --git a/analysis/5_sources_date_agreement.R b/analysis/5_sources_date_agreement.R
new file mode 100644
index 0000000..837a5e6
--- /dev/null
+++ b/analysis/5_sources_date_agreement.R
@@ -0,0 +1,126 @@
+###################################################
+# Author: Martina Pesce / Andrea Schaffer
+# Bennett Institute for Applied Data Science
+# University of Oxford, 2025
+#
+# 4) Agreement between ONS and TPP death dates
+# Assess agreement between date of death recorded in ONS and TPP
+# among individuals with a death date available in both sources.
+#
+# Inclusion criteria:
+# - valid age
+# - non-disclosive sex
+# - registered at death
+# - death date recorded in both ONS and TPP
+#
+# Exclusion criteria:
+# - implausible death date in either source
+###################################################
+
+# Libraries ----
+library(tidyverse)
+library(here)
+library(fs)
+library(lubridate)
+
+# Create output directory ----
+output_dir_analysis_tables <- here("output", "analysis_tables")
+dir_create(output_dir_analysis_tables)
+
+# Import utility functions ----
+source(here("analysis", "0_utility_functions.R"))
+
+# Import data ----
+death_registration_processed <- read_csv(
+  here("output", "highly_sensitive", "death_registration_processed.csv.gz")
+)
+
+# ==================================================
+# Analysis
+# ==================================================
+
+# Restrict to the main analysis population ----
+ons_tpp_dates_diff_analysis <- death_registration_processed |>
+  filter(
+    has_ons_date_death == TRUE,
+    has_tpp_death_date == TRUE,
+    flag_any_date_death_implausible == FALSE,
+    flag_is_registered == TRUE
+  )
+
+# table dates difference bw ons - tpp
+table_ons_tpp_dates_diff_overall  <- ons_tpp_dates_diff_analysis |>
+  group_by(death_date_ref_year, dod_diff_groups) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  ) |>
+  mutate(
+    subgroup = "overall",
+    subgroup_value = "All"
+  )
+
+# Counts by subgroup ----
+table_ons_tpp_dates_diff_subgroups <- ons_tpp_dates_diff_analysis |>
+  select(
+    death_date_ref_year,
+    dod_diff_groups,
+    age_band,
+    sex,
+    ethnicity,
+    imd_quintile,
+    rural_urban,
+    region
+  ) |>
+  pivot_longer(
+    cols = c(age_band, sex, ethnicity, imd_quintile, rural_urban, region),
+    names_to = "subgroup",
+    values_to = "subgroup_value"
+  ) |>
+  group_by(death_date_ref_year, dod_diff_groups, subgroup, subgroup_value) |>
+  summarise(
+    total = rounding(n()),  # SDC rounding
+    .groups = "drop"
+  )
+
+# Combine overall and subgroup counts ----
+table_ons_tpp_dates_diff <- bind_rows(
+  table_ons_tpp_dates_diff_overall,
+  table_ons_tpp_dates_diff_subgroups
+) |>
+  mutate(
+    subgroup = factor(
+      subgroup,
+      levels = c(
+        "overall",
+        "age_band",
+        "sex",
+        "ethnicity",
+        "imd_quintile",
+        "rural_urban",
+        "region"
+      )
+    )
+  ) |>
+  group_by(death_date_ref_year, subgroup, subgroup_value) |>
+  mutate(
+    total_subgroup_value = sum(total, na.rm = TRUE),             # denominator
+    perc = total / total_subgroup_value * 100           # proportion
+  ) |>
+  ungroup() |>
+  arrange(death_date_ref_year, subgroup, subgroup_value, dod_diff_groups) |>
+  select(
+    death_date_ref_year,
+    subgroup,
+    subgroup_value,
+    dod_diff_groups,
+    total,
+    total_subgroup_value,
+    perc
+  )
+
+# Export main analysis table ----
+write_csv(
+  table_ons_tpp_dates_diff,
+  here(output_dir_analysis_tables, "table_ons_tpp_dates_diff.csv")
+)
diff --git a/analysis/6_variation_source_by_practice.R b/analysis/6_variation_source_by_practice.R
new file mode 100644
index 0000000..5e0e772
--- /dev/null
+++ b/analysis/6_variation_source_by_practice.R
@@ -0,0 +1,101 @@
+###################################################
+# Author: Martina Pesce / Andrea Schaffer
+# Bennett Institute for Applied Data Science
+# University of Oxford, 2025
+#
+# 5) 
+#
+# Variation across practices in the proportion 
+# of deaths recorded only in ONS using yearly percentiles
+#
+#########################################################################
+
+# Libraries ----
+library(tidyverse)
+library(here)
+library(fs)
+library(lubridate)
+
+# Create output directory ----
+output_dir_analysis_tables <- here("output", "analysis_tables")
+dir_create(output_dir_analysis_tables)
+
+# Import utility functions ----
+source(here("analysis", "0_utility_functions.R"))
+
+# Import data ----
+death_registration_processed <- read_csv(
+  here("output", "highly_sensitive", "death_registration_processed.csv.gz")
+)
+
+# ==================================================
+# Main analysis: dated deaths only
+# ==================================================
+
+# Restrict to the main analysis population
+death_registration_analysis <- death_registration_processed |>
+  filter(
+    flag_any_date_death == TRUE,
+    flag_any_date_death_implausible == FALSE,
+    flag_is_registered == TRUE
+  )
+
+# ==================================================
+# Practice-level % of deaths recorded only in ONS
+# Includes practices with 0% ONS only
+# ==================================================
+
+practice_ons_only <- death_registration_analysis |>
+  group_by(death_date_ref_year, practice) |>
+  summarise(
+    total_practice_year = n(),
+    ons_only_n = sum(death_source == "ONS_only", na.rm = TRUE),
+    perc_ons_only = 100 * ons_only_n / total_practice_year,
+    .groups = "drop"
+  ) |>
+  filter(total_practice_year > 9) |>
+  rename(year = death_date_ref_year)
+
+# ==================================================
+# Practice-level percentiles of % ONS-only deaths
+# Long format for plotting
+# ==================================================
+
+probs <- seq(0.1, 0.9, by = 0.1)
+percentiles <- probs * 100
+
+n_by_year <- practice_ons_only |>
+  group_by(year) |>
+  summarise(
+    n_practices = rounding(n_distinct(practice)),
+    .groups = "drop"
+  )
+
+table_practice_percentiles <- practice_ons_only |>
+  group_by(year) |>
+  summarise(
+    value = list(
+      round(as.numeric(quantile(
+        perc_ons_only,
+        probs = probs,
+        na.rm = TRUE,
+        type = 3)), 1)
+    ),
+    percentile = list(percentiles),
+    .groups = "drop"
+  ) |>
+  unnest(c(percentile, value)) |>
+  left_join(n_by_year, by = "year") |>
+  mutate(
+    line_group = if_else(percentile == 50, "median", "decile")
+  ) |>
+  select(year, n_practices, percentile, value, line_group) |>
+  arrange(year, percentile)
+
+table_practice_percentiles
+
+# Export main analysis table ----
+write_csv(
+  table_practice_percentiles,
+  here(output_dir_analysis_tables, "table_practice_percentiles.csv")
+)
\ No newline at end of file
diff --git a/analysis/dataset_def/dataset_definition.py b/analysis/dataset_def/dataset_definition.py
index 42abaec..91fb106 100644
--- a/analysis/dataset_def/dataset_definition.py
+++ b/analysis/dataset_def/dataset_definition.py
@@ -9,110 +9,119 @@
 #   University of Oxford, 2025
 ###################################################
 
-from ehrql import  when, create_dataset, case, minimum_of, codelist_from_csv, days
-from ehrql.tables.tpp import patients, practice_registrations, ons_deaths, addresses, clinical_events
+from ehrql import create_dataset, case, when, codelist_from_csv
+from ehrql.tables.tpp import (
+    patients,
+    ons_deaths,
+    clinical_events,
+    practice_registrations,
+    addresses,
+)
 
-# initialise dataset
 dataset = create_dataset()
-dataset.configure_dummy_data(population_size=1000)
 
 
-#Dates 
-start_date = "2009-01-01"
-#End date (day before last ONS deaths update)
-end_date = "2025-06-06"
-earliest_DoD = minimum_of(
-    patients.date_of_death,
-    ons_deaths.date,
+# -----------------------------------------------------------------------------
+# Population
+# -----------------------------------------------------------------------------
+
+# Death definitions ------------------
+
+# ONS death date recorded
+has_ons_death_date = ons_deaths.date.is_not_null()
+
+# TPP structured death date recorded
+has_tpp_death_date = patients.date_of_death.is_not_null()
+
+# TPP coded death recorded
+tpp_coded_death_codes = codelist_from_csv(
+    "codelists/nhsd-primary-care-domain-refsets-death_cod.csv",
+    column="code",
 )
 
-year_start_DoD = earliest_DoD.to_first_of_year()
+tpp_coded_death_event = (
+    clinical_events
+    .where(clinical_events.snomedct_code.is_in(tpp_coded_death_codes))
+    .sort_by(clinical_events.date)
+    .first_for_patient()
+)
 
-# Inclusion/exclusion criteria:
+has_tpp_coded_death = tpp_coded_death_event.date.is_not_null()
 
-## Include people alive
-#was_alive = patients.date_of_death.is_after(year_start_DoD) | patients.date_of_death.is_null() | ons_deaths.date.is_after(year_start_DoD) | ons_deaths.date.is_null()
+# Death recorded in any source
+has_any_death = (
+    has_ons_death_date
+    | has_tpp_death_date
+    | has_tpp_coded_death
+)
 
+# Reference death date:
+# 1) ONS death date
+# 2) TPP structured death date
+# 3) TPP coded death date
 
-## Include people registered with a TPP practice
-# has_registration = practice_registrations.for_patient_on(year_start_DoD).exists_for_patient() |  ((patients.date_of_birth.year == year_start_DoD.year) & practice_registrations.for_patient_on(earliest_DoD).exists_for_patient())
+ref_death_date = case(
+    when(has_ons_death_date).then(ons_deaths.date),
+    when(has_tpp_death_date).then(patients.date_of_death),
+    when(has_tpp_coded_death).then(tpp_coded_death_event.date),
+)
 
-## Exclude people >110 years due to risk of incorrectly recorded age;
-has_possible_age= ((patients.age_on(year_start_DoD) < 110) & (patients.age_on(year_start_DoD) > 0)) | (patients.date_of_birth.year == year_start_DoD.year)
 
-## Exclude people with non-male or female sex due to disclosure risk;
-non_disclosive_sex= (patients.sex == "male") | (patients.sex == "female")
+# Age at reference death date ---------------
+age_at_death = patients.age_on(ref_death_date)
 
-# TPP death
-tpp_death_anytime = patients.date_of_death.is_not_null()
-# ONS death
-ons_death_anytime = ons_deaths.date.is_not_null()
+# Keep plausible ages only.
+# Also retain infants aged <1 year, whose integer age may be recorded as 0.
+has_possible_age = (
+    ((age_at_death >= 0) & (age_at_death < 110))
+    | (patients.date_of_birth.year == ref_death_date.year)
+)
 
-# Combine both ONS/TPP
-died_anytime = tpp_death_anytime | ons_death_anytime
+# Restrict to male/female categories for disclosure control -----------------------
+has_non_disclosive_sex = (
+    (patients.sex == "male")
+    | (patients.sex == "female")
+)
 
-# define dataset poppulation
+# Define population
 dataset.define_population(
-    #was_alive & 
-    #has_registration & 
-    has_possible_age & 
-    non_disclosive_sex &
-    died_anytime
-    )
+    has_any_death
+    & has_possible_age
+    & has_non_disclosive_sex
+)
 
+# -----------------------------------------------------------------------------
 # Variables
+# -----------------------------------------------------------------------------
 
-## Death related variables
-
-### Date of death
-dataset.TPP_death_date = patients.date_of_death
+# Death ------
 dataset.ons_death_date = ons_deaths.date
+dataset.tpp_death_date = patients.date_of_death
+dataset.tpp_coded_death_date = tpp_coded_death_event.date
+# dataset.ref_death_date = ref_death_date
 
 
-## Sub-analysis variables
-
-### Age band
-dataset.date_of_birth = patients.date_of_birth
-age = patients.age_on(earliest_DoD)
-
-dataset.age_band = case(
-    when((age < 45)).then("0-44"),
-    when((age >= 45) & (age < 65)).then("45-64"),
-    when((age >= 65) & (age < 75)).then("65-74"),
-    when((age >= 75) & (age < 85)).then("75-84"),
-    when(age >= 85).then("85+"),
+## Last registration -----
+last_registration = (
+    practice_registrations
+    .sort_by(
+        practice_registrations.start_date,
+        practice_registrations.end_date,
     )
+    .last_for_patient()
+)
 
-### Practice (anonymous)
-dataset.practice = practice_registrations.for_patient_on(earliest_DoD).practice_pseudo_id
-
-### Place of death
-dataset.ons_death_place = ons_deaths.place
-
-### Practice region
-dataset.region = practice_registrations.for_patient_on(earliest_DoD).practice_nuts1_region_name
-
-### Rurality 
-dataset.rural_urban = addresses.for_patient_on(earliest_DoD).rural_urban_classification
+dataset.last_registration_start_date = last_registration.start_date
+dataset.last_registration_end_date = last_registration.end_date
 
+# Demographic --------
 
-#IMD
-imd = addresses.for_patient_on(earliest_DoD).imd_rounded
-
-dataset.IMD_q10 = case(
-        when((imd >= 0) & (imd < int(32844 * 1 / 10))).then("1 (most deprived)"),
-        when(imd < int(32844 * 2 / 10)).then("2"),
-        when(imd < int(32844 * 3 / 10)).then("3"),
-        when(imd < int(32844 * 4 / 10)).then("4"),
-        when(imd < int(32844 * 5 / 10)).then("5"),
-        when(imd < int(32844 * 6 / 10)).then("6"),
-        when(imd < int(32844 * 7 / 10)).then("7"),
-        when(imd < int(32844 * 8 / 10)).then("8"),
-        when(imd < int(32844 * 9 / 10)).then("9"),
-        when(imd >= int(32844 * 9 / 10)).then("10 (least deprived)"),
-        otherwise="unknown"
-)
+### Age at ref date of death 
+dataset.date_of_birth = patients.date_of_birth
+dataset.age = patients.age_on(ref_death_date)
 
+# Sex
+dataset.sex = patients.sex
 
 #Ethnicity
 ethnicity5 = codelist_from_csv(
@@ -121,80 +130,104 @@
   category_column="Label_6", # it's 6 because there is an additional "6 - Not stated" but this is not represented in SNOMED, instead corresponding to no ethnicity code
 )
 
-dataset.ethnicity = clinical_events.where(
-        clinical_events.snomedct_code.is_in(ethnicity5)
-    ).sort_by(
-        clinical_events.date
-    ).last_for_patient().snomedct_code.to_category(ethnicity5)
-
-# Sex
-dataset.sex = patients.sex
-
-
-## Last registration date per patient
-last_registration = (
-    practice_registrations
-    .sort_by(
-        practice_registrations.start_date,
-        practice_registrations.end_date
-    )
+dataset.ethnicity = (
+    clinical_events
+    .where(clinical_events.snomedct_code.is_in(ethnicity5))
+    .sort_by(clinical_events.date)
     .last_for_patient()
+    .snomedct_code
+    .to_category(ethnicity5)
 )
 
-# last registration start date
-dataset.last_registration_start_date = last_registration.start_date
+#IMD
+dataset.imd = addresses.for_patient_on(ref_death_date).imd_rounded
 
-# last registration end date
-dataset.last_registration_end_date = last_registration.end_date
+# Rurality 
+dataset.rural_urban = addresses.for_patient_on(ref_death_date).rural_urban_classification
 
+# Practice region
+dataset.region = practice_registrations.for_patient_on(ref_death_date).practice_nuts1_region_name
 
-# Died during study and while registered
-## TPP death during study and while registered
-tpp_death_during_study = (
-    patients.date_of_death.is_on_or_after(start_date) &
-    patients.date_of_death.is_on_or_before(end_date) &    
-    (
-        patients.date_of_death.is_on_or_before(last_registration.end_date + days(30)) |
-        last_registration.end_date.is_null()
-    )
+### Practice (anonymous)
+dataset.practice = practice_registrations.for_patient_on(ref_death_date).practice_pseudo_id
+
+# -----------------------------------------------------------------------------
+# Dummy data
+# -----------------------------------------------------------------------------
+
+dataset.configure_dummy_data(
+    population_size=10000,
+    timeout=180,
+    additional_population_constraint=(
+        (dataset.tpp_death_date.is_on_or_between("2008-01-01", "2026-05-01")
+            | dataset.tpp_death_date.is_null()
+        ) &
+        (dataset.tpp_coded_death_date.is_on_or_between("2008-01-01", "2026-05-01")
+            | dataset.tpp_coded_death_date.is_null()
+        ) &
+        (dataset.ons_death_date.is_on_or_between("2015-01-01", "2026-05-01")
+            | dataset.ons_death_date.is_null()
+        ) &
+        (dataset.last_registration_start_date.is_on_or_between("2008-01-01", "2026-05-01")
+            | dataset.last_registration_start_date.is_null()
+        )
+    ),
 )
 
-dataset.tpp_death_during_study = tpp_death_during_study
+# # Died during study and while registered
+# ## TPP death during study and while registered
+# tpp_death_during_study = (
+#     patients.date_of_death.is_on_or_after(start_date) &
+#     patients.date_of_death.is_on_or_before(end_date) &    
+#     (
+#         patients.date_of_death.is_on_or_before(last_registration.end_date + days(30)) |
+#         last_registration.end_date.is_null()
+#     )
+# )
 
-# ONS death during study and while registered, allowing for missing end_date
-ons_death_during_study = (
-    ons_deaths.date.is_on_or_after(start_date) &
-    ons_deaths.date.is_on_or_before(end_date) &    
-    (
-        ons_deaths.date.is_on_or_before(last_registration.end_date  + days(30)) |
-        last_registration.end_date.is_null()
-    )
-)
+# dataset.tpp_death_during_study = tpp_death_during_study
 
-dataset.ons_death_during_study = ons_death_during_study
+# # ONS death during study and while registered, allowing for missing end_date
+# ons_death_during_study = (
+#     ons_deaths.date.is_on_or_after(start_date) &
+#     ons_deaths.date.is_on_or_before(end_date) &    
+#     (
+#         ons_deaths.date.is_on_or_before(last_registration.end_date  + days(30)) |
+#         last_registration.end_date.is_null()
+#     )
+# )
 
-# died during study any (ONS or TPP)
+# dataset.ons_death_during_study = ons_death_during_study
 
-dataset.any_death_during_study = ons_death_during_study | tpp_death_during_study
+# # died during study any (ONS or TPP)
 
+# dataset.any_death_during_study = ons_death_during_study | tpp_death_during_study
 
-## Include people registered with a TPP practice
-dataset.has_registration = practice_registrations.for_patient_on(year_start_DoD).exists_for_patient() |  ((patients.date_of_birth.year == year_start_DoD.year) & practice_registrations.for_patient_on(earliest_DoD).exists_for_patient())
 
+# ## Include people registered with a TPP practice
+# dataset.has_registration = practice_registrations.for_patient_on(year_start_DoD).exists_for_patient() |  ((patients.date_of_birth.year == year_start_DoD.year) & practice_registrations.for_patient_on(earliest_DoD).exists_for_patient())
+### Practice (anonymous)
+# dataset.practice = practice_registrations.for_patient_on(earliest_DoD).practice_pseudo_id
+# dataset.IMD_q10 = case(
+#         when((imd >= 0) & (imd < int(32844 * 1 / 10))).then("1 (most deprived)"),
+#         when(imd < int(32844 * 2 / 10)).then("2"),
+#         when(imd < int(32844 * 3 / 10)).then("3"),
+#         when(imd < int(32844 * 4 / 10)).then("4"),
+#         when(imd < int(32844 * 5 / 10)).then("5"),
+#         when(imd < int(32844 * 6 / 10)).then("6"),
+#         when(imd < int(32844 * 7 / 10)).then("7"),
+#         when(imd < int(32844 * 8 / 10)).then("8"),
+#         when(imd < int(32844 * 9 / 10)).then("9"),
+#         when(imd >= int(32844 * 9 / 10)).then("10 (least deprived)"),
+#         otherwise="unknown"
+# )
+# dataset.age_band = case(
+#     when((age < 45)).then("0-44"),
+#     when((age >= 45) & (age < 65)).then("45-64"),
+#     when((age >= 65) & (age < 75)).then("65-74"),
+#     when((age >= 75) & (age < 85)).then("75-84"),
+#     when(age >= 85).then("85+"),
+#     )
 
-# Coded date of death
-death_coded = codelist_from_csv(
-  "codelists/nhsd-primary-care-domain-refsets-death_cod.csv",
-  column="code")
 
-dataset.death_coded_date = clinical_events.where(
-        clinical_events.snomedct_code.is_in(death_coded)
-        ).sort_by(clinical_events.date).last_for_patient().date
 
-# Dummy data configuration
-dataset.configure_dummy_data(population_size=10000, timeout=180,
-                             additional_population_constraint=(
-                                 dataset.TPP_death_date.is_on_or_between("2009-01-01","2025-05-01") & 
-                                 dataset.death_coded_date.is_on_or_between("2019-01-01","2025-05-01")
-                                 )
-                                 )
diff --git a/analysis/2_tables_ONS_TPP_reg_dereg.R b/analysis/old/2_tables_ONS_TPP_reg_dereg.R
similarity index 100%
rename from analysis/2_tables_ONS_TPP_reg_dereg.R
rename to analysis/old/2_tables_ONS_TPP_reg_dereg.R
diff --git a/analysis/3_dataset_DoD_TPP_ONS_with_grace_period.R b/analysis/old/3_dataset_DoD_TPP_ONS_with_grace_period.R
similarity index 100%
rename from analysis/3_dataset_DoD_TPP_ONS_with_grace_period.R
rename to analysis/old/3_dataset_DoD_TPP_ONS_with_grace_period.R
diff --git a/analysis/4_source_prop.R b/analysis/old/4_source_prop.R
similarity index 100%
rename from analysis/4_source_prop.R
rename to analysis/old/4_source_prop.R
diff --git a/analysis/5_DoD_diff.R b/analysis/old/5_DoD_diff.R
similarity index 100%
rename from analysis/5_DoD_diff.R
rename to analysis/old/5_DoD_diff.R
diff --git a/analysis/6_practice_by_source.R b/analysis/old/6_practice_by_source.R
similarity index 100%
rename from analysis/6_practice_by_source.R
rename to analysis/old/6_practice_by_source.R
diff --git a/analysis/7_Tables-visual.Rmd b/analysis/old/7_Tables-visual.Rmd
similarity index 100%
rename from analysis/7_Tables-visual.Rmd
rename to analysis/old/7_Tables-visual.Rmd
diff --git a/analysis/post_release/Tables_figures_paper.Rmd b/analysis/post_release/Tables_figures_paper.Rmd
new file mode 100644
index 0000000..de13150
--- /dev/null
+++ b/analysis/post_release/Tables_figures_paper.Rmd
@@ -0,0 +1,675 @@
+---
+title: "Tables and figures death manuscript"
+author: "Martina Pesce"
+date: "2026-02-02"
+output: html_document
+editor_options: 
+  chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+
+```{r}
+# Import libraries
+library(tidyverse)
+library(lubridate)
+library(glue)
+library(here)
+library(patchwork)
+library(scales)
+
+# Create output directory
+output_dir <- here("output", "final_visualization")
+fs::dir_create(output_dir)
+```
+
+
+```{r}
+# =====================================================
+# Import analysis tables
+# =====================================================
+
+# Context
+table_source_raw <- read_csv(
+  here("output", "analysis_tables", "table_source_raw.csv")
+)
+
+# Implausible death dates
+death_implausible_source <- read_csv(
+  here("output", "analysis_tables", "death_implausible_source.csv")
+)
+
+# Registration and deregistration around death
+registration_status_source <- read_csv(
+  here("output", "analysis_tables", "registration_status_source.csv")
+)
+
+reg_start_timing_source <- read_csv(
+  here("output", "analysis_tables", "reg_start_timing_source.csv")
+)
+
+reg_end_timing_source <- read_csv(
+  here("output", "analysis_tables", "reg_end_timing_source.csv")
+)
+
+# Death source comparison
+table_death_source <- read_csv(
+  here("output", "analysis_tables", "table_death_source.csv")
+)
+
+table_death_source_25_26 <- read_csv(
+  here("output", "analysis_tables", "table_death_source_25_26.csv")
+)
+
+tpp_death_code_or_date <- read_csv(
+  here("output", "analysis_tables", "tpp_death_code_or_date.csv")
+)
+
+table_death_source_overall_any_tpp <- read_csv(
+  here("output", "analysis_tables", "table_death_source_overall_any_tpp.csv")
+)
+
+table_practice_percentiles <- read_csv(
+  here("output", "analysis_tables", "table_practice_percentiles.csv")
+)
+
+# Date agreement between ONS and TPP-pcEHR
+table_ons_tpp_dates_diff <- read_csv(
+  here("output", "analysis_tables", "table_ons_tpp_dates_diff.csv")
+)
+
+```
+
+
+```{r}
+# =====================================================
+# Overall summary numbers
+# =====================================================
+
+# Any dated deaths in the dataset, 2009-2025
+pcEHR_2009_2025 <- table_death_source |>
+  filter(
+    death_date_ref_year >= 2009,
+    death_date_ref_year <= 2025
+  ) |>
+  summarise(
+    pcEHR_total = sum(total, na.rm = TRUE),
+    mean_pcEHR_per_year = pcEHR_total / 17
+  )
+
+# Deaths with an ONS date of death, 2020-2025
+pcEHR_2020_2025 <- table_death_source |>
+  filter(
+    death_date_ref_year >= 2020,
+    death_date_ref_year <= 2025
+  ) |>
+  summarise(
+    ons_total = sum(total[death_source %in% c("ONS_only", "Both")], na.rm = TRUE),
+    both_total = sum(total[death_source == "Both"], na.rm = TRUE),
+    tpp_only_total = sum(total[death_source == "TPP_only"], na.rm = TRUE),
+    n_years = n_distinct(death_date_ref_year),
+    ons_mean = ons_total / n_years,
+    both_perc = both_total / ons_total * 100
+  )
+
+pcEHR_2009_2025
+pcEHR_2020_2025
+
+```
+
+
+```{r}
+# =====================================================
+# Table-ready summaries for 2025
+# =====================================================
+
+# Registration status at death
+tbl_1_2025_reg_status_by_source <- registration_status_source |>
+  filter(death_date_ref_year == 2025) |>
+  pivot_wider(
+    names_from = death_source,
+    values_from = total
+  ) |>
+  mutate(
+    any = rowSums(across(c(ONS_only, TPP_only, Both)), na.rm = TRUE),
+    ONS_all = rowSums(across(c(ONS_only, Both)), na.rm = TRUE),
+    TPP_all = rowSums(across(c(TPP_only, Both)), na.rm = TRUE)
+  )
+
+# Time from death to deregistration
+tbl_2_2025_time_to_dereg_by_source <- reg_end_timing_source |>
+  filter(death_date_ref_year == 2025) |>
+  pivot_wider(
+    names_from = death_source,
+    values_from = total
+  ) |>
+  mutate(
+    any = rowSums(across(c(ONS_only, TPP_only, Both)), na.rm = TRUE),
+    ONS_all = rowSums(across(c(ONS_only, Both)), na.rm = TRUE),
+    TPP_all = rowSums(across(c(TPP_only, Both)), na.rm = TRUE)
+  )
+```
+
+
+
+
+```{r}
+# =====================================================
+# Figure 1: source of death over time
+# =====================================================
+
+fig_1_data <- table_death_source |>
+  filter(
+    subgroup == "overall",
+    subgroup_value == "All",
+    death_date_ref_year < 2026
+  )
+
+theme_fig_1 <- theme_minimal() +
+  theme(
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    legend.position = "bottom"
+  )
+
+# Panel A: deaths recorded in both sources vs single source
+fig_1_all_stacked <- fig_1_data |>
+  mutate(
+    concordance = case_when(
+      death_source == "Both" ~ "ONS & TPP-pcEHR",
+      death_source %in% c("ONS_only", "TPP_only") ~ "Single source"
+    ),
+    concordance = factor(
+      concordance,
+      levels = c("ONS & TPP-pcEHR", "Single source")
+    )
+  ) |>
+  group_by(death_date_ref_year, concordance) |>
+  summarise(
+    total = sum(total, na.rm = TRUE),
+    .groups = "drop"
+  )
+
+p_fig_1_all <- ggplot(
+  fig_1_all_stacked,
+  aes(x = death_date_ref_year, y = total, fill = concordance)
+) +
+  geom_col(
+    position = position_stack(reverse = TRUE),
+    color = "white",
+    linewidth = 0.2
+  ) +
+  geom_vline(
+    xintercept = 2018.5,
+    linetype = "dotted",
+    linewidth = 0.8
+  ) +
+  scale_fill_manual(
+    values = c(
+      "ONS & TPP-pcEHR" = "#785EF0",
+      "Single source" = "#BBBBBB"
+    ),
+    name = NULL
+  ) +
+  labs(
+    x = "Year of death",
+    y = "Number of deaths",
+    title = "(A)"
+  ) +
+  theme_fig_1
+
+# Panel B: deaths recorded in only one source
+fig_1_discordant <- fig_1_data |>
+  filter(
+    death_source %in% c("ONS_only", "TPP_only"),
+    death_date_ref_year > 2018
+  ) |>
+  mutate(
+    death_source = factor(
+      death_source,
+      levels = c("ONS_only", "TPP_only")
+    )
+  )
+
+p_fig_1_discordant <- ggplot(
+  fig_1_discordant,
+  aes(x = death_date_ref_year, y = total, fill = death_source)
+) +
+  geom_col(
+    position = "dodge",
+    color = "white",
+    linewidth = 0.2
+  ) +
+  scale_fill_manual(
+    values = c(
+      "ONS_only" = "#648FFF",
+      "TPP_only" = "#DC267F"
+    ),
+    labels = c(
+      "ONS_only" = "ONS only",
+      "TPP_only" = "TPP-pcEHR dated only"
+    ),
+    name = NULL
+  ) +
+  labs(
+    x = "Year of death",
+    y = "Number of Deaths",
+    title = "(B)"
+  ) +
+  theme_fig_1
+
+fig_1 <- p_fig_1_all / p_fig_1_discordant
+fig_1
+```
+
+
+```{r}
+# =====================================================
+# Monthly source breakdown for 2025-2026
+# =====================================================
+
+table_paper_3 <- table_death_source_25_26 |>
+  mutate(year = year(month)) |>
+  group_by(year, month, death_source) |>
+  summarise(
+    n = sum(total, na.rm = TRUE),
+    .groups = "drop"
+  ) |>
+  group_by(year, month) |>
+  mutate(
+    N = sum(n),
+    perc = round(100 * n / N, 1),
+    value = sprintf("%.1f%% (%s/%s)", perc, n, N)
+  ) |>
+  ungroup() |>
+  select(year, month, death_source, value) |>
+  pivot_wider(
+    names_from = death_source,
+    values_from = value
+  ) |>
+  rename(
+    `ONS & TPP-pcEHR` = Both,
+    `Only ONS` = ONS_only,
+    `Only TPP-pcEHR` = TPP_only
+  ) |>
+  mutate(
+    month = format(month, "%b")
+  ) |>
+  arrange(year, month) |>
+  select(year, month, `ONS & TPP-pcEHR`, `Only ONS`, `Only TPP-pcEHR`)
+
+table_paper_3
+```
+
+
+```{r}
+# =====================================================
+# TPP-pcEHR deaths: coded vs dated
+# =====================================================
+
+fig_tpp_codes <- tpp_death_code_or_date |>
+  filter(
+    !is.na(tpp_date_or_coded),
+    death_date_ref_year_w_tpp_codes < 2026
+  ) |>
+  mutate(
+    tpp_date_or_coded = factor(
+      tpp_date_or_coded,
+      levels = c(
+        "tpp_dated_and_coded",
+        "tpp_dated_only",
+        "tpp_coded_only"
+      ),
+      labels = c(
+        "TPP-pcEHR dated & coded",
+        "TPP-pcEHR dated only",
+        "TPP-pcEHR coded only"
+      )
+    )
+  )
+
+p_tpp_codes <- ggplot(
+  fig_tpp_codes,
+  aes(
+    x = death_date_ref_year_w_tpp_codes,
+    y = total,
+    fill = tpp_date_or_coded
+  )
+) +
+  geom_col(
+    position = "stack",
+    color = "white",
+    linewidth = 0.2
+  ) +
+  scale_fill_manual(
+    values = c(
+      "TPP-pcEHR dated & coded" = "#F28E2B",
+      "TPP-pcEHR dated only" = "#D81B60",
+      "TPP-pcEHR coded only" = "#FDD835"
+    ),
+    name = NULL
+  ) +
+  labs(
+    x = "Year of death",
+    y = "Number of deaths",
+    title = "(A)"
+  ) +
+  theme_minimal() +
+  theme(
+    legend.position = "bottom",
+    axis.text.x = element_text(angle = 45, hjust = 1)
+  )
+
+p_tpp_codes
+```
+
+```{r}
+# =====================================================
+# Table: source classification with and without TPP coded deaths
+# =====================================================
+
+table_source_classification <- bind_rows(
+  table_death_source_overall_any_tpp |>
+    filter(
+      death_date_ref_year_w_tpp_codes >= 2019,
+      death_date_ref_year_w_tpp_codes <= 2025
+    ) |>
+    transmute(
+      year = death_date_ref_year_w_tpp_codes,
+      classification = "Including TPP dated or coded deaths",
+      source = case_when(
+        death_source_tpp_date_or_coded == "Both" ~ "ONS & TPP-pcEHR",
+        death_source_tpp_date_or_coded == "ONS_only" ~ "ONS only",
+        death_source_tpp_date_or_coded == "TPP_only" ~ "TPP-pcEHR only"
+      ),
+      n = total,
+      N = total_year
+    ),
+  fig_1_data |>
+    filter(
+      death_date_ref_year >= 2019,
+      death_date_ref_year <= 2025
+    ) |>
+    transmute(
+      year = death_date_ref_year,
+      classification = "Using TPP date only",
+      source = case_when(
+        death_source == "Both" ~ "ONS & TPP-pcEHR",
+        death_source == "ONS_only" ~ "ONS only",
+        death_source == "TPP_only" ~ "TPP-pcEHR only"
+      ),
+      n = total,
+      N = total_subgroup_value
+    )
+) |>
+  mutate(
+    source = factor(
+      source,
+      levels = c("ONS & TPP-pcEHR", "ONS only", "TPP-pcEHR only")
+    ),
+    perc = round(100 * n / N, 1),
+    value = sprintf("%.1f%% (%s/%s)", perc, n, N)
+  ) |>
+  select(classification, year, source, value) |>
+  pivot_wider(
+    names_from = source,
+    values_from = value
+  ) |>
+  arrange(classification, year)
+
+table_source_classification
+```
+
+
+```{r}
+# =====================================================
+# Deaths recorded only in ONS, by subgroup
+# =====================================================
+
+fig_ons_only_subgroups <- table_death_source |>
+  filter(
+    death_source == "ONS_only",
+    death_date_ref_year >= 2020,
+    death_date_ref_year <= 2025,
+    subgroup != "overall",
+    !is.na(perc)
+  )
+
+theme_subgroup <- theme_minimal() +
+  theme(
+    axis.text.x = element_text(angle = 0, hjust = 0.5)
+  )
+
+scale_x_subgroup <- scale_x_continuous(
+  breaks = 2020:2025
+)
+
+scale_y_subgroup <- scale_y_continuous(
+  limits = c(0, 20),
+  breaks = seq(0, 20, by = 5),
+  labels = \(x) paste0(x, "%")
+)
+
+# Age band
+p_age_band <- fig_ons_only_subgroups |>
+  filter(subgroup == "age_band") |>
+  ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) +
+  geom_line(linewidth = 0.8) +
+  geom_point(size = 1.8) +
+  scale_x_subgroup +
+  scale_y_subgroup +
+  labs(
+    x = "Year of death",
+    y = "Deaths recorded only in ONS (%)",
+    color = "Age band"
+  ) +
+  theme_subgroup
+
+# Sex
+p_sex <- fig_ons_only_subgroups |>
+  filter(subgroup == "sex") |>
+  ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) +
+  geom_line(linewidth = 0.8) +
+  geom_point(size = 1.8) +
+  scale_x_subgroup +
+  scale_y_subgroup +
+  labs(
+    x = "Year of death",
+    y = "Deaths recorded only in ONS (%)",
+    color = "Sex"
+  ) +
+  theme_subgroup
+
+# Ethnicity
+p_ethnicity <- fig_ons_only_subgroups |>
+  filter(subgroup == "ethnicity") |>
+  ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) +
+  geom_line(linewidth = 0.8) +
+  geom_point(size = 1.8) +
+  scale_x_subgroup +
+  scale_y_subgroup +
+  labs(
+    x = "Year of death",
+    y = "Deaths recorded only in ONS (%)",
+    color = "Ethnicity"
+  ) +
+  theme_subgroup
+
+# IMD quintile
+p_imd <- fig_ons_only_subgroups |>
+  filter(subgroup == "imd_quintile") |>
+  ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) +
+  geom_line(linewidth = 0.8) +
+  geom_point(size = 1.8) +
+  scale_x_subgroup +
+  scale_y_subgroup +
+  labs(
+    x = "Year of death",
+    y = "Deaths recorded only in ONS (%)",
+    color = "IMD quintile"
+  ) +
+  theme_subgroup
+
+# Rural/urban
+p_rural_urban <- fig_ons_only_subgroups |>
+  filter(subgroup == "rural_urban") |>
+  ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) +
+  geom_line(linewidth = 0.8) +
+  geom_point(size = 1.8) +
+  scale_x_subgroup +
+  scale_y_subgroup +
+  labs(
+    x = "Year of death",
+    y = "Deaths recorded only in ONS (%)",
+    color = "Rural/urban"
+  ) +
+  theme_subgroup
+
+# Region
+p_region <- fig_ons_only_subgroups |>
+  filter(subgroup == "region") |>
+  ggplot(aes(x = death_date_ref_year, y = perc, group = subgroup_value, color = subgroup_value)) +
+  geom_line(linewidth = 0.8) +
+  geom_point(size = 1.8) +
+  scale_x_subgroup +
+  scale_y_subgroup +
+  labs(
+    x = "Year of death",
+    y = "Deaths recorded only in ONS (%)",
+    color = "Region"
+  ) +
+  theme_subgroup
+```
+
+
+```{r}
+# =====================================================
+# Difference in date of death between ONS and TPP-pcEHR
+# =====================================================
+
+table_dod_diff_3cats <- table_ons_tpp_dates_diff |>
+  filter(
+    subgroup == "overall",
+    subgroup_value == "All",
+    death_date_ref_year %in% 2020:2025
+  ) |>
+  mutate(
+    dod_diff_3cat = case_when(
+      dod_diff_groups == "0" ~ "Same day",
+      dod_diff_groups %in% c("-1 to -7", "1-7") ~ "Within 7 days",
+      dod_diff_groups %in% c("-8 to -30", "-31+", "8 to 30", "31+") ~ "More than 7 days",
+      TRUE ~ NA_character_
+    )
+  ) |>
+  filter(!is.na(dod_diff_3cat)) |>
+  group_by(death_date_ref_year, dod_diff_3cat) |>
+  summarise(
+    total = sum(total, na.rm = TRUE),
+    .groups = "drop"
+  ) |>
+  group_by(death_date_ref_year) |>
+  mutate(
+    total_year = sum(total, na.rm = TRUE),
+    perc = total / total_year * 100,
+    dod_diff_3cat = factor(
+      dod_diff_3cat,
+      levels = c("Same day", "Within 7 days", "More than 7 days")
+    )
+  ) |>
+  ungroup()
+
+ggplot(
+  table_dod_diff_3cats,
+  aes(
+    x = factor(death_date_ref_year),
+    y = perc,
+    fill = dod_diff_3cat
+  )
+) +
+  geom_col(width = 0.8) +
+  scale_y_continuous(
+    labels = \(x) paste0(x, "%")
+  ) +
+  scale_fill_manual(
+    values = c(
+      "Same day" = "#44aa99",
+      "Within 7 days" = "#88ccee",
+      "More than 7 days" = "#ddcc77"
+    ),
+    name = NULL
+  ) +
+  labs(
+    x = "Year of death",
+    y = "Percentage"
+  ) +
+  theme_minimal()
+
+```
+
+```{r}
+# =====================================================
+# Distribution of ONS-only deaths across practices (deciles)
+# =====================================================
+
+practice_deciles_ons_perc <- ggplot(
+  table_practice_percentiles,
+  aes(
+    x = year,
+    y = value,
+    group = percentile,
+    linetype = line_group,
+    color = line_group,
+    linewidth = line_group
+  )
+) +
+  geom_line() +
+
+  # Aesthetics: median vs deciles
+  scale_color_manual(
+    values = c(decile = "grey60", median = "black")
+  ) +
+  scale_linetype_manual(
+    values = c(decile = "dashed", median = "solid")
+  ) +
+  scale_linewidth_manual(
+    values = c(decile = 0.3, median = 0.9)
+  ) +
+
+  # Axes
+  scale_x_continuous(
+    breaks = sort(unique(table_practice_percentiles$year))
+  ) +
+  scale_y_continuous(
+    labels = \(x) paste0(x, "%")
+  ) +
+
+  labs(
+    title = "% of deaths recorded only in ONS across practices",
+    subtitle = paste0(
+      "Practices per year (≥10 deaths): ",
+      paste(
+        with(
+          distinct(table_practice_percentiles, year, n_practices),
+          paste0(year, "=", n_practices)
+        ),
+        collapse = " · "
+      )
+    ),
+    x = "Year",
+    y = "ONS-only deaths (%)",
+    color = NULL,
+    linetype = NULL,
+    linewidth = NULL
+  ) +
+
+  theme_minimal(base_size = 12) +
+  theme(
+    legend.position = "bottom",
+    axis.text.x = element_text(angle = 45, hjust = 1)
+  )
+
+practice_deciles_ons_perc
+```
+
diff --git a/project.yaml b/project.yaml
index 889f15c..8c7e387 100644
--- a/project.yaml
+++ b/project.yaml
@@ -1,91 +1,63 @@
-version: '4.0'
+version: '5.0'
 
 actions:
-#Dataset for diff time
-  dataset_death_TPP_ONS:
-    run: ehrql:v1 generate-dataset analysis/dataset_def/dataset_definition.py --output output/highly_sensitive/dataset_death_TPP_ONS.csv.gz
+
+  dataset_death_raw:
+    run: ehrql:v1 generate-dataset analysis/dataset_def/dataset_definition.py 
+      --output output/highly_sensitive/dataset_death_TPP_ONS.csv.gz
     outputs:
       highly_sensitive:
         dataset: output/highly_sensitive/dataset_death_TPP_ONS.csv.gz
-  
-  summary_stat:
-    run: r:v2 analysis/1_summary_stats.R
-      --output output/summary_stats/*.csv 
-    needs: [dataset_death_TPP_ONS]
+
+  dataset_death_processed:
+    run: r:v2 analysis/1_derive_key_variables.R 
+    needs: [dataset_death_raw]
     outputs:
+      highly_sensitive:
+        csv: output/highly_sensitive/death_registration_processed.csv.gz
       moderately_sensitive:
-        tables: output/summary_stats/*.csv
+        txt: output/analysis_tables/death_registration_processed_skim.txt
+        csv: output/analysis_tables/table_source_raw.csv
 
-  reg_dereg_ONS_TPP:
-    run: r:v2 analysis/2_tables_ONS_TPP_reg_dereg.R 
-      --output output/report/reg_dereg_ONS_TPP/*.csv 
-    needs: [dataset_death_TPP_ONS]
+
+  report_implausible_death_dates:
+    run: r:v2 analysis/2_implausible_death_dates.R 
+    needs: [dataset_death_processed]
     outputs:
       moderately_sensitive:
-        csv: output/report/reg_dereg_ONS_TPP/*.csv
+        death_implausible_source: output/analysis_tables/death_implausible_source.csv
 
-  Dataset_source_dod_diff_processed:
-    run: r:v2 analysis/3_dataset_DoD_TPP_ONS_with_grace_period.R
-      --output output/highly_sensitive/DoD_TPP_ONS_with_grace_period.csv
-    needs: [dataset_death_TPP_ONS]
+  report_registration_at_death:
+    run: r:v2 analysis/3_registration_at_death.R 
+    needs: [dataset_death_processed]
     outputs:
-      highly_sensitive:
-        csv: output/highly_sensitive/DoD_TPP_ONS_with_grace_period.csv
+      moderately_sensitive:
+        registration_status_source: output/analysis_tables/registration_status_source.csv 
+        reg_start_timing_source: output/analysis_tables/reg_start_timing_source.csv
+        reg_end_timing_source: output/analysis_tables/reg_end_timing_source.csv
 
-  death_source:
-    run: r:v2 analysis/4_source_prop.R
-      --output output/report/by_source/*.csv 
-    needs: [Dataset_source_dod_diff_processed]
+  report_death_source_comparison:
+    run: r:v2 analysis/4_death_source_comparison.R 
+    needs: [dataset_death_processed]
     outputs:
       moderately_sensitive:
-        csv: output/report/by_source/*.csv
+        table_death_source: output/analysis_tables/table_death_source.csv
+        table_death_source_25_26: output/analysis_tables/table_death_source_25_26.csv
+        tpp_death_code_or_date: output/analysis_tables/tpp_death_code_or_date.csv
+        table_death_source_overall_any_tpp: output/analysis_tables/table_death_source_overall_any_tpp.csv
 
-  DoD_difference:
-    run: r:v2 analysis/5_DoD_diff.R
-      --output output/report/DoD_diff/*.csv 
-    needs: [Dataset_source_dod_diff_processed]
+  report_sources_date_agreement:
+    run: r:v2 analysis/5_sources_date_agreement.R 
+    needs: [dataset_death_processed]
     outputs:
       moderately_sensitive:
-        csv: output/report/DoD_diff/*.csv
-        
-# Measures
-  # measures_overall:
-  #   run: ehrql:v1 generate-measures analysis/measure_def.py 
-  #     --output output/measures/measures_overall.csv
-  #   outputs:
-  #     moderately_sensitive:
-  #       measure_csv: output/measures/measures_overall.csv
+        table_ons_tpp_dates_diff: output/analysis_tables/table_ons_tpp_dates_diff.csv
   
-  # table_measure_rate:
-  #   run: r:v2 analysis/Table_measure_rate.R
-  #     --output output/report/*.csv 
-  #   needs: [measures_overall]
-  #   outputs:
-  #     moderately_sensitive:
-  #       categorical: output/report/collate_measures_rate_table.csv
-
-  measures_by_practice:
-    run: ehrql:v1 generate-measures analysis/dataset_def/measure_practice.py 
-      --output output/highly_sensitive/measures_by_practice.csv
-    outputs:
-      highly_sensitive:
-        measure_csv: output/highly_sensitive/measures_by_practice.csv
-
-  decile_table_figure:
-    run:  r:v2 analysis/6_practice_by_source.R
-    needs: [measures_by_practice]
+  report_variation_source_by_practice:
+    run: r:v2 analysis/6_variation_source_by_practice.R 
+    needs: [dataset_death_processed]
     outputs:
       moderately_sensitive:
-        csv: output/report/source_by_practice/*.csv
-        png: output/report/source_by_practice/*.png  
-
-
-  # visualization:
-  #   run:  r:v2 Final-visualizations.R
-  #   needs: [table_DoD]
-  #   outputs:
-  #     moderately_sensitive:
-  #       csv: output/final_visualization/*.csv
-  #       png: output/final_visualization/*.png 
-
-#
\ No newline at end of file
+        table_practice_percentiles: output/analysis_tables/table_practice_percentiles.csv
+  
+# end----