Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 43 additions & 58 deletions analysis/summary_stats.R → analysis/1_summary_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ library("here")
library("skimr")

## Create output directory
output_dir <- here("output", "DoD")
output_dir <- here("output", "summary_stats")
fs::dir_create(output_dir)

# Import processed data ----
DoD_diff <- read_csv("output/dataset_death_date_diff.csv.gz") %>%
dataset_death_TPP_ONS <- read_csv("output/highly_sensitive/dataset_death_TPP_ONS.csv.gz") %>%
mutate(
region = as.factor(region),
age_band = as.factor(age_band),
Expand All @@ -32,69 +32,70 @@ DoD_diff <- read_csv("output/dataset_death_date_diff.csv.gz") %>%
)
)

DoD_diff_plus_inc_crit <- DoD_diff %>%
dataset_death_TPP_ONS_plus_inc_crit <- dataset_death_TPP_ONS %>%
filter(
has_registration == TRUE & # was registered at the beginning of the year the person died
any_death_during_study == TRUE # died between "2009-01-01" - "2025-06-06" + (deregistration date + 30 days) is after one date of death
)

DoD_diff_plus_inc_crit_0_dereg_death <- DoD_diff_plus_inc_crit %>%
dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death <- dataset_death_TPP_ONS_plus_inc_crit %>%
filter(
death_dereg_diff >= 0)



# Summary
## Gral
summary_DoD_diff <- skim(DoD_diff)
summary_dataset_death_TPP_ONS <- skim(dataset_death_TPP_ONS)

summary_DoD_diff_plus_inc_crit <- skim(DoD_diff_plus_inc_crit)
summary_dataset_death_TPP_ONS_plus_inc_crit <- skim(dataset_death_TPP_ONS_plus_inc_crit)

summary_DoD_diff_plus_inc_crit_0_dereg_death <- skim(DoD_diff_plus_inc_crit_0_dereg_death)
summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death <- skim(dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death)

write.csv(summary_DoD_diff, file = here::here("output", "DoD","summary_stats_DoD.csv"),
write.csv(summary_dataset_death_TPP_ONS, file = here::here("output", "summary_stats","summary_stats_dataset_death_TPP_ONS.csv"),
row.names = FALSE)

write.csv(summary_DoD_diff_plus_inc_crit, file = here::here("output", "DoD","summary_DoD_diff_plus_inc_crit.csv"),
write.csv(summary_dataset_death_TPP_ONS_plus_inc_crit, file = here::here("output", "summary_stats","summary_dataset_death_TPP_ONS_plus_inc_crit.csv"),
row.names = FALSE)

write.csv(summary_DoD_diff_plus_inc_crit_0_dereg_death, file = here::here("output", "DoD","summary_DoD_diff_plus_inc_crit_0_dereg_death.csv"),
write.csv(summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death, file = here::here("output", "summary_stats","summary_dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death.csv"),
row.names = FALSE)

## Cat
table_freq <- DoD_diff %>%
## Categorical variables

table_freq <- dataset_death_TPP_ONS %>%
pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>%
group_by(year(min_DoD), subgroup, category) %>%
summarise(
n=n()
)

table_freq_plus_inc_crit <- DoD_diff_plus_inc_crit %>%
table_freq_plus_inc_crit <- dataset_death_TPP_ONS_plus_inc_crit %>%
pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>%
group_by(year(min_DoD), subgroup, category) %>%
summarise(
n=n()
)

table_freq_plus_inc_crit_0_dereg_death <- DoD_diff_plus_inc_crit_0_dereg_death %>%
table_freq_plus_inc_crit_0_dereg_death <- dataset_death_TPP_ONS_plus_inc_crit_0_dereg_death %>%
pivot_longer(cols = c(age_band, ons_death_place:ethnicity), names_to = "subgroup", values_to = "category") %>%
group_by(year(min_DoD), subgroup, category) %>%
summarise(
n=n()
)

write.csv(table_freq, file = here::here("output", "DoD","table_freq_DoD.csv"),
write.csv(table_freq, file = here::here("output", "summary_stats","table_freq_DoD.csv"),
row.names = FALSE)

write.csv(table_freq_plus_inc_crit, file = here::here("output", "DoD","table_freq_plus_inc_crit.csv"),
write.csv(table_freq_plus_inc_crit, file = here::here("output", "summary_stats","table_freq_plus_inc_crit.csv"),
row.names = FALSE)

write.csv(table_freq_plus_inc_crit_0_dereg_death, file = here::here("output", "DoD","table_freq_plus_inc_crit_0_dereg_death.csv"),
write.csv(table_freq_plus_inc_crit_0_dereg_death, file = here::here("output", "summary_stats","table_freq_plus_inc_crit_0_dereg_death.csv"),
row.names = FALSE)


# Impossible dates of death not release
impossible_dod_month <- DoD_diff %>%
# Impossible dates of death
impossible_dod <- dataset_death_TPP_ONS %>%
mutate(
ons_DoD_impossible = case_when(
ons_death_date < date_of_birth ~ "death_before_birth",
Expand All @@ -110,7 +111,8 @@ impossible_dod_month <- DoD_diff %>%
is.na(TPP_death_date) ~ "is empty",
TRUE ~ "ok"
),
year_month_min_dod = format(min_DoD, "%Y-%m")
year_month_min_dod = format(min_DoD, "%Y-%m"),
year_min_dod = format(min_DoD, "%Y")
) %>%
pivot_longer(
cols = c(ons_DoD_impossible, TPP_DoD_impossible),
Expand All @@ -123,7 +125,10 @@ impossible_dod_month <- DoD_diff %>%
source == "TPP_DoD_impossible" ~ "TPP",
TRUE ~ source
)
) %>%
)

# by month
impossible_dod_month <- impossible_dod %>%
group_by(year_month_min_dod, source, DoD_impossible) %>%
summarise(
n = n(),
Expand All @@ -132,45 +137,25 @@ impossible_dod_month <- DoD_diff %>%

write.csv(
impossible_dod_month,
here::here("output", "DoD", "impossible_dod_month.csv"),
here::here("output", "summary_stats", "impossible_dod_month.csv"),
row.names = FALSE
)


# by year
impossible_dod_year <- impossible_dod %>%
group_by(year_min_dod, source, DoD_impossible) %>%
summarise(
n = n(),
.groups = "drop"
)

write.csv(
impossible_dod_year,
here::here("output", "summary_stats", "impossible_dod_year.csv"),
row.names = FALSE
)
# # plots
#
# # Plot histogram faceted by data source
# ## Data prep
# DoD_diff_date_long <- DoD_diff %>%
# pivot_longer(
# cols = c(TPP_death_date, ons_death_date, min_DoD),
# names_to = "source",
# values_to = "death_date"
# )

# ## Plot
# DoD_histogram <- ggplot(DoD_diff_date_long, aes(x = death_date)) +
# geom_histogram() +
# facet_wrap(~ source, scales = "free_y") +
# theme_minimal() +
# labs(title = "Death Dates by Source", x = "Date", y = "Count")
#
#
# # Cat variables bar plots
# ## Data prep
# DoD_diff_cat_long <- DoD_diff %>%
# pivot_longer(
# cols = c(ons_death_place, region, rural_urban, age_band, IMD_q10, ethnicity),
# names_to = "variable",
# values_to = "category"
# )
#
# # Bar plots subcat
# DoD_diff_cat_bar_plot <- ggplot(DoD_diff_cat_long, aes(x = category)) +
# geom_bar(aes(fill=year)) +
# facet_wrap(~ variable, scales = "free_x") +
# theme_minimal() +
# labs(title = "Counts by subgroup",
# x = "Category", y = "Count") +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))




Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ output_dir <- here("output", "report", "reg_dereg_ONS_TPP")
fs::dir_create(output_dir)

# Import processed data ----
dataset0 <- read_csv("output/dataset_death_date_diff.csv.gz") %>%
dataset_death_TPP_ONS <- read_csv("output/highly_sensitive/dataset_death_TPP_ONS.csv.gz") %>%
mutate(
TPP_death_date = as.Date(TPP_death_date),
ons_death_date = as.Date(ons_death_date),
Expand All @@ -38,7 +38,7 @@ rounding <- function(vars) {

# Data curating ---------------

dataset <- dataset0 %>%
death_TPP_ONS_reg_dereg <- dataset_death_TPP_ONS %>%
mutate(
TPP_death_reg = case_when(
!is.na(TPP_death_date) & !is.na(last_registration_start_date) ~
Expand Down Expand Up @@ -102,30 +102,30 @@ dataset <- dataset0 %>%

# a- ONS deaths distribution across time and inclusion criteria

ONS_death_year_by_reg <- dataset %>%
ONS_reg_tpp_year <- death_TPP_ONS_reg_dereg %>%
filter(!is.na(ons_death_date)) %>%
group_by(ons_death_year = year(ons_death_date)) %>%
summarise(
any_ons_death = rounding(n()), # any ONS death
ons_death_regist_before = rounding(sum(has_registration == TRUE, na.rm = TRUE)), # ONS + last registration before death
ons_registred_during = rounding(sum(ons_death_during_study == TRUE & has_registration == TRUE, na.rm = TRUE)),
tpp_any = rounding(sum(!is.na(TPP_death_date), na.rm = TRUE)), # any ONS death + any TPP death
tpp_reg_during_study = rounding(sum(tpp_death_during_study == TRUE & has_registration == TRUE, na.rm = TRUE)), # ONS + TPP + last dereg date after death
ons_death_regis = rounding(sum(has_registration == TRUE, na.rm = TRUE)), # ONS + last registration before death
ons_regis_and_during_study = rounding(sum(ons_death_during_study == TRUE & has_registration == TRUE, na.rm = TRUE)),
any_ons_tpp = rounding(sum(!is.na(TPP_death_date), na.rm = TRUE)), # any ONS death + any TPP death
ons_tpp_regis_and_during_study = rounding(sum(tpp_death_during_study == TRUE & has_registration == TRUE, na.rm = TRUE)), # ONS + TPP + last dereg date after death
.groups = "drop"
)

write.csv(ONS_death_year_by_reg, here::here("output", "report", "reg_dereg_ONS_TPP", "ONS_death_year_by_reg.csv"))
write.csv(ONS_reg_tpp_year, here::here("output", "report", "reg_dereg_ONS_TPP", "ONS_reg_tpp_year.csv"))

# b- Difference date of death - last registration date
ONS_death_reg_group<- dataset %>%
ONS_death_reg_group<- death_TPP_ONS_reg_dereg %>%
filter(!is.na(ons_death_date)) %>%
group_by(ons_death_year = year(ons_death_date), ons_death_reg_group) %>%
summarise(n = rounding(n()), .groups = "drop")

write.csv(ONS_death_reg_group, here::here("output", "report", "reg_dereg_ONS_TPP", "ONS_death_reg_group.csv"))

# c- Difference deregistration - death
ons_death_dereg_group <- dataset %>%
ons_death_dereg_group <- death_TPP_ONS_reg_dereg %>%
filter(!is.na(ons_death_date) & has_registration == TRUE) %>%
group_by(ons_death_year = year(ons_death_date), ons_death_dereg_group) %>%
summarise(n = rounding(n()), .groups = "drop")
Expand All @@ -137,15 +137,15 @@ write.csv(ons_death_dereg_group, here::here("output", "report", "reg_dereg_ONS_T
# a- NO

# b- Difference date of death - last registration date
TPP_death_reg_group<- dataset %>%
TPP_death_reg_group<- death_TPP_ONS_reg_dereg %>%
filter(!is.na(TPP_death_date)) %>%
group_by(TPP_death_year = year(TPP_death_date), TPP_death_reg_group) %>%
summarise(n = rounding(n()), .groups = "drop")

write.csv(TPP_death_reg_group, here::here("output", "report", "reg_dereg_ONS_TPP", "TPP_death_reg_group.csv"))

# c- Difference deregistration - death
TPP_death_dereg_group <- dataset %>%
TPP_death_dereg_group <- death_TPP_ONS_reg_dereg %>%
filter(!is.na(TPP_death_date) & has_registration == TRUE) %>%
group_by(TPP_death_year = year(TPP_death_date), TPP_death_dereg_group) %>%
summarise(n = rounding(n()), .groups = "drop")
Expand All @@ -158,7 +158,7 @@ write.csv(TPP_death_dereg_group, here::here("output", "report", "reg_dereg_ONS_T
# granular table not for release

# Granular table: daily difference deregistration - death for ONS
ons_death_dereg_daily <- dataset %>%
ons_death_dereg_daily <- death_TPP_ONS_reg_dereg %>%
filter(
!is.na(ons_death_date),
!is.na(last_registration_end_date),
Expand All @@ -177,12 +177,12 @@ ons_death_dereg_daily <- dataset %>%

write.csv(
ons_death_dereg_daily,
here::here("output", "report", "reg_dereg_ONS_TPP", "ons_death_dereg_daily.csv"),
here::here("output", "report", "reg_dereg_ONS_TPP", "nr_ons_death_dereg_daily.csv"),
row.names = FALSE
)

# Granular table: daily difference deregistration - death for TPP
TPP_death_dereg_daily <- dataset %>%
TPP_death_dereg_daily <- death_TPP_ONS_reg_dereg %>%
filter(
!is.na(TPP_death_date),
has_registration == TRUE,
Expand All @@ -202,6 +202,6 @@ TPP_death_dereg_daily <- dataset %>%

write.csv(
TPP_death_dereg_daily,
here::here("output", "report", "reg_dereg_ONS_TPP", "TPP_death_dereg_daily.csv"),
here::here("output", "report", "reg_dereg_ONS_TPP", "nr_TPP_death_dereg_daily.csv"),
row.names = FALSE
)
Loading