Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions analysis/codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

interpreter_migrant_codes = codelist_from_csv("codelists/opensafely-interpreter-required.csv", column="code")

trafficking_codes = codelist_from_csv("codelists/opensafely-trafficking-and-modern-slavery.csv", column="code")

british_ethnicities_codes = codelist_from_csv("codelists/opensafely-british-ethnicities.csv", column="code")

ethnicity_16_level_codelist = codelist_from_csv(
Expand Down
182 changes: 182 additions & 0 deletions analysis/dataset_definition_date_of_entry_cohort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# #############################################################################
# Number of migrants in OpenSAFELY-TPP from 2009-2025
# - Author: Yamina Boukari
# - Bennett Institute for Applied Data Science, University of Oxford, 2025
#############################################################################

# This is a script to create a cohort of people who:
# 1) have a date of UK entry code
# 2) were registered at any time during the study period (2009-2025)
# 3) had a first registration that was between their birth and death dates
# 4) do not have a disclosive sex
# 5) did not die before or on 1st Jan 2009 (study start)
# 6) had a plausible age at the beginning of the study period (i.e. not >110 years old in 2009)

from pathlib import Path

from ehrql import create_dataset, codelist_from_csv, show, case, when, days
from ehrql.tables.tpp import addresses, patients, practice_registrations, clinical_events, ons_deaths
import codelists
import migration_status_variables

# Dates

# ISO-format date strings bounding the study period
study_start_date = "2009-01-01"
study_end_date = "2025-12-31"

# Start date of the registration that comes first when each patient's
# registrations are ordered by start date
date_of_first_practice_registration = (
    practice_registrations
    .sort_by(practice_registrations.start_date)
    .first_for_patient()
    .start_date
)

# End date of the registration that comes last when each patient's
# registrations are ordered by end date.
# NOTE(review): ehrQL documents that NULL sorts before any other value, so an
# open-ended registration (null end_date) would only be selected here when the
# patient has no closed registrations - confirm this is the intended meaning.
end_date_of_latest_practice_registration = (
    practice_registrations
    .sort_by(practice_registrations.end_date)
    .last_for_patient()
    .end_date
)

# A patient counts as registered during the study if ANY one of their practice
# registrations overlaps the study period, i.e. the registration
#   - started on or before the study end date, and
#   - has not ended (null end_date) or ended on or after the study start date.
#
# Each registration row is tested individually rather than combining the
# earliest start date with the "latest" end date: ehrQL sorts NULL before any
# other value, so for a patient with an old closed registration plus a current
# open-ended one the latest-by-end-date row is the closed registration, and a
# patient who is registered throughout the study could be wrongly excluded.
is_registered_at_any_time_during_study = (
    practice_registrations
    .where(practice_registrations.start_date.is_on_or_before(study_end_date))
    .where(
        practice_registrations.end_date.is_null()
        | practice_registrations.end_date.is_on_or_after(study_start_date)
    )
    .exists_for_patient()
)

# The first registration must start on or after the recorded date of birth
# and, when a date of death is recorded, on or before that date of death.
has_first_registration_between_birth_and_death = (
    date_of_first_practice_registration.is_on_or_after(patients.date_of_birth)
    & (
        # still alive: no upper bound to check
        patients.date_of_death.is_null()
        # died: registration must not start after death
        | date_of_first_practice_registration.is_on_or_before(patients.date_of_death)
    )
)


# Keep only patients whose recorded sex is "male" or "female"
has_non_disclosive_sex = patients.sex.is_in(["male", "female"])

# Both death sources (the patients table and ONS deaths) must either hold no
# date of death or a date strictly after the study start date, so deaths on
# the study start date itself are excluded as well.
did_not_die_before_study_start = (
    (patients.date_of_death.is_null() | patients.date_of_death.is_after(study_start_date))
    & (ons_deaths.date.is_null() | ons_deaths.date.is_after(study_start_date))
)

# Plausible-age filter: the patient must be no older than 110 at the study
# start AND must have been born by the study end (age >= 0).
# Both conditions have to hold, so they are combined with `&`; combining them
# with `|` would let through anyone over 110 at study start, because such a
# patient necessarily has a non-negative age at study end.
was_not_over_110_at_study_start_or_less_than_0_at_end_date = (
    (patients.age_on(study_start_date) <= 110)
    & (patients.age_on(study_end_date) >= 0)
)

# has date of UK entry code

# SNOMED CT code(s) matched against clinical events to find a date of UK entry
date_of_entry_code = ["860021000000109"]

# True when the patient has at least one matching event dated between their
# date of birth and the study end date, and not after their date of death
# (when a date of death is recorded).
has_date_of_uk_entry = clinical_events.where(
    clinical_events.snomedct_code.is_in(date_of_entry_code)
    & clinical_events.date.is_on_or_between(patients.date_of_birth, study_end_date)
    & (
        patients.date_of_death.is_null()
        | clinical_events.date.is_on_or_before(patients.date_of_death)
    )
).exists_for_patient()


dataset = create_dataset()

# A patient enters the cohort only when every inclusion criterion holds
dataset.define_population(
    has_date_of_uk_entry
    & is_registered_at_any_time_during_study
    & has_first_registration_between_birth_and_death
    & has_non_disclosive_sex
    & did_not_die_before_study_start
    & was_not_over_110_at_study_start_or_less_than_0_at_end_date
)

# add variables

## year of birth and date of birth
year_of_birth = patients.date_of_birth.year
dataset.year_of_birth = year_of_birth

# Inclusive (first_year, last_year) bounds of each band; a band's label is
# "<first_year>-<last_year>". No fallback value is supplied, so a year of
# birth outside every band gets no band.
year_of_birth_bands = [
    (1900, 1925),
    (1926, 1945),
    (1946, 1965),
    (1966, 1985),
    (1986, 2005),
    (2006, 2025),
]

dataset.year_of_birth_band = case(
    *[
        when((year_of_birth >= first_year) & (year_of_birth <= last_year)).then(
            f"{first_year}-{last_year}"
        )
        for first_year, last_year in year_of_birth_bands
    ]
)

dataset.date_of_birth = patients.date_of_birth

## sex

dataset.sex = patients.sex

## ethnicity

# Events carrying a code from the 16-level ethnicity codelist, dated on or
# before the study end date
ethnicity_events = clinical_events.where(
    clinical_events.snomedct_code.is_in(codelists.ethnicity_16_level_codelist)
    & clinical_events.date.is_on_or_before(study_end_date)
)

# Code of the most recently dated of those events
latest_ethnicity_code = (
    ethnicity_events
    .sort_by(clinical_events.date)
    .last_for_patient()
    .snomedct_code
)
dataset.latest_ethnicity_code = latest_ethnicity_code

# Map the latest code onto the 16-level and 6-level groupings
latest_ethnicity_16_level_group = latest_ethnicity_code.to_category(
    codelists.ethnicity_16_level_codelist
)
latest_ethnicity_6_level_group = latest_ethnicity_code.to_category(
    codelists.ethnicity_6_level_codelist
)
dataset.latest_ethnicity_16_level_group = latest_ethnicity_16_level_group
dataset.latest_ethnicity_6_level_group = latest_ethnicity_6_level_group

## practice region

# Region of the registration with the latest start date.
# NOTE(review): the registrations are not filtered to the study period before
# the latest one is picked - confirm whether that restriction is wanted.
latest_registration = (
    practice_registrations
    .sort_by(practice_registrations.start_date)
    .last_for_patient()
)
dataset.region = latest_registration.practice_nuts1_region_name

## imd

# Address record with the latest start date
address = addresses.sort_by(addresses.start_date).last_for_patient()

dataset.imd_decile = address.imd_decile
dataset.imd_quintile = address.imd_quintile

## date of first practice registration and date of death

dataset.date_of_first_practice_registration = date_of_first_practice_registration
dataset.date_of_death = patients.date_of_death

# migration status

# Mapping of column name -> indicator, built by the project helper using the
# study end date
migrant_indicators = migration_status_variables.build_migrant_indicators(study_end_date)

# Add every individual indicator to the dataset under its own name
for indicator_name, indicator_series in migrant_indicators.items():
    setattr(dataset, indicator_name, indicator_series)

## consolidate migration indicators into 2-cat, 3-cat and 6-cat variables

dataset.mig_status_2_cat = migration_status_variables.build_mig_status_2_cat(
    migrant_indicators
)
dataset.mig_status_3_cat = migration_status_variables.build_mig_status_3_cat(
    migrant_indicators
)
dataset.mig_status_6_cat = migration_status_variables.build_mig_status_6_cat(
    migrant_indicators
)

dataset.configure_dummy_data(population_size=1000)
show(dataset)


18 changes: 9 additions & 9 deletions analysis/generate_annual_migrant_counts_2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@

# overall (ungrouped) measures

for label in labels:
bool_numer = (mig2_expr == label)
safe_label = label.lower().replace(" ", "_").replace("-", "_")
var_name = "mig_status_2_cat_overall"
# for label in labels:
# bool_numer = (mig2_expr == label)
# safe_label = label.lower().replace(" ", "_").replace("-", "_")
# var_name = "mig_status_2_cat_overall"

name = f"{var_name}_{safe_label}"
# name = f"{var_name}_{safe_label}"

measures.define_measure(
name=name,
numerator=bool_numer
)
# measures.define_measure(
# name=name,
# numerator=bool_numer
# )
18 changes: 9 additions & 9 deletions analysis/generate_annual_migrant_counts_3cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@

# overall (ungrouped) measures

for label in labels:
bool_numer = (mig3_expr == label)
safe_label = label.lower().replace(" ", "_").replace("-", "_")
var_name = "mig_status_3_cat_overall"
# for label in labels:
# bool_numer = (mig3_expr == label)
# safe_label = label.lower().replace(" ", "_").replace("-", "_")
# var_name = "mig_status_3_cat_overall"

name = f"{var_name}_{safe_label}"
# name = f"{var_name}_{safe_label}"

measures.define_measure(
name=name,
numerator=bool_numer
)
# measures.define_measure(
# name=name,
# numerator=bool_numer
# )
20 changes: 10 additions & 10 deletions analysis/generate_annual_migrant_counts_6cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,16 @@
name = f"{var_name}_{safe_label}_{suffix}"
measures.define_measure(name=name, numerator=bool_numer, group_by=group)

# overall (ungrouped) measures
# # overall (ungrouped) measures

for label in labels:
bool_numer = (mig6_expr == label)
safe_label = label.lower().replace(" ", "_").replace("-", "_")
var_name = "mig_status_6_cat_overall"
# for label in labels:
# bool_numer = (mig6_expr == label)
# safe_label = label.lower().replace(" ", "_").replace("-", "_")
# var_name = "mig_status_6_cat_overall"

name = f"{var_name}_{safe_label}"
# name = f"{var_name}_{safe_label}"

measures.define_measure(
name=name,
numerator=bool_numer
)
# measures.define_measure(
# name=name,
# numerator=bool_numer
# )
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@

# Overall (no grouping) counts

var_name = "migration_status_types_overall"
# var_name = "migration_status_types_overall"

for key, expr in numerators_separate.items():
safe_label = key.lower().replace(" ", "_").replace("-", "_")
name = f"{var_name}_{safe_label}"
# for key, expr in numerators_separate.items():
# safe_label = key.lower().replace(" ", "_").replace("-", "_")
# name = f"{var_name}_{safe_label}"

measures.define_measure(
name=name,
numerator=expr
)
# measures.define_measure(
# name=name,
# numerator=expr
# )
34 changes: 9 additions & 25 deletions analysis/migration_status_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"refugee_asylum_status": codelists.asylum_refugee_migrant_codes,
"english_not_main_language": codelists.english_not_main_language_excl_interpreter_migrant_codes,
"interpreter_required": codelists.interpreter_migrant_codes,
"trafficking": codelists.trafficking_codes,
"british_ethnicities": codelists.british_ethnicities_codes
}

Expand Down Expand Up @@ -64,50 +65,33 @@ def build_mig_status_6_cat(migrant_indicators):
6-category migrant status (priority order):
- Definite migrant: not_born_in_uk
- Highly likely migrant: immig_status_excl_refugee_asylum OR refugee_asylum_status
- Likely migrant: english_not_main_language OR interpreter_required
- Likely migrant: english_not_main_language OR interpreter_required OR trafficking
- Definite non-migrant: born_in_uk
- Likely non-migrant: british_ethnicities AND no migrant code
- Unknown: none of the above
- Unknown: no migrant codes
"""
migrant = migrant_indicators.get("any_migrant", False)
not_born_in_uk = migrant_indicators.get("not_born_in_uk", False)
immig_excl = migrant_indicators.get("immig_status_excl_refugee_asylum", False)
refugee_asylum = migrant_indicators.get("refugee_asylum_status", False)
english_not_main = migrant_indicators.get("english_not_main_language", False)
interpreter_required = migrant_indicators.get("interpreter_required", False)
trafficking = migrant_indicators.get("trafficking", False)
born_in_uk = migrant_indicators.get("born_in_uk", False)
british_ethnicities = migrant_indicators.get("british_ethnicities", False)

# Compose combined conditions
highly_likely = immig_excl | refugee_asylum
likely_migrant = english_not_main | interpreter_required
likely_migrant = english_not_main | interpreter_required | trafficking
likely_non_migrant = ((british_ethnicities) & ~migrant)
unknown = (~migrant)

return case(
when(not_born_in_uk).then("Definite migrant"),
when(born_in_uk).then("Definite non-migrant"),
when(highly_likely).then("Highly likely migrant"),
when(likely_migrant).then("Likely migrant"),
when(born_in_uk).then("Definite non-migrant"),
when(likely_non_migrant).then("Likely non-migrant"),
otherwise="Unknown"
when(unknown).then("Unknown"),
otherwise="Error"
)

# # Build the case expression in precedence order
# clauses = [
# (not_born_in_uk, "Definite migrant"),
# (highly_likely, "Highly likely migrant"),
# (likely_migrant, "Likely migrant"),
# (likely_non_migrant, "Likely non-migrant"),
# (born_in_uk, "Definite non-migrant"),
# ]

# # Start assembling call to case(...) with only the non-empty conditions
# case_args = []
# for cond, label in clauses:
# case_args.append(when(cond).then(label))

# return case(
# *case_args,
# otherwise="Unknown"
# )

Loading