diff --git a/analysis/dataset_definition_census_cohorts.py b/analysis/dataset_definition_census_cohorts.py index 7c370d6..1c5e122 100644 --- a/analysis/dataset_definition_census_cohorts.py +++ b/analysis/dataset_definition_census_cohorts.py @@ -111,6 +111,14 @@ dataset.mig_status_6_cat = migration_status_variables.build_mig_status_6_cat( migrant_indicators) +dataset.mig_status_2_cat_withdoe = migration_status_variables.build_mig_status_2_cat_withdoe(migrant_indicators) + +dataset.mig_status_3_cat_withdoe = migration_status_variables.build_mig_status_3_cat_withdoe( + migrant_indicators) + +dataset.mig_status_6_cat_withdoe = migration_status_variables.build_mig_status_6_cat_withdoe( + migrant_indicators) + dataset.configure_dummy_data(population_size=1000) show(dataset) \ No newline at end of file diff --git a/analysis/dataset_definition_full_study_cohort.py b/analysis/dataset_definition_full_study_cohort.py index cc53498..59484e6 100644 --- a/analysis/dataset_definition_full_study_cohort.py +++ b/analysis/dataset_definition_full_study_cohort.py @@ -153,13 +153,19 @@ ## consolidate migration indiciators into 2-cat, 3-cat and 6-cat variables dataset.mig_status_2_cat = migration_status_variables.build_mig_status_2_cat(migrant_indicators) +dataset.mig_status_2_cat_withdoe = migration_status_variables.build_mig_status_2_cat_withdoe(migrant_indicators) dataset.mig_status_3_cat = migration_status_variables.build_mig_status_3_cat( migrant_indicators) +dataset.mig_status_3_cat_withdoe = migration_status_variables.build_mig_status_3_cat_withdoe( + migrant_indicators) dataset.mig_status_6_cat = migration_status_variables.build_mig_status_6_cat( migrant_indicators ) +dataset.mig_status_6_cat_withdoe = migration_status_variables.build_mig_status_6_cat_withdoe( + migrant_indicators +) # number of migration codes per person @@ -172,12 +178,21 @@ ) dataset.number_of_migration_codes = number_of_migration_codes +date_of_entry_code = ["860021000000109"] + 
+number_of_migration_codes_withdoe = ( + clinical_events + .where((clinical_events.snomedct_code.is_in(codelists.all_migrant_codes)) | (clinical_events.snomedct_code.is_in(date_of_entry_code))) + .where(clinical_events.date.is_on_or_between(patients.date_of_birth, study_end_date)) + .where((clinical_events.date.is_on_or_before(patients.date_of_death)) | (patients.date_of_death.is_null())) + .count_for_patient() +) +dataset.number_of_migration_codes_withdoe = number_of_migration_codes_withdoe + # date of entry to the UK (SNOMED CT code: 860021000000109) ## has date of entry to the UK code -date_of_entry_code = ["860021000000109"] - has_date_of_uk_entry = ( clinical_events .where(clinical_events.snomedct_code.is_in(date_of_entry_code)) diff --git a/analysis/generate_annual_migrant_counts_2cat.py b/analysis/generate_annual_migrant_counts_2cat.py index 1d6d3d8..dbccc4d 100644 --- a/analysis/generate_annual_migrant_counts_2cat.py +++ b/analysis/generate_annual_migrant_counts_2cat.py @@ -30,17 +30,3 @@ else: name = f"{var_name}_{safe_label}_{suffix}" measures.define_measure(name=name, numerator=bool_numer, group_by=group) - -# overall (ungrouped) measures - -# for label in labels: -# bool_numer = (mig2_expr == label) -# safe_label = label.lower().replace(" ", "_").replace("-", "_") -# var_name = "mig_status_2_cat_overall" - -# name = f"{var_name}_{safe_label}" - -# measures.define_measure( -# name=name, -# numerator=bool_numer -# ) \ No newline at end of file diff --git a/analysis/generate_annual_migrant_counts_2cat_withdoe.py b/analysis/generate_annual_migrant_counts_2cat_withdoe.py new file mode 100644 index 0000000..69add4d --- /dev/null +++ b/analysis/generate_annual_migrant_counts_2cat_withdoe.py @@ -0,0 +1,31 @@ +from ehrql import create_measures, INTERVAL +from ehrql.tables.tpp import patients, practice_registrations, clinical_events, addresses +import migration_status_variables +from analysis import utilities +import codelists + +measures = create_measures() 
+measures.configure_dummy_data(population_size=1000) +measures.configure_disclosure_control(enabled=True) # enable on real data + +# build shared variables and defaults +common = utilities.build_common_vars(INTERVAL) +measures.define_defaults(denominator=common["denominator"], intervals=common["intervals"]) +subgroups = common["subgroups"] + +# build base indicators and aggregated 2-category expression +numerators_separate = migration_status_variables.build_migrant_indicators(INTERVAL.end_date) +mig2_expr = migration_status_variables.build_mig_status_2_cat_withdoe(numerators_separate) + +# register one measure per label × subgroup +labels = ["Migrant", "Non-migrant"] +for label in labels: + bool_numer = (mig2_expr == label) + safe_label = label.lower().replace(" ", "_").replace("-", "_") + var_name = "mig_status_2_cat" + for suffix, group in subgroups.items(): + if suffix == "": + name = f"{var_name}_{safe_label}" + else: + name = f"{var_name}_{safe_label}_{suffix}" + measures.define_measure(name=name, numerator=bool_numer, group_by=group) diff --git a/analysis/generate_annual_migrant_counts_3cat.py b/analysis/generate_annual_migrant_counts_3cat.py index 6e825db..a9ce24b 100644 --- a/analysis/generate_annual_migrant_counts_3cat.py +++ b/analysis/generate_annual_migrant_counts_3cat.py @@ -30,17 +30,3 @@ else: name = f"{var_name}_{safe_label}_{suffix}" measures.define_measure(name=name, numerator=bool_numer, group_by=group) - -# overall (ungrouped) measures - -# for label in labels: -# bool_numer = (mig3_expr == label) -# safe_label = label.lower().replace(" ", "_").replace("-", "_") -# var_name = "mig_status_3_cat_overall" - -# name = f"{var_name}_{safe_label}" - -# measures.define_measure( -# name=name, -# numerator=bool_numer -# ) \ No newline at end of file diff --git a/analysis/generate_annual_migrant_counts_3cat_withdoe.py b/analysis/generate_annual_migrant_counts_3cat_withdoe.py new file mode 100644 index 0000000..d0bf764 --- /dev/null +++ 
b/analysis/generate_annual_migrant_counts_3cat_withdoe.py @@ -0,0 +1,32 @@ + +from ehrql import create_measures, INTERVAL +from ehrql.tables.tpp import patients, practice_registrations, clinical_events, addresses +import migration_status_variables +from analysis import utilities + +measures = create_measures() +measures.configure_dummy_data(population_size=1000) +measures.configure_disclosure_control(enabled=True) # enable on real data + +# build shared variables and defaults +common = utilities.build_common_vars(INTERVAL) +measures.define_defaults(denominator=common["denominator"], intervals=common["intervals"]) +subgroups = common["subgroups"] +ethnicity = common["ethnicity"] + +# build base indicators and aggregated 3-category expression +numerators_separate = migration_status_variables.build_migrant_indicators(INTERVAL.end_date) +mig3_expr = migration_status_variables.build_mig_status_3_cat_withdoe(numerators_separate) + +# register one measure per label × subgroup +labels = ["Migrant", "Non-migrant", "Unknown"] +for label in labels: + bool_numer = (mig3_expr == label) + safe_label = label.lower().replace(" ", "_").replace("-", "_") + var_name = "mig_status_3_cat" + for suffix, group in subgroups.items(): + if suffix == "": + name = f"{var_name}_{safe_label}" + else: + name = f"{var_name}_{safe_label}_{suffix}" + measures.define_measure(name=name, numerator=bool_numer, group_by=group) diff --git a/analysis/generate_annual_migrant_counts_6cat.py b/analysis/generate_annual_migrant_counts_6cat.py index 176a90c..fcca4d0 100644 --- a/analysis/generate_annual_migrant_counts_6cat.py +++ b/analysis/generate_annual_migrant_counts_6cat.py @@ -37,17 +37,3 @@ else: name = f"{var_name}_{safe_label}_{suffix}" measures.define_measure(name=name, numerator=bool_numer, group_by=group) - -# # overall (ungrouped) measures - -# for label in labels: -# bool_numer = (mig6_expr == label) -# safe_label = label.lower().replace(" ", "_").replace("-", "_") -# var_name = 
"mig_status_6_cat_overall" - -# name = f"{var_name}_{safe_label}" - -# measures.define_measure( -# name=name, -# numerator=bool_numer -# ) \ No newline at end of file diff --git a/analysis/generate_annual_migrant_counts_6cat_withdoe.py b/analysis/generate_annual_migrant_counts_6cat_withdoe.py new file mode 100644 index 0000000..3f2166d --- /dev/null +++ b/analysis/generate_annual_migrant_counts_6cat_withdoe.py @@ -0,0 +1,39 @@ + +from ehrql import create_measures, INTERVAL +from ehrql.tables.tpp import patients, practice_registrations, clinical_events, addresses +import migration_status_variables +from analysis import utilities + +measures = create_measures() +measures.configure_dummy_data(population_size=1000) +measures.configure_disclosure_control(enabled=True) # enable on real data + +# build shared variables and defaults +common = utilities.build_common_vars(INTERVAL) +measures.define_defaults(denominator=common["denominator"], intervals=common["intervals"]) +subgroups = common["subgroups"] +ethnicity = common["ethnicity"] + +# build base indicators and aggregated 6-category expression +numerators_separate = migration_status_variables.build_migrant_indicators(INTERVAL.end_date) +mig6_expr = migration_status_variables.build_mig_status_6_cat_withdoe(numerators_separate) + + +labels = [ + "Definite migrant", + "Highly likely migrant", + "Likely migrant", + "Definite non-migrant", + "Likely non-migrant", + "Unknown", +] +for label in labels: + bool_numer = (mig6_expr == label) + safe_label = label.lower().replace(" ", "_").replace("-", "_") + var_name = "mig_status_6_cat" + for suffix, group in subgroups.items(): + if suffix == "": + name = f"{var_name}_{safe_label}" + else: + name = f"{var_name}_{safe_label}_{suffix}" + measures.define_measure(name=name, numerator=bool_numer, group_by=group) diff --git a/analysis/generate_annual_migrant_counts_migration_status_types.py b/analysis/generate_annual_migrant_counts_migration_status_types.py index db0b592..7df5d7e 
100644 --- a/analysis/generate_annual_migrant_counts_migration_status_types.py +++ b/analysis/generate_annual_migrant_counts_migration_status_types.py @@ -39,15 +39,3 @@ measures.define_measure(name=name, numerator=expr, group_by=group) -# Overall (no grouping) counts - -# var_name = "migration_status_types_overall" - -# for key, expr in numerators_separate.items(): -# safe_label = key.lower().replace(" ", "_").replace("-", "_") -# name = f"{var_name}_{safe_label}" - -# measures.define_measure( -# name=name, -# numerator=expr -# ) \ No newline at end of file diff --git a/analysis/migration_status_variables.py b/analysis/migration_status_variables.py index f6cdc38..6909d57 100644 --- a/analysis/migration_status_variables.py +++ b/analysis/migration_status_variables.py @@ -12,7 +12,8 @@ "english_not_main_language": codelists.english_not_main_language_excl_interpreter_migrant_codes, "interpreter_required": codelists.interpreter_migrant_codes, "trafficking": codelists.trafficking_codes, - "british_ethnicities": codelists.british_ethnicities_codes + "british_ethnicities": codelists.british_ethnicities_codes, + "date_of_uk_entry": ["860021000000109"] } def build_migrant_indicators(date): @@ -41,6 +42,22 @@ def build_mig_status_2_cat(migrant_indicators): otherwise="Non-migrant" ) +def build_mig_status_2_cat_withdoe(migrant_indicators): + """ + 2-category migrant status: + - "Migrant" if migrant_indicators["any_migrant"] is True OR migrant_indicators["date_of_uk_entry"] is TRUE + - "Non-migrant" otherwise + """ + migrant = migrant_indicators.get("any_migrant", False) + date_of_uk_entry = migrant_indicators.get("date_of_uk_entry", False) + migrant_all = migrant | date_of_uk_entry + + + return case( + when(migrant_all).then("Migrant"), + otherwise="Non-migrant" + ) + def build_mig_status_3_cat(migrant_indicators): """ 3-category migrant status: @@ -60,6 +77,27 @@ def build_mig_status_3_cat(migrant_indicators): otherwise="Unknown" ) +def 
build_mig_status_3_cat_withdoe(migrant_indicators): + """ + 3-category migrant status (priority order): + - "Migrant" if migrant_indicators["any_migrant"] OR migrant_indicators["date_of_uk_entry"] is True + - "Non-migrant" if born_in_uk OR (british_ethnicities AND no migrant code or date of UK entry) + - "Unknown" otherwise + """ + migrant = migrant_indicators.get("any_migrant", False) + date_of_uk_entry = migrant_indicators.get("date_of_uk_entry", False) + born_in_uk = migrant_indicators.get("born_in_uk", False) + british_ethnicities = migrant_indicators.get("british_ethnicities", False) + + migrant_cond = migrant | date_of_uk_entry + non_migrant_cond = born_in_uk | ((british_ethnicities) & ~migrant_cond) + + return case( + when(migrant_cond).then("Migrant"), + when(non_migrant_cond).then("Non-migrant"), + otherwise="Unknown" + ) + def build_mig_status_6_cat(migrant_indicators): """ 6-category migrant status (priority order): @@ -95,3 +133,40 @@ def build_mig_status_6_cat(migrant_indicators): when(unknown).then("Unknown"), otherwise="Error" ) + +def build_mig_status_6_cat_withdoe(migrant_indicators): + """ + 6-category migrant status (priority order, first match wins): + - Definite migrant: not_born_in_uk + - Definite non-migrant: born_in_uk + - Highly likely migrant: immig_status_excl_refugee_asylum OR refugee_asylum_status + - Likely migrant: english_not_main_language OR interpreter_required OR trafficking OR date_of_uk_entry + - Likely non-migrant: british_ethnicities AND no migrant code + - Unknown: no migrant codes (anything else falls through to "Error") + """ + migrant = migrant_indicators.get("any_migrant", False) + not_born_in_uk = migrant_indicators.get("not_born_in_uk", False) + immig_excl = migrant_indicators.get("immig_status_excl_refugee_asylum", False) + refugee_asylum = migrant_indicators.get("refugee_asylum_status", False) + english_not_main = migrant_indicators.get("english_not_main_language", False) + interpreter_required = migrant_indicators.get("interpreter_required", False) + trafficking = migrant_indicators.get("trafficking", False) +
born_in_uk = migrant_indicators.get("born_in_uk", False) + british_ethnicities = migrant_indicators.get("british_ethnicities", False) + date_of_uk_entry = migrant_indicators.get("date_of_uk_entry", False) + + # Compose combined conditions + highly_likely = immig_excl | refugee_asylum + likely_migrant = english_not_main | interpreter_required | trafficking | date_of_uk_entry + likely_non_migrant = ((british_ethnicities) & ~migrant) + unknown = (~migrant) + + return case( + when(not_born_in_uk).then("Definite migrant"), + when(born_in_uk).then("Definite non-migrant"), + when(highly_likely).then("Highly likely migrant"), + when(likely_migrant).then("Likely migrant"), + when(likely_non_migrant).then("Likely non-migrant"), + when(unknown).then("Unknown"), + otherwise="Error" + ) \ No newline at end of file diff --git a/analysis/generate_annual_migrant_counts.py b/analysis/scrapyard/generate_annual_migrant_counts.py similarity index 100% rename from analysis/generate_annual_migrant_counts.py rename to analysis/scrapyard/generate_annual_migrant_counts.py diff --git a/project.yaml b/project.yaml index cd6bef1..800a504 100644 --- a/project.yaml +++ b/project.yaml @@ -119,6 +119,55 @@ actions: moderately_sensitive: csv: output/tables/demographics_census_2021_cohort_6cat.csv + generate_demographics_census_2011_study_table_mig_2cat_withdoe: + run: r:latest analysis/process_census_cohort_data.R output/cohorts/census_2011_study_cohort.arrow output/tables/demographics_census_2011_cohort_2cat_withdoe.csv mig_status_2_cat_withdoe + needs: + - generate_dataset_for_census_2011 + outputs: + moderately_sensitive: + csv: output/tables/demographics_census_2011_cohort_2cat_withdoe.csv + + generate_demographics_census_2011_study_table_mig_3cat_withdoe: + run: r:latest analysis/process_census_cohort_data.R output/cohorts/census_2011_study_cohort.arrow output/tables/demographics_census_2011_cohort_3cat_withdoe.csv mig_status_3_cat_withdoe + needs: + - generate_dataset_for_census_2011 + 
outputs: + moderately_sensitive: + csv: output/tables/demographics_census_2011_cohort_3cat_withdoe.csv + + generate_demographics_census_2011_study_table_mig_6cat_withdoe: + run: r:latest analysis/process_census_cohort_data.R output/cohorts/census_2011_study_cohort.arrow output/tables/demographics_census_2011_cohort_6cat_withdoe.csv mig_status_6_cat_withdoe + needs: + - generate_dataset_for_census_2011 + outputs: + moderately_sensitive: + csv: output/tables/demographics_census_2011_cohort_6cat_withdoe.csv + + + generate_demographics_census_2021_study_table_2cat_withdoe: + run: r:latest analysis/process_census_cohort_data.R output/cohorts/census_2021_study_cohort.arrow output/tables/demographics_census_2021_cohort_2cat_withdoe.csv mig_status_2_cat_withdoe + needs: + - generate_dataset_for_census_2021 + outputs: + moderately_sensitive: + csv: output/tables/demographics_census_2021_cohort_2cat_withdoe.csv + + generate_demographics_census_2021_study_table_3cat_withdoe: + run: r:latest analysis/process_census_cohort_data.R output/cohorts/census_2021_study_cohort.arrow output/tables/demographics_census_2021_cohort_3cat_withdoe.csv mig_status_3_cat_withdoe + needs: + - generate_dataset_for_census_2021 + outputs: + moderately_sensitive: + csv: output/tables/demographics_census_2021_cohort_3cat_withdoe.csv + + generate_demographics_census_2021_study_table_6cat_withdoe: + run: r:latest analysis/process_census_cohort_data.R output/cohorts/census_2021_study_cohort.arrow output/tables/demographics_census_2021_cohort_6cat_withdoe.csv mig_status_6_cat_withdoe + needs: + - generate_dataset_for_census_2021 + outputs: + moderately_sensitive: + csv: output/tables/demographics_census_2021_cohort_6cat_withdoe.csv + generate_date_of_uk_entry_description: run: r:latest analysis/date_of_entry_to_uk.R needs: @@ -127,12 +176,6 @@ actions: moderately_sensitive: csv: output/tables/date_of_uk_entry_description.csv -# generate_annual_migrant_counts: -# run: ehrql:v1 generate-measures 
analysis/generate_annual_migrant_counts.py --output output/tables/annual_migrant_counts.csv -# outputs: -# moderately_sensitive: -# csv: output/tables/annual_migrant_counts.csv - generate_annual_migrant_counts_2cat: run: ehrql:v1 generate-measures analysis/generate_annual_migrant_counts_2cat.py --output output/tables/annual_counts/2cat/:csv outputs: @@ -151,6 +194,24 @@ actions: moderately_sensitive: csv: output/tables/annual_counts/6cat/*.csv + generate_annual_migrant_counts_2cat_withdoe: + run: ehrql:v1 generate-measures analysis/generate_annual_migrant_counts_2cat_withdoe.py --output output/tables/annual_counts/2cat_withdoe/:csv + outputs: + moderately_sensitive: + measures: output/tables/annual_counts/2cat_withdoe/*.csv + + generate_annual_migrant_counts_3cat_withdoe: + run: ehrql:v1 generate-measures analysis/generate_annual_migrant_counts_3cat_withdoe.py --output output/tables/annual_counts/3cat_withdoe/:csv + outputs: + moderately_sensitive: + csv: output/tables/annual_counts/3cat_withdoe/*.csv + + generate_annual_migrant_counts_6cat_withdoe: + run: ehrql:v1 generate-measures analysis/generate_annual_migrant_counts_6cat_withdoe.py --output output/tables/annual_counts/6cat_withdoe/:csv + outputs: + moderately_sensitive: + csv: output/tables/annual_counts/6cat_withdoe/*.csv + generate_annual_migrant_counts_migration_status_types: run: ehrql:v1 generate-measures analysis/generate_annual_migrant_counts_migration_status_types.py --output output/tables/annual_counts/migration_status_types/:csv outputs: