Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions dbt_project/macros/harmonization/clean_codes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{%- macro clean_codes(column_name, expected_prefixes, sp_chars) -%}
    {#-
        Renders a parenthesised scalar subquery that normalises a free-text
        code column into a sorted, de-duplicated, '|'-delimited string.

        column_name       -- SQL expression / column holding the raw codes.
        expected_prefixes -- iterable of code prefixes; every occurrence of a
                             prefix gets a '|' inserted in front of it so that
                             run-together codes can be split apart.
        sp_chars          -- iterable of characters; they are concatenated and
                             removed from the value via translate().
    -#}
    {%- set state = namespace(sql=column_name) -%}
    {%- for pfx in expected_prefixes -%}
        {%- set state.sql = "replace(%s, '%s', '|%s')" | format(state.sql, pfx, pfx) -%}
    {%- endfor -%}
    {%- set strip_set = sp_chars | join('') -%}
(
    select
        string_agg(distinct trim(token), '|' order by trim(token))
    from (
        -- Strip unwanted characters, drop leading/trailing delimiters, then
        -- explode the remaining string into one row per candidate code.
        select
            unnest(
                string_to_array(
                    trim(both '|' from translate({{ state.sql }}, '{{ strip_set }}', '')),
                    '|'
                )
            ) as token
    ) as tokens
    -- Discard NULL and whitespace-only fragments left behind by the split.
    where token is not null
      and trim(token) <> ''
)
{%- endmacro -%}
10 changes: 10 additions & 0 deletions dbt_project/macros/harmonization/generate_global_id.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{%- macro generate_global_id(prefix='', descriptor=[], study_id='') -%}
    {#-
        Renders a SQL expression of the form
            '<prefix>' || '_' || md5(<study_id> || '|' || <col> [ || '|' || ... ])

        prefix     -- literal text placed in front of the hash.
        descriptor -- list of column names / SQL expressions to hash.
        study_id   -- literal text mixed into the hash input.

        Raises a compiler error (in execute mode only) when `descriptor` is
        empty, because the rendered `md5()` call would be invalid SQL. The check
        is skipped while parsing, where column lists derived from introspection
        macros are legitimately empty.
    -#}
    {%- if execute and descriptor | length == 0 -%}
        {%- do exceptions.raise_compiler_error("generate_global_id: 'descriptor' must contain at least one column to hash.") -%}
    {%- endif -%}
    {%- set formatted_columns = [] -%}
    {%- for col in descriptor -%}
        {#- The study_id literal is emitted before every column; kept as-is so hashes stay stable. -#}
        {%- do formatted_columns.append("'" ~ study_id ~ "'") -%}
        {#- Cast first, then coalesce: coalesce(<non-text col>, '') fails type resolution for numeric/date columns. -#}
        {%- do formatted_columns.append("coalesce(cast(" ~ col ~ " as text), '')") -%}
    {%- endfor -%}
    {{
        "'" ~ prefix ~ "' || '_' || md5(" ~ formatted_columns | join(" || '|' || ") ~ ")"
    }}
{%- endmacro -%}
6 changes: 6 additions & 0 deletions dbt_project/macros/harmonization/get_columns.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{% macro get_columns(relation, exclude=[]) %}
    {#- Returns the columns of `relation` with the names in `exclude` removed; delegates to dbt_utils.get_filtered_columns_in_relation. -#}
    {% set kept_columns = dbt_utils.get_filtered_columns_in_relation(from=relation, except=exclude) %}
    {% do return(kept_columns) %}
{% endmacro %}
4 changes: 4 additions & 0 deletions dbt_project/macros/register_external_sources.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{% macro register_external_sources(fq_tablename, csv_path) %}
    {#-
        Exposes a CSV file as a view by wrapping read_csv_auto().

        fq_tablename -- (optionally schema-qualified) name of the view to create.
        csv_path     -- path handed to read_csv_auto().

        The schema is derived from everything before the last '.' in
        fq_tablename; an unqualified name creates no schema. Names are split on
        '.', so quoted identifiers that themselves contain dots are not supported.
    -#}
    {% set name_parts = (fq_tablename | string).split('.') %}
    {% if name_parts | length > 1 %}
        {# Create only the schema portion, not the full table name. #}
        {% set schema_name = name_parts[:-1] | join('.') %}
        {% do adapter.execute("CREATE SCHEMA IF NOT EXISTS " ~ schema_name ~ ";") %}
    {% endif %}
    {# Double any single quotes so the path stays a valid SQL string literal. #}
    {% set escaped_path = csv_path | replace("'", "''") %}
    {% do adapter.execute("CREATE OR REPLACE VIEW " ~ fq_tablename ~ " AS SELECT * FROM read_csv_auto('" ~ escaped_path ~ "');") %}
{% endmacro %}
26 changes: 26 additions & 0 deletions dbt_project/macros/register_external_sources_pg.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{% macro register_external_sources_pg(tb_schema, tablename, columns, src_data_csv_path) %}
    {#-
        Creates (if missing) the schema `tb_schema` and the table
        `tb_schema.tablename` with the given column definitions.

        tb_schema         -- target schema name.
        tablename         -- target table name.
        columns           -- list of column definition strings, emitted verbatim
                             inside the CREATE TABLE parentheses.
        src_data_csv_path -- required (checked for none) but not otherwise used
                             by this macro.

        When any argument is none a warning is raised and the macro returns ''
        without running any SQL.
    -#}
    {% if tb_schema is none or tablename is none or columns is none or src_data_csv_path is none %}
        {# log() has no `warn` keyword; exceptions.warn is dbt's way to emit a warning. #}
        {% do exceptions.warn("Error: Missing required arguments in register_external_sources_pg macro.") %}
        {% do return('') %}
    {% endif %}

    {% set create_table_query %}
        CREATE SCHEMA IF NOT EXISTS {{ tb_schema }};

        CREATE TABLE IF NOT EXISTS {{ tb_schema }}.{{ tablename }} (
            {% for column in columns %}
            {{ column }}{% if not loop.last %},{% endif %}
            {% endfor %}
        );
    {% endset %}

    {# Use `do` so neither call renders its return value into the macro output. #}
    {% do log("Executing SQL: " ~ create_table_query, info=True) %}
    {% do run_query(create_table_query) %}

{% endmacro %}
3 changes: 3 additions & 0 deletions dbt_project/macros/run_sql.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{% macro run_sql(sql) %}
    {#- Executes the given SQL string through run_query; the result is discarded. -#}
    {% set _ignored_result = run_query(sql) %}
{% endmacro %}
155 changes: 155 additions & 0 deletions dbt_project/models/include/aadsc/aadsc_test_condition.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
{{ config(materialized='table', schema='aadsc') }}

{#-
    AADSC condition / measure model.

    Unpivots every non-constant column of the source extract into one row per
    (participant, column), attaches display text plus MONDO / HPO / other
    ontology annotations, and projects the result onto the target condition
    schema.
-#}
{% set relation = ref('include_aadsc_src_chicoine_down_syndrome_extract') %}
{% set constant_columns = ['masked_id','age','sex','race','ethnicity','extraction_date'] %}
{% set condition_columns = get_columns(relation=relation, exclude=constant_columns) %}

with
condition_display_lookup as (
    -- Source variable name -> description, taken from the data dictionary.
    select
        variable_name as condition,
        description as display
    from {{ ref('clinical_stg_dd') }}
)
,unpivot_df as (
    -- Convert from 'wide' to 'long' src data format.
    -- Uses union all strategy as it is available across dbs.
    -- Output schema: 'masked_id','age','sex','race','ethnicity','extraction_date','condition','assertion'
    --   condition = name of the unpivoted source column (includes bmi, height and weight)
    --   assertion = that column's value cast to varchar (measurement value for bmi/height/weight)
    -- Rows whose source value is NULL are dropped.

    {% for col in condition_columns %}
    select
        {{ constant_columns | join(', ') }},
        '{{ col }}' as condition,
        cast({{ col }} as varchar) as assertion
    from {{ relation }}
    where {{ col }} IS NOT NULL
    {% if not loop.last %}union all{% endif %}
    {% endfor %}
)
,mondo_annotations as (
    -- Distinct MONDO code/label pairs per condition.
    select
        distinct
        "condition_name" ::text as condition_name,
        "mondo_codes_with_prefix" ::text as mondo_code,
        "mondo_label" ::text as mondo_label
    from {{ ref('annotations') }} as a
    where "mondo_codes_with_prefix" is not null
       or "mondo_label" is not null
)
,hpo_annotations as (
    -- Distinct HPO code/label pairs per condition.
    select
        distinct
        "condition_name" ::text as condition_name,
        "hpo_codes_with_prefix" ::text as hpo_code,
        "hpo_label" ::text as hpo_label
    from {{ ref('annotations') }} as a
    where "hpo_codes_with_prefix" is not null
       or "hpo_label" is not null
)
,other_annotations as (
    -- ICD-9 / ICD-10 / ICD-O / LOINC annotations, used only as a fallback
    -- when a condition has neither an HPO nor a MONDO code.
    select
        distinct
        "condition_name" ::text as condition_name,
        "icd9_codes_with_prefix",
        "icd10_codes_with_prefix",
        "icdO_codes_with_prefix",
        "icd10cm_label",
        "loinc_label",
        "loinc_code"
    from {{ ref('annotations') }} as a
    where "icd9_codes_with_prefix" is not null
       or "icd10_codes_with_prefix" is not null
       or "icdO_codes_with_prefix" is not null
       or "loinc_code" is not null
)

,source as (
    select
        'AADSC' as "study_code",
        clinical.condition::text as "condition_or_measure_source_text_code", -- Ex: gi_ibs_status
        cdl.display::text as "condition_or_measure_source_text_display", -- Ex: Irritable bowel syndrome
        case
            when cdl.display = 'HP_0032551' then clinical.condition
            else coalesce(cdl.display, clinical.condition)
        end as "condition_or_measure_source_text",
        clinical.masked_id::text as "participant_external_id",
        case
            -- bmi / height / weight hold measurements, so a literal '0' or '1'
            -- there is a value, not an observed / not-observed flag.
            when clinical.condition in ('bmi','height','weight')
                then null
            when clinical.assertion = '1'
                then 'Observed'
            when clinical.assertion = '0'
                then 'Not Observed'
            else null
        end as "condition_interpretation",
        ha.hpo_label,
        ha.hpo_code,
        ma.mondo_label,
        ma.mondo_code,
        case
            when ha.hpo_code is null and ma.mondo_code is null
                -- Deterministically pick one fallback annotation row.
                then (select COALESCE("icd10cm_label","loinc_label",'')
                      from other_annotations
                      where condition_name = clinical.condition
                      order by "icd9_codes_with_prefix", "icd10_codes_with_prefix", "icdO_codes_with_prefix", "loinc_code"
                      limit 1)::text
            else null
        end as "other_label",
        case
            when ha.hpo_code is null and ma.mondo_code is null
                -- Same row choice as "other_label" (identical ordering).
                then (select COALESCE("icd9_codes_with_prefix", "icd10_codes_with_prefix", "icdO_codes_with_prefix","loinc_code",'')
                      from other_annotations
                      where condition_name = clinical.condition
                      order by "icd9_codes_with_prefix", "icd10_codes_with_prefix", "icdO_codes_with_prefix", "loinc_code"
                      limit 1)::text
            else null
        end as "other_code",
        case
            when clinical.condition in ('bmi','height','weight')
                then clinical.assertion
            else null
        end as "measure_value",
        case
            when clinical.condition = 'bmi'
                then 'kg/m2'
            when clinical.condition = 'height'
                then 'in'
            when clinical.condition = 'weight'
                then 'lb'
            else null
        end as "measure_unit"
    from unpivot_df as clinical
    left join mondo_annotations as ma
        on ma.condition_name = clinical.condition
    left join hpo_annotations as ha
        on ha.condition_name = clinical.condition
    left join condition_display_lookup as cdl
        on cdl.condition = clinical.condition
)


select
    source.study_code, --req
    null::text as "participant_global_id", --req, created after the pipeline
    source.participant_external_id, --req
    null::text as "event_id",
    null::text as "event_type",
    null::integer as "age_at_condition_measure_observation",
    source.condition_or_measure_source_text_display,
    source.condition_or_measure_source_text_code,
    source.condition_or_measure_source_text,
    null::integer as "age_at_first_patient_engagement",
    source.condition_interpretation,
    null::text as "condition_status",
    null::text as "condition_data_source",
    source.hpo_label,
    source.hpo_code,
    source.mondo_label,
    source.mondo_code,
    null::text as "maxo_label",
    null::text as "maxo_code",
    source.other_label,
    source.other_code,
    source.measure_value as "measure_value",
    source.measure_unit as "measure_unit"
from source
92 changes: 92 additions & 0 deletions dbt_project/models/include/aadsc/aadsc_test_participant.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{{ config(materialized='table', schema='aadsc') }}

with
harmonized as (
    -- One row per source record, with demographics mapped onto the target
    -- vocabulary. Unrecognised source values fall through to NULL; missing
    -- (NULL) sex / race / ethnicity become 'Unknown'.
    select
        'AADSC' as "study_code",
        src.masked_id::text as "participant_external_id",
        'Proband-only' as "family_type",
        'Proband' as "family_relationship",
        case
            when src.sex is null then 'Unknown'
            when src.sex = 'Female' then 'Female'
            when src.sex = 'Male' then 'Male'
        end as "sex",
        case
            when src.race is null or src.race = 'Unknown'
                then 'Unknown'
            when src.race in (
                    'Native Hawaiian',
                    'Other Pacific Islander',
                    'XXNative Hawaiian/Other Pacific Islander'
                 )
                then 'Native Hawaiian or Other Pacific Islander'
            when src.race = 'Black/African American'
                then 'Black or African American'
            when src.race = 'Two or more races'
                then 'More than one race'
            when src.race = 'American Indian or Alaskan Native'
                then 'American Indian or Alaska Native'
            when src.race = 'Asian'
                then 'Asian'
            when src.race = 'White'
                then 'White'
        end as "race",
        case
            when src.ethnicity is null or src.ethnicity = 'Unknown'
                then 'Unknown'
            when src.ethnicity = 'Hispanic/Latino Origin'
                then 'Hispanic or Latino'
            when src.ethnicity = 'Not of Hispanic or Latino Origin'
                then 'Not Hispanic or Latino'
            when src.ethnicity = 'Patient Refused'
                then 'Prefer not to answer'
        end as "ethnicity",
        case
            when src.ds_diagnosis = '1' then 'T21'
        end as "down_syndrome_status",
        case
            -- The top-coded bucket must be tested before the numeric cast below.
            when src.age = 'Age 90 or older'
                then round(90 * 365.25)
            when cast(src.age as float) >= 0 and cast(src.age as float) < 90
                then round(cast(src.age as float) * 365.25) -- years to days conversion
        end as "age_at_last_vital_status"
    from {{ ref('include_aadsc_src_chicoine_down_syndrome_extract') }} as src
)

-- Project onto the target participant schema; columns with no source data are
-- emitted as typed NULLs.
select
    harmonized.study_code, --req
    null::text as "participant_global_id", --req, created after the pipeline
    harmonized.participant_external_id, --req
    null::text as "family_id",
    harmonized.family_type, --req
    null::text as "father_id",
    null::text as "mother_id",
    null::text as "sibling_id",
    null::text as "other_family_member_id",
    harmonized.family_relationship, --req
    harmonized.sex, --req
    harmonized.race, --req
    harmonized.ethnicity, --req
    harmonized.down_syndrome_status, --req
    null::integer as "age_at_first_patient_engagement", --req
    null::text as "first_patient_engagement_event", --req
    null::text as "outcomes_vital_status",
    harmonized.age_at_last_vital_status::integer as "age_at_last_vital_status"
from harmonized

22 changes: 22 additions & 0 deletions dbt_project/models/include/aadsc/aadsc_test_study.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{{ config(materialized='table', schema='aadsc_data') }}

{#-
    Study-level row for AADSC, taken from the `lookups.studies` source. Every
    retained column is cast to text and renamed to lower-case with spaces
    replaced by underscores.
-#}
{% set studies_relation = source('lookups', 'studies') %}
{% set ignored_columns = ['one','two','three','four'] %}
{% set kept_columns = get_columns(relation=studies_relation, exclude=ignored_columns) %}

{#- Build the select list once, then join it with commas. -#}
{% set projections = [] %}
{% for col in kept_columns %}
    {% do projections.append('"' ~ col ~ '"::text AS "' ~ col.lower().replace(" ", "_") ~ '"') %}
{% endfor %}

with
study_row as (
    select
        {{ projections | join(', ') }}
    from {{ studies_relation }} as s
    WHERE "Study Code" = 'AADSC'
)

select
    *
from study_row

Loading