Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 45 additions & 26 deletions code/association_scan/quantile_models/qr_and_twas.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"kernel": "SoS"
},
Expand Down Expand Up @@ -357,6 +357,8 @@
"parameter: min_twas_maf = 0.01\n",
"parameter: screen_threshold = 0.01\n",
"parameter: ld_reference_meta_file = path()\n",
"# Only focus on a subset of variants\n",
"parameter: keep_variants = path()\n",
"# Analysis environment settings\n",
"parameter: container = \"\"\n",
"import re\n",
Expand Down Expand Up @@ -433,7 +435,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"kernel": "SoS"
},
Expand All @@ -450,12 +452,12 @@
" id_maps = {}\n",
" for id_map_file in id_map_files:\n",
" if id_map_file is not None and os.path.isfile(id_map_file):\n",
" df = pd.read_csv(id_map_file, sep='\\s+', header=None, comment='#', names=['old_ID', 'new_ID'])\n",
" df = pd.read_csv(id_map_file, sep=r'\\s+', header=None, comment='#', names=['old_ID', 'new_ID'])\n",
" id_maps[id_map_file] = df.set_index('old_ID')['new_ID'].to_dict()\n",
" return id_maps\n",
"\n",
"def load_and_apply_id_map(pheno_path, id_map_path, preloaded_id_maps):\n",
" pheno_df = pd.read_csv(pheno_path, sep=\"\\s+\", header=0)\n",
" pheno_df = pd.read_csv(pheno_path, sep=r\"\\s+\", header=0)\n",
" pheno_df['Original_ID'] = pheno_df['ID']\n",
" if id_map_path in preloaded_id_maps:\n",
" id_map = preloaded_id_maps[id_map_path]\n",
Expand All @@ -464,7 +466,12 @@
"\n",
"def filter_by_region_ids(data, region_ids):\n",
" if region_ids is not None and len(region_ids) > 0:\n",
" data = data[data['ID'].isin(region_ids)]\n",
" # Check both ID (mapped) and Original_ID (unmapped) to handle phenoIDFile cases\n",
" # where region_name uses the original ID but ID column has been mapped\n",
" if 'Original_ID' in data.columns:\n",
" data = data[data['ID'].isin(region_ids) | data['Original_ID'].isin(region_ids)]\n",
" else:\n",
" data = data[data['ID'].isin(region_ids)]\n",
" return data\n",
"\n",
"def custom_join(series):\n",
Expand Down Expand Up @@ -512,7 +519,9 @@
" pheno_df = load_and_apply_id_map(pheno_path, id_map_path, preloaded_id_maps)\n",
" pheno_df = filter_by_region_ids(pheno_df, region_ids)\n",
" if not pheno_df.empty:\n",
" pheno_df.iloc[:, 4] = adapt_file_path_all(pheno_df, pheno_df.columns[4], f\"{pheno_path:a}\")\n",
" # Use 'path' column by name to handle both 5-col (#chr,start,end,ID,path)\n",
" # and 6-col (#chr,start,end,ID,strand,path) region_list formats\n",
" pheno_df['path'] = adapt_file_path_all(pheno_df, 'path', f\"{pheno_path:a}\")\n",
" pheno_df = pheno_df.assign(cov_path=str(cov_path), cond=phenotype_name) \n",
" accumulated_pheno_df = pd.concat([accumulated_pheno_df, pheno_df], ignore_index=True)\n",
"\n",
Expand All @@ -538,7 +547,7 @@
" for pheno_path, cov_path, phenotype_name, id_map_path in zip(pheno_files, cov_files, phenotype_names, pheno_id_files):\n",
" if not os.path.isfile(cov_path):\n",
" raise FileNotFoundError(f\"No valid path found for file: {cov_path}\")\n",
" pheno_df = pd.read_csv(pheno_path, sep=\"\\s+\", header=0, names=['Original_ID', 'path'])\n",
" pheno_df = pd.read_csv(pheno_path, sep=r\"\\s+\", header=0, names=['Original_ID', 'path'])\n",
" if not pheno_df.empty:\n",
" pheno_df.iloc[:, -1] = adapt_file_path_all(pheno_df, pheno_df.columns[-1], f\"{pheno_path:a}\")\n",
" pheno_df = pheno_df.assign(cov_path=str(cov_path), cond=phenotype_name)\n",
Expand All @@ -549,7 +558,7 @@
" pheno_df['end'] = 0\n",
" if id_map_path is not None:\n",
" # Filter pheno_df by specific association-window and phenotype pairs\n",
" association_analysis_pair = pd.read_csv(id_map_path, sep='\\s+', header=None, comment='#', names=['ID', 'Original_ID'])\n",
" association_analysis_pair = pd.read_csv(id_map_path, sep=r'\\s+', header=None, comment='#', names=['ID', 'Original_ID'])\n",
" pheno_df = pd.merge(pheno_df, association_analysis_pair, on=['ID', 'Original_ID'])\n",
" accumulated_pheno_df = pd.concat([accumulated_pheno_df, pheno_df], ignore_index=True)\n",
"\n",
Expand All @@ -560,7 +569,7 @@
"if f\"{genoFile:x}\" == \".bed\":\n",
" geno_meta_data = pd.DataFrame([(\"chr\"+str(x), f\"{genoFile:a}\") for x in range(1,23)] + [(\"chrX\", f\"{genoFile:a}\")], columns=['#chr', 'geno_path'])\n",
"else:\n",
" geno_meta_data = pd.read_csv(f\"{genoFile:a}\", sep = \"\\s+\", header=0)\n",
" geno_meta_data = pd.read_csv(f\"{genoFile:a}\", sep = r\"\\s+\", header=0)\n",
" geno_meta_data.iloc[:, 1] = adapt_file_path_all(geno_meta_data, geno_meta_data.columns[1], f\"{genoFile:a}\")\n",
" geno_meta_data.columns = ['#chr', 'geno_path']\n",
" geno_meta_data['#chr'] = geno_meta_data['#chr'].apply(lambda x: str(x) if str(x).startswith('chr') else f'chr{x}')\n",
Expand Down Expand Up @@ -629,7 +638,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"kernel": "SoS",
"tags": []
Expand Down Expand Up @@ -667,6 +676,15 @@
" message(paste(length(keep_samples), \"samples are selected to be loaded for analysis\"))\n",
" }\n",
"\n",
" # Load variant filter list if provided\n",
" keep_variants_list = NULL\n",
" if (${\"TRUE\" if keep_variants.is_file() else \"FALSE\"}) {\n",
" keep_variants_list = trimws(readLines(${keep_variants:ar}))\n",
" # Remove empty lines\n",
" keep_variants_list = keep_variants_list[keep_variants_list != \"\"]\n",
" message(paste(length(keep_variants_list), \"variants are specified to be kept for analysis\"))\n",
" }\n",
"\n",
" # Load regional association data\n",
" tryCatch({\n",
" fdat = load_regional_association_data(genotype = ${_input[0]:anr},\n",
Expand Down Expand Up @@ -708,13 +726,13 @@
" condition_names = vector()\n",
" r = 1\n",
" while (r<=length(fdat$Y)) {\n",
" X <- fdat$X_data[[r]] \n",
" X <- fdat$X_data[[r]]\n",
" Y <- fdat$Y[[r]]\n",
" if (is.null(dim(Y))) {\n",
" Y <- matrix(Y, nrow = length(Y), ncol = 1)\n",
" }\n",
" colnames(Y) <- extract_region_name[[r]] \n",
" Z <- fdat$covar[[r]] \n",
" colnames(Y) <- extract_region_name[[r]]\n",
" Z <- fdat$covar[[r]]\n",
" # Update condition names first\n",
" new_names = names(fdat$residual_Y)[r]\n",
" new_col_names = list(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])})[[r]]\n",
Expand All @@ -724,44 +742,45 @@
" if(!(identical(new_names, new_col_names))) {\n",
" new_names = paste(new_names, new_col_names, sep=\"_\")\n",
" }\n",
" \n",
"\n",
" column_results <- lapply(1:ncol(Y), function(i) {\n",
" Y_col <- matrix(Y[,i], ncol=1)\n",
" colnames(Y_col) <- colnames(Y)[i]\n",
" \n",
"\n",
" qr_results = quantile_twas_weight_pipeline(\n",
" X = X, \n",
" X = X,\n",
" Y = Y_col,\n",
" Z = Z, \n",
" Z = Z,\n",
" ld_reference_meta_file=${('\"%s\"' % ld_reference_meta_file) if not ld_reference_meta_file.is_dir() else \"NULL\"},\n",
" maf = fdat$maf[[r]],\n",
" twas_maf_cutoff = ${min_twas_maf}, \n",
" twas_maf_cutoff = ${min_twas_maf},\n",
" region_id = paste0(colnames(Y_col), \"_\", names(fdat$residual_Y)[r]),\n",
" quantile_qtl_tau_list = seq(0.05, 0.95, by = 0.05),\n",
" quantile_twas_tau_list = seq(0.01, 0.99, by = 0.01),\n",
" screen_threshold = ${screen_threshold}\n",
" screen_threshold = ${screen_threshold},\n",
" keep_variants = keep_variants_list\n",
" )\n",
" \n",
"\n",
" if (!is.null(qr_results$message)) {\n",
" message(qr_results$message)\n",
" }\n",
" \n",
"\n",
" qr_results$region_info = region_info\n",
" qr_results$maf = fdat$maf[[r]]\n",
" \n",
"\n",
" return(qr_results)\n",
" })\n",
" \n",
"\n",
" fitted <- c(fitted, column_results)\n",
" condition_names <- c(condition_names, new_names)\n",
" \n",
"\n",
" if (length(new_names) > 0) {\n",
" message(\"Analysis completed for: \", paste(new_names, collapse=\",\"))\n",
" }\n",
" \n",
"\n",
" # Release memory\n",
" fdat$residual_X[[r]] <- NA\n",
" fdat$residual_Y[[r]] <- NA \n",
" fdat$residual_Y[[r]] <- NA\n",
" r = r + 1\n",
" }\n",
"\n",
Expand Down