StatFunGen · gaow · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026
diff --git a/code/association_scan/quantile_models/qr_and_twas.ipynb b/code/association_scan/quantile_models/qr_and_twas.ipynb
@@ -308,7 +308,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "kernel": "SoS"
    },
@@ -357,6 +357,8 @@
     "parameter: min_twas_maf = 0.01\n",
     "parameter: screen_threshold = 0.01\n",
     "parameter: ld_reference_meta_file = path()\n",
+    "# Only focus on a subset of variants\n",
+    "parameter: keep_variants = path()\n",
     "# Analysis environment settings\n",
     "parameter: container = \"\"\n",
     "import re\n",
@@ -433,7 +435,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "kernel": "SoS"
    },
@@ -450,12 +452,12 @@
     "    id_maps = {}\n",
     "    for id_map_file in id_map_files:\n",
     "        if id_map_file is not None and os.path.isfile(id_map_file):\n",
-    "            df = pd.read_csv(id_map_file, sep='\\s+', header=None, comment='#', names=['old_ID', 'new_ID'])\n",
+    "            df = pd.read_csv(id_map_file, sep=r'\\s+', header=None, comment='#', names=['old_ID', 'new_ID'])\n",
     "            id_maps[id_map_file] = df.set_index('old_ID')['new_ID'].to_dict()\n",
     "    return id_maps\n",
     "\n",
     "def load_and_apply_id_map(pheno_path, id_map_path, preloaded_id_maps):\n",
-    "    pheno_df = pd.read_csv(pheno_path, sep=\"\\s+\", header=0)\n",
+    "    pheno_df = pd.read_csv(pheno_path, sep=r\"\\s+\", header=0)\n",
     "    pheno_df['Original_ID'] = pheno_df['ID']\n",
     "    if id_map_path in preloaded_id_maps:\n",
     "        id_map = preloaded_id_maps[id_map_path]\n",
@@ -464,7 +466,12 @@
     "\n",
     "def filter_by_region_ids(data, region_ids):\n",
     "    if region_ids is not None and len(region_ids) > 0:\n",
-    "        data = data[data['ID'].isin(region_ids)]\n",
+    "        # Check both ID (mapped) and Original_ID (unmapped) to handle phenoIDFile cases\n",
+    "        # where region_name uses the original ID but ID column has been mapped\n",
+    "        if 'Original_ID' in data.columns:\n",
+    "            data = data[data['ID'].isin(region_ids) | data['Original_ID'].isin(region_ids)]\n",
+    "        else:\n",
+    "            data = data[data['ID'].isin(region_ids)]\n",
     "    return data\n",
     "\n",
     "def custom_join(series):\n",
@@ -512,7 +519,9 @@
     "        pheno_df = load_and_apply_id_map(pheno_path, id_map_path, preloaded_id_maps)\n",
     "        pheno_df = filter_by_region_ids(pheno_df, region_ids)\n",
     "        if not pheno_df.empty:\n",
-    "            pheno_df.iloc[:, 4] = adapt_file_path_all(pheno_df, pheno_df.columns[4], f\"{pheno_path:a}\")\n",
+    "            # Use 'path' column by name to handle both 5-col (#chr,start,end,ID,path)\n",
+    "            # and 6-col (#chr,start,end,ID,strand,path) region_list formats\n",
+    "            pheno_df['path'] = adapt_file_path_all(pheno_df, 'path', f\"{pheno_path:a}\")\n",
     "            pheno_df = pheno_df.assign(cov_path=str(cov_path), cond=phenotype_name)           \n",
     "            accumulated_pheno_df = pd.concat([accumulated_pheno_df, pheno_df], ignore_index=True)\n",
     "\n",
@@ -538,7 +547,7 @@
     "    for pheno_path, cov_path, phenotype_name, id_map_path in zip(pheno_files, cov_files, phenotype_names, pheno_id_files):\n",
     "        if not os.path.isfile(cov_path):\n",
     "            raise FileNotFoundError(f\"No valid path found for file: {cov_path}\")\n",
-    "        pheno_df = pd.read_csv(pheno_path, sep=\"\\s+\", header=0, names=['Original_ID', 'path'])\n",
+    "        pheno_df = pd.read_csv(pheno_path, sep=r\"\\s+\", header=0, names=['Original_ID', 'path'])\n",
     "        if not pheno_df.empty:\n",
     "            pheno_df.iloc[:, -1] = adapt_file_path_all(pheno_df, pheno_df.columns[-1], f\"{pheno_path:a}\")\n",
     "            pheno_df = pheno_df.assign(cov_path=str(cov_path), cond=phenotype_name)\n",
@@ -549,7 +558,7 @@
     "            pheno_df['end'] = 0\n",
     "            if id_map_path is not None:\n",
     "                # Filter pheno_df by specific association-window and phenotype pairs\n",
-    "                association_analysis_pair = pd.read_csv(id_map_path, sep='\\s+', header=None, comment='#', names=['ID', 'Original_ID'])\n",
+    "                association_analysis_pair = pd.read_csv(id_map_path, sep=r'\\s+', header=None, comment='#', names=['ID', 'Original_ID'])\n",
     "                pheno_df = pd.merge(pheno_df, association_analysis_pair, on=['ID', 'Original_ID'])\n",
     "            accumulated_pheno_df = pd.concat([accumulated_pheno_df, pheno_df], ignore_index=True)\n",
     "\n",
@@ -560,7 +569,7 @@
     "if f\"{genoFile:x}\" == \".bed\":\n",
     "    geno_meta_data = pd.DataFrame([(\"chr\"+str(x), f\"{genoFile:a}\") for x in range(1,23)] + [(\"chrX\", f\"{genoFile:a}\")], columns=['#chr', 'geno_path'])\n",
     "else:\n",
-    "    geno_meta_data = pd.read_csv(f\"{genoFile:a}\", sep = \"\\s+\", header=0)\n",
+    "    geno_meta_data = pd.read_csv(f\"{genoFile:a}\", sep = r\"\\s+\", header=0)\n",
     "    geno_meta_data.iloc[:, 1] = adapt_file_path_all(geno_meta_data, geno_meta_data.columns[1], f\"{genoFile:a}\")\n",
     "    geno_meta_data.columns = ['#chr', 'geno_path']\n",
     "    geno_meta_data['#chr'] = geno_meta_data['#chr'].apply(lambda x: str(x) if str(x).startswith('chr') else f'chr{x}')\n",
@@ -629,7 +638,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "kernel": "SoS",
     "tags": []
@@ -667,6 +676,15 @@
     "      message(paste(length(keep_samples), \"samples are selected to be loaded for analysis\"))\n",
     "    }\n",
     "\n",
+    "    # Load variant filter list if provided\n",
+    "    keep_variants_list = NULL\n",
+    "    if (${\"TRUE\" if keep_variants.is_file() else \"FALSE\"}) {\n",
+    "      keep_variants_list = trimws(readLines(${keep_variants:ar}))\n",
+    "      # Remove empty lines\n",
+    "      keep_variants_list = keep_variants_list[keep_variants_list != \"\"]\n",
+    "      message(paste(length(keep_variants_list), \"variants are specified to be kept for analysis\"))\n",
+    "    }\n",
+    "\n",
     "    # Load regional association data\n",
     "    tryCatch({\n",
     "    fdat = load_regional_association_data(genotype = ${_input[0]:anr},\n",
@@ -708,13 +726,13 @@
     "    condition_names = vector()\n",
     "    r = 1\n",
     "    while (r<=length(fdat$Y)) {\n",
-    "        X <- fdat$X_data[[r]]  \n",
+    "        X <- fdat$X_data[[r]]\n",
     "        Y <- fdat$Y[[r]]\n",
     "        if (is.null(dim(Y))) {\n",
     "            Y <- matrix(Y, nrow = length(Y), ncol = 1)\n",
     "        }\n",
-    "        colnames(Y) <- extract_region_name[[r]] \n",
-    "        Z <- fdat$covar[[r]]     \n",
+    "        colnames(Y) <- extract_region_name[[r]]\n",
+    "        Z <- fdat$covar[[r]]\n",
     "        # Update condition names first\n",
     "        new_names = names(fdat$residual_Y)[r]\n",
     "        new_col_names = list(${\",\".join([(\"c('\"+x+\"')\") if isinstance(x, str) else (\"c\"+ str(x)) for x in _meta_info[3]])})[[r]]\n",
@@ -724,44 +742,45 @@
     "        if(!(identical(new_names, new_col_names))) {\n",
     "            new_names = paste(new_names, new_col_names, sep=\"_\")\n",
     "        }\n",
-    "                \n",
+    "\n",
     "        column_results <- lapply(1:ncol(Y), function(i) {\n",
     "            Y_col <- matrix(Y[,i], ncol=1)\n",
     "            colnames(Y_col) <- colnames(Y)[i]\n",
-    "            \n",
+    "\n",
     "            qr_results = quantile_twas_weight_pipeline(\n",
-    "                X = X, \n",
+    "                X = X,\n",
     "                Y = Y_col,\n",
-    "                Z = Z, \n",
+    "                Z = Z,\n",
     "                ld_reference_meta_file=${('\"%s\"' % ld_reference_meta_file) if not ld_reference_meta_file.is_dir() else \"NULL\"},\n",
     "                maf = fdat$maf[[r]],\n",
-    "                twas_maf_cutoff = ${min_twas_maf},                                            \n",
+    "                twas_maf_cutoff = ${min_twas_maf},\n",
     "                region_id = paste0(colnames(Y_col), \"_\", names(fdat$residual_Y)[r]),\n",
     "                quantile_qtl_tau_list = seq(0.05, 0.95, by = 0.05),\n",
     "                quantile_twas_tau_list = seq(0.01, 0.99, by = 0.01),\n",
-    "                screen_threshold = ${screen_threshold}\n",
+    "                screen_threshold = ${screen_threshold},\n",
+    "                keep_variants = keep_variants_list\n",
     "            )\n",
-    "            \n",
+    "\n",
     "            if (!is.null(qr_results$message)) {\n",
     "                message(qr_results$message)\n",
     "            }\n",
-    "            \n",
+    "\n",
     "            qr_results$region_info = region_info\n",
     "            qr_results$maf = fdat$maf[[r]]\n",
-    "            \n",
+    "\n",
     "            return(qr_results)\n",
     "        })\n",
-    "        \n",
+    "\n",
     "        fitted <- c(fitted, column_results)\n",
     "        condition_names <- c(condition_names, new_names)\n",
-    "        \n",
+    "\n",
     "        if (length(new_names) > 0) {\n",
     "            message(\"Analysis completed for: \", paste(new_names, collapse=\",\"))\n",
     "        }\n",
-    "        \n",
+    "\n",
     "        # Release memory\n",
     "        fdat$residual_X[[r]] <- NA\n",
-    "        fdat$residual_Y[[r]] <- NA      \n",
+    "        fdat$residual_Y[[r]] <- NA\n",
     "        r = r + 1\n",
     "    }\n",
     "\n",