diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
deleted file mode 100644
index 53b08ae..0000000
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ /dev/null
@@ -1,25 +0,0 @@
----
-name: Bug report
-about: Create a report to help us improve
-title: ''
-labels: bug
-assignees: ''
-
----
-
-**Describe the bug**
-A clear and concise description of what the bug is.
-
-**Expected behavior**
-A clear and concise description of what you expected to happen.
-
-**Input data**
-Share the data if possible. If not, describe it as much as you can.
-
-**Genome build**
-
-**Screenshots or pipeline output**
-If applicable, add screenshots to help explain your problem.
-
-**Additional context**
-Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 0000000..bbbac7e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,42 @@
+name: Bug report
+description: Report something that is broken or incorrect
+labels: bug
+body:
+ - type: textarea
+ id: description
+ attributes:
+ label: Description of the bug
+ description: A clear and concise description of what the bug is.
+ validations:
+ required: true
+
+ - type: textarea
+ id: command_used
+ attributes:
+ label: Command used and terminal output
+ description: Steps to reproduce the behaviour. Please paste the command you used to launch the pipeline and the output from your terminal.
+ render: console
+ placeholder: |
+ $ nextflow run ...
+
+ Some output where something broke
+
+ - type: textarea
+ id: files
+ attributes:
+ label: Relevant files
+ description: |
+ Please drag and drop the relevant files here. Create a `.zip` archive if the extension is not allowed.
+ Your verbose log file `.nextflow.log` is often useful _(this is a hidden file in the directory where you launched the pipeline)_ as well as custom Nextflow configuration files.
+
+ - type: textarea
+ id: system
+ attributes:
+ label: System information
+ description: |
+ * Nextflow version _(eg. 23.04.0)_
+ * Hardware _(eg. HPC, Desktop, Cloud)_
+ * Executor _(eg. slurm, local, awsbatch)_
+ * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_
+ * OS _(eg. CentOS Linux, macOS, Linux Mint)_
+ * Version of LiuzLab/ai_marrvel _(eg. 1.1, 1.5, 1.8.2)_
diff --git a/.github/workflows/nf-core-lint.yml b/.github/workflows/nf-core-lint.yml
new file mode 100644
index 0000000..2a97eaa
--- /dev/null
+++ b/.github/workflows/nf-core-lint.yml
@@ -0,0 +1,84 @@
+name: nf-core linting
+
+on:
+ push:
+ branches:
+ # - main
+ - "**"
+ pull_request:
+ branches:
+ - main
+ - develop
+
+jobs:
+ pre-commit:
+ name: Run Pre-Commit Hooks
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python 3.12
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ architecture: "x64"
+
+ - name: Install pre-commit
+ run: pip install pre-commit
+
+ - name: Run pre-commit
+ run: pre-commit run --all-files
+
+ nf-core:
+ name: Run nf-core lint
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ NXF_VER:
+ - "23.10.1"
+
+ steps:
+ - name: Check out pipeline code
+ uses: actions/checkout@v4
+
+ - name: Install Nextflow
+ uses: nf-core/setup-nextflow@v2
+ with:
+ version: "${{ matrix.NXF_VER }}"
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ architecture: "x64"
+
+ - name: read .nf-core.yml
+ uses: pietrobolcato/action-read-yaml@1.1.0
+ id: read_yml
+ with:
+ config: ${{ github.workspace }}/.nf-core.yml
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install nf-core==${{ steps.read_yml.outputs['nf_core_version'] }}
+ - name: Run nf-core pipelines lint
+ if: ${{ github.base_ref != 'master' }}
+ env:
+ GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
+ run: nf-core -l lint_log.txt pipelines lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
+
+ - name: Save PR number
+ if: ${{ always() }}
+ run: echo ${{ github.event.pull_request.number }} > PR_number.txt
+
+ - name: Upload linting log artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: nf-core-lint-logs
+ path: |
+ lint_log.txt
+ lint_results.md
+ PR_number.txt
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b2d4f39..bf3978b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,7 +20,6 @@ jobs:
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
-
- name: Install jq
run: sudo apt-get install jq
@@ -81,7 +80,6 @@ jobs:
aws sqs delete-queue --queue-url ${{ steps.step_queue.outputs.queue_url }}
echo "Queue deleted: ${{ steps.step_queue.outputs.queue_url }}"
-
- name: Close Pull Request on Failure
if: ${{ failure() }} && github.event_name == 'pull_request'
run: |
diff --git a/.gitignore b/.gitignore
index 9861bc9..0311b94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,23 +1,33 @@
# General
**/__pycache__
+**/.DS_Store
+.vscode/
bcftools-1.9.tar.bz2
run/bedtools
-.vscode/
-.DS_Store
-*/.DS_Store
test_data/
test_ref/
test_output/
+**/.ipynb_checkpoints/
+*.ipynb
+.nfs*
# Nextflow Related
-.nextflow*
-.nextflow/
-work/
-out/
-trace-*txt
-report-*html
-params.yaml
+/.nextflow*
+/.nextflow/
+/work*/
+/out*/
+/store*/
+/dag-*.html
+/trace-*.txt
+/report-*.html
+/timeline-*.html
+/params.yaml
# Ignore R history and session files
.Rhistory
.RData
+
+# Ignore some files from model interpreter
+*/model_interpreter/results
+*/model_interpreter/templates
+*/model_interpreter/main.nf
diff --git a/.nf-core.yml b/.nf-core.yml
new file mode 100644
index 0000000..b590482
--- /dev/null
+++ b/.nf-core.yml
@@ -0,0 +1,43 @@
+repository_type: pipeline
+nf_core_version: 3.2.0
+
+lint:
+ pipeline_name_conventions: false # Ignore: Naming does not adhere to nf-core conventions: Contains uppercase letters
+ schema_lint: false # Ignore: Parameter input not found in schema
+ actions_awsfulltest: false
+ pipeline_todos: false
+ nextflow_config: false
+ multiqc_config: false
+ files_unchanged: false
+ files_exist:
+ - .gitattributes
+ - .editorconfig
+ - CHANGELOG.md
+ - CITATIONS.md
+ - CODE_OF_CONDUCT.md
+ - .github/.dockstore.yml
+ - .github/CONTRIBUTING.md
+ - .github/ISSUE_TEMPLATE/config.yml
+ - .github/ISSUE_TEMPLATE/feature_request.yml
+ - .github/PULL_REQUEST_TEMPLATE.md
+ - .github/workflows/awstest.yml
+ - .github/workflows/awsfulltest.yml
+ - .github/workflows/branch.yml
+ - .github/workflows/linting_comment.yml
+ - .github/workflows/linting.yml
+ - .github/workflows/ci.yml
+ - assets/multiqc_config.yml
+ - assets/email_template.html
+ - assets/email_template.txt
+ - assets/sendmail_template.txt
+ - assets/nf-core-AI_MARRVEL_logo_light.png
+ - conf/igenomes.config
+ - conf/igenomes_ignored.config
+ - conf/test.config
+ - conf/test_full.config
+ - docs/images/nf-core-AI_MARRVEL_logo_light.png
+ - docs/images/nf-core-AI_MARRVEL_logo_dark.png
+ - docs/output.md
+ - docs/README.md
+ - docs/usage.md
+ - ro-crate-metadata.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..9e9f0e1
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,13 @@
+repos:
+ - repo: https://github.com/pre-commit/mirrors-prettier
+ rev: "v3.1.0"
+ hooks:
+ - id: prettier
+ additional_dependencies:
+ - prettier@3.2.5
+
+ - repo: https://github.com/editorconfig-checker/editorconfig-checker.python
+ rev: "3.0.3"
+ hooks:
+ - id: editorconfig-checker
+ alias: ec
diff --git a/.prettierignore b/.prettierignore
new file mode 100644
index 0000000..4e2ba77
--- /dev/null
+++ b/.prettierignore
@@ -0,0 +1,12 @@
+email_template.html
+slackreport.json
+.nextflow*
+work/
+data/
+results/
+.DS_Store
+testing/
+testing*
+*.pyc
+bin/
+utils/
diff --git a/.prettierrc.yml b/.prettierrc.yml
new file mode 100644
index 0000000..c81f9a7
--- /dev/null
+++ b/.prettierrc.yml
@@ -0,0 +1 @@
+printWidth: 120
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index fd115c3..809f5c0 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -10,4 +10,4 @@ python:
- requirements: docs/source/requirements.txt
sphinx:
- configuration: docs/source/conf.py
\ No newline at end of file
+ configuration: docs/source/conf.py
diff --git a/Dockerfile b/Dockerfile
index bd1733a..d95e234 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,18 +1,7 @@
-# List of dependencies installed
-# bcftools v1.9-1-deb_cv1
-# ensemblorg/ensembl-vep:release_104.3
-# python 2.7
-# python 3.8.13
-# R 4.2.1
-
-# dc7a542b7e1f
-#FROM ubuntu:18.04
-FROM ensemblorg/ensembl-vep:release_104.3
-USER root
-ENV DEBIAN_FRONTEND noninteractive
+FROM python:3.8
# Install dependencies
-RUN apt-get update && apt-get install -y \
+RUN apt update && apt install -y \
curl \
git \
build-essential \
@@ -22,54 +11,15 @@ RUN apt-get update && apt-get install -y \
libbz2-dev \
liblzma-dev \
tabix \
- python2.7 \
- python3.8 \
- python3.8-dev \
- python3.8-distutils \
- python3-apt
-
-RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
-RUN python3.8 get-pip.py
+ jq
# Install python 3.8 dependencies
COPY requirements.txt /opt/requirements.txt
RUN pip3 install --upgrade pip
RUN pip3 install -r /opt/requirements.txt
-RUN pip3 install bgzip
-
-
-# Install R
-RUN apt-get update
-RUN apt install -y --no-install-recommends software-properties-common dirmngr
-# Add the keys
-RUN apt install wget
-RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
-
-# add the R 4.0 repo from CRAN -- adjust 'focal' to 'groovy' or 'bionic' as needed
-#RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 || \
-# apt-key adv --keyserver ha.pool.sks-keyservers.net --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 || \
-# apt-key adv --keyserver pgp.mit.edu --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 || \
-# apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 || \
-# apt-key adv --keyserver keyserver.pgp.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
-#RUN add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
-
-RUN add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
-RUN add-apt-repository universe
-RUN apt-get update
-
-RUN apt install -y r-base r-base-core
-
-# Install R libs
-RUN R -e "install.packages('data.table',dependencies=TRUE, repos='http://cran.rstudio.com/')"
-RUN R -e "install.packages('dplyr',dependencies=TRUE, repos='http://cran.rstudio.com/')"
-RUN R -e "install.packages('ontologyIndex',dependencies=TRUE, repos='http://cran.rstudio.com/')"
-RUN R -e "install.packages('ontologySimilarity',dependencies=TRUE, repos='http://cran.rstudio.com/')"
-RUN R -e "install.packages('tidyverse',dependencies=TRUE, repos='http://cran.rstudio.com/')"
-
-
# Install bcftools
-RUN wget https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2
+RUN curl -OL https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2
RUN mv bcftools-1.20.tar.bz2 /opt/bcftools-1.20.tar.bz2
RUN tar -xf /opt/bcftools-1.20.tar.bz2 -C /opt/ && \
rm /opt/bcftools-1.20.tar.bz2 && \
@@ -80,9 +30,6 @@ RUN tar -xf /opt/bcftools-1.20.tar.bz2 -C /opt/ && \
rm -rf /opt/bcftools-1.20
# Install bedtools
-RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary
+RUN curl -OL https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary
RUN mv bedtools.static.binary /run/bedtools
RUN chmod a+x /run/bedtools
-
-
-
diff --git a/Dockerfiles/oldpython.Dockerfile b/Dockerfiles/oldpython.Dockerfile
new file mode 100644
index 0000000..845b477
--- /dev/null
+++ b/Dockerfiles/oldpython.Dockerfile
@@ -0,0 +1,5 @@
+FROM python:3.8
+
+RUN pip install scikit-learn==1.1.2
+RUN pip install pandas
+
diff --git a/Dockerfiles/r.Dockerfile b/Dockerfiles/r.Dockerfile
new file mode 100644
index 0000000..a8598eb
--- /dev/null
+++ b/Dockerfiles/r.Dockerfile
@@ -0,0 +1,5 @@
+FROM rocker/tidyverse
+
+# Install R libs
+RUN R -e "install.packages('ontologyIndex',dependencies=TRUE, repos='http://cran.rstudio.com/')"
+RUN R -e "install.packages('ontologySimilarity',dependencies=TRUE, repos='http://cran.rstudio.com/')"
diff --git a/README.md b/README.md
index d09d12c..2ab6c05 100644
--- a/README.md
+++ b/README.md
@@ -7,29 +7,27 @@
-
+
-
+
-**AI-MARRVEL (AIM)** is an AI system for rare genetic disease diagnosis.
-
-It takes as input patient VCF and phenotype (formatted with HPO) to predict the causal variant(s).
-In making prediction, it takes variant annotation from [MARRVEL](https://marrvel.org/) database and more,
-and generates **prediction score** + **confidence score** as output.
+**AI-MARRVEL (AIM)** is an AI system for rare genetic disease diagnosis.
+It takes as input patient VCF and phenotype (formatted with HPO) to predict the causal variant(s).
+In making prediction, it takes variant annotation from [MARRVEL](https://marrvel.org/) database and more,
+and generates **prediction score** + **confidence score** as output.
You can use AI-MARRVEL from our [website](https://ai.marrvel.org/) or follow the [documentation](https://aimarrvel.readthedocs.io/en/main/) to run locally.
-
:new: Our paper is now published in [NEJM AI](https://ai.nejm.org/doi/full/10.1056/AIoa2300009)!
-
## Quick Start
### Install Required Data Dependencies
+
AIM utilizes various databases for variant annotation, all of which have been compiled and are available for download. We use AWS S3 for data access, and the data can be downloaded by following these steps:
1. **Install the AWS CLI**: Follow the instructions provided in the [AWS CLI Installation Guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html).
@@ -37,13 +35,17 @@ AIM utilizes various databases for variant annotation, all of which have been co
```bash
$ cd
```
-3. Use the following command to sync the S3 bucket to your local directory:
- ```bash
- $ aws s3 sync s3://aim-data-dependencies-2.0-public . --no-sign-request
- ```
+3. Use the following command to sync the S3 bucket to your local directory:
+ ```bash
+ $ aws s3 sync s3://aim-data-dependencies-2.3-public . --no-sign-request
+ ```
### Get the software
+
+[](https://www.nextflow.io/)
+
AIM is released as a Nextflow pipeline for easy distribution. To get it:
+
```bash
$ git clone https://github.com/LiuzLab/AI_MARRVEL
$ cd AI_MARRVEL
@@ -51,22 +53,18 @@ $ nextflow run main.nf --version
```
### Run with your sample
-```bash
-$ nextflow run main.nf --ref_dir
- --input_vcf
- --input_hpo
- --outdir
- --bed_filter # Optional
- --run_id [Sample Id] # Optional, default: 1
- --ref_ver [Reference genome: hg19/hg38] # Optional, default: hg19
- --exome_filter # Optional
-```
-Alternatively, the pipeline can be executed with a parameter file (yaml)
```bash
-$ nextflow run main.nf -params-file params.yaml
+nextflow run /path/to/AI_MARRVEL/main.nf \\
+ -profile <profile> \\
+ --ref_dir /path/to/dependencies/ \\
+ --input_vcf /path/to/input.vcf \\
+ --input_hpo /path/to/input.hpos.txt \\
+ --outdir /path/to/output \\
+ --storedir /path/to/store \\
+ --run_id <sample_id> \\
+ --ref_ver <hg19|hg38>
```
-NOTE: You need to create `params.yaml` by copying [params.yaml.example](params.yaml.example) file and follow the instruction.
For more information on usage and parameters which are open for modification, please use `--help` option as shown below.
@@ -74,13 +72,13 @@ For more information on usage and parameters which are open for modification, pl
$ nextflow run main.nf --help
```
-
## License
+
AI-MARRVEL is licensed under GPL-3.0. You are welcomed to use it for research purpose.
For business purpose, please contact us for licensing.
-
## Disclaimer
+
- Some of the data and software included in the distribution may be subject to third-party constraints. Users of the data and software are solely responsible for establishing the nature of and complying with any such restrictions.
- AI-MARRVEL provides this data and software in good faith, but make no warranty, express or implied, nor assume any legal liability or responsibility for any purpose for which they are used.
diff --git a/assets/NO_FILE b/assets/NO_FILE
new file mode 100644
index 0000000..e69de29
diff --git a/bin/VarTierDiseaseDBFalse.R b/bin/VarTierDiseaseDBFalse.R
index 1e0c613..27c42fa 100755
--- a/bin/VarTierDiseaseDBFalse.R
+++ b/bin/VarTierDiseaseDBFalse.R
@@ -4,6 +4,7 @@ library(tidyr)
library(readr)
library(tibble)
library(dplyr)
+library(stringr)
library(data.table)
# set adj.k value to adjust Tier
adj.k <- 1.05
@@ -22,7 +23,7 @@ genemap2.Inh.F <- readr::read_tsv(omim_inheritance)
# load the input ####
anno.columns <- c(
- "varId_dash",
+ "varId",
"zyg",
"geneSymbol",
"geneEnsId",
@@ -39,6 +40,8 @@ anno.columns <- c(
anno.orig <- readr::read_csv(in.f)
anno <- anno.orig[, anno.columns]
+anno <- anno %>% mutate(varId = str_split(varId, "_-", simplify = TRUE)[, 1]) # intergenic_variant
+anno <- anno %>% mutate(varId = str_split(varId, "_E", simplify = TRUE)[, 1]) # transcript
# rename the col
colnames(anno) <- c(
diff --git a/bin/add_c_nc.py b/bin/add_c_nc.py
index 8c4b74d..91fc24b 100755
--- a/bin/add_c_nc.py
+++ b/bin/add_c_nc.py
@@ -10,13 +10,13 @@ def add_c_nc(score, ref):
if ref == "hg38":
clin_c = pd.read_csv("merge_expand/hg38/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg38/clin_nc.tsv.gz", sep="\t")
- hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg38/hgmd_c.tsv.gz", sep="\t")
+ hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
else:
clin_c = pd.read_csv("merge_expand/hg19/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg19/clin_nc.tsv.gz", sep="\t")
- hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg19/hgmd_c.tsv.gz", sep="\t")
+ hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")
a = score.pos.values
ac = score.chrom.values
@@ -28,7 +28,7 @@ def add_c_nc(score, ref):
i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh) & (ac[:, None] == bc))
- cln = pd.concat(
+ clin = pd.concat(
[
temp.loc[i, :].reset_index(drop=True),
clin_nc.loc[j, :].reset_index(drop=True),
@@ -38,7 +38,7 @@ def add_c_nc(score, ref):
# Take into account When HGMD data base is empty
if hgmd_nc.shape[0] == 0:
- pass
+ hgmd = hgmd_nc
else:
# merge by region
@@ -58,12 +58,12 @@ def add_c_nc(score, ref):
axis=1,
)
- merged = score.merge(cln, how="left", on="varId")
- if hgmd_nc.shape[0] == 0:
- merged["nc_HGMD_Exp"] = np.NaN
- merged["nc_RANKSCORE"] = np.NaN
- else:
- merged = merged.merge(hgmd, how="left", on="varId")
+ merged = score.merge(
+ clin_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
+ how="left",
+ on=["chrom", "pos", "ref", "alt"],
+ )
+ merged = merged.merge(clin, how="left", on="varId")
if hgmd_c.shape[0] == 0:
merged["c_HGMD_Exp"] = np.NaN
@@ -71,16 +71,15 @@ def add_c_nc(score, ref):
merged["CLASS"] = np.NaN
else:
merged = merged.merge(
- hgmd_c,
+ hgmd_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
how="left",
- left_on=["chrom", "pos", "ref", "alt"],
- right_on=["new_chr", "new_pos", "ref", "alt"],
+ on=["chrom", "pos", "ref", "alt"],
)
- merged = merged.merge(
- clin_c,
- how="left",
- left_on=["chrom", "pos", "ref", "alt"],
- right_on=["new_chr", "new_pos", "ref", "alt"],
- )
+
+ if hgmd_nc.shape[0] == 0:
+ merged["nc_HGMD_Exp"] = np.NaN
+ merged["nc_RANKSCORE"] = np.NaN
+ else:
+ merged = merged.merge(hgmd, how="left", on="varId")
return merged
diff --git a/bin/annotation/utils_for_marrvel_flatfile_module_3.py b/bin/annotation/utils_for_marrvel_flatfile_module_3.py
index 3e2d7fa..7a497ab 100755
--- a/bin/annotation/utils_for_marrvel_flatfile_module_3.py
+++ b/bin/annotation/utils_for_marrvel_flatfile_module_3.py
@@ -118,7 +118,7 @@ def getAnnotateInfoRow_3_1(row, genomeRef):
varObj.ref = ref
varObj.alt = alt
varObj.varId_dash = "-".join([str(chrom), str(start), ref, alt])
- # print('varId dash:', varObj.varId_dash)
+ # NOTE(JL): varId_dash should have been renamed. the start value is needed for simple_repeat
varId = "_".join([str(chrom), str(pos), ref, alt, transcriptId])
varObj.varId = varId
if "ZYG" in row:
diff --git a/bin/extraModel/generate_bivar_data.py b/bin/extraModel/generate_bivar_data.py
index 06a7c2c..720f86c 100755
--- a/bin/extraModel/generate_bivar_data.py
+++ b/bin/extraModel/generate_bivar_data.py
@@ -10,7 +10,7 @@
def process_sample(data_folder, sample_id, default_pred, labeling=False):
if labeling:
- raise 'The below code was not tested with real data with labeling=True'
+ raise NotImplementedError('The below code was not tested with real data with labeling=True')
recessive_folder = f"{data_folder}/recessive_matrix/"
if not os.path.exists(recessive_folder):
@@ -21,7 +21,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
os.mkdir(tmp_folder)
# read feature matrix for single var
- feature_fn = f"{sample_id}.csv"
+ feature_fn = f"{sample_id}.default_prediction.csv"
# feature_df = []
# for feature_fn in feature_fns:
if "csv" in feature_fn:
@@ -68,7 +68,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
]
# Remove all the duplicated variant pairs.
- gene_var_pairs_df = pd.DataFrame(gene_var_pairs)
+ gene_var_pairs_df = pd.DataFrame(gene_var_pairs, columns=['geneEnsId', 'varId1', 'varId2'])
# gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])
# Use only subset columns of features
@@ -94,8 +94,8 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
# Calculate variant distance
recessive_feature_df['var_dist'] = (
- recessive_feature_df.varId1.str.split('-').apply(lambda x: x[1]).astype(float)
- - recessive_feature_df.varId2.str.split('-').apply(lambda x: x[1]).astype(float)
+ recessive_feature_df.varId1.str.split('_').apply(lambda x: x[1]).astype(float)
+ - recessive_feature_df.varId2.str.split('_').apply(lambda x: x[1]).astype(float)
).abs()
# Calculate label
@@ -113,7 +113,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
)
# Create pair id as the legacy did
- recessive_feature_df = recessive_feature_df.set_index(recessive_feature_df.varId1 + '_' + recessive_feature_df.varId2)
+ recessive_feature_df = recessive_feature_df.set_index(recessive_feature_df.varId1 + '-' + recessive_feature_df.varId2)
# Drop the intermediate columns
recessive_feature_df = recessive_feature_df.drop(
@@ -130,4 +130,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
# Sort before saving
recessive_feature_df = recessive_feature_df.sort_index()
+ if recessive_feature_df.shape[0] == 0:
+ return
+
recessive_feature_df.to_csv(f"{recessive_folder}/{sample_id}.csv")
diff --git a/bin/extraModel_main.py b/bin/extraModel_main.py
index 57c1f50..776bb52 100755
--- a/bin/extraModel_main.py
+++ b/bin/extraModel_main.py
@@ -11,6 +11,7 @@
from tqdm import tqdm
from scipy.stats import rankdata
from extraModel.integrate_output import *
+from model_interpreter.variant_model_interpreter import ModelInterpreter
parser = argparse.ArgumentParser()
@@ -45,6 +46,30 @@
print(f"### Start generating recessive matrix and running predictions.")
+def run_model_interpretation(model, data_df, features, output_prefix, run_id, model_type):
+ """Run SHAP analysis for a specific model"""
+ interpreter = ModelInterpreter()
+ print(interpreter, model)
+ interpreter.set_model(model) # Set the already loaded model
+
+ # Prepare data (use only the features used by this model)
+ model_data = data_df.loc[:, features]
+
+ # Calculate SHAP values
+ interpreter.calculate_shap_values(model_data)
+
+ print(model_data.index[0:9])
+ interpreter.variant_ids = list(model_data.index)
+ print(model_data.columns[0:9])
+ interpreter.feature_names = list(model_data.columns)
+
+ print(interpreter.model_type)
+ print(interpreter.shap_values.base_values)
+ # Create output directories if they don't exist
+ os.makedirs("shap_outputs", exist_ok=True)
+ interpreter.run_shap_analysis(output_file=f"shap_outputs/{run_id}_{model_type}_shap_values.json")
+ # os.makedirs("lime_outputs", exist_ok=True)
+ # interpreter.run_lime_analysis(model_data, output_file=f"lime_outputs/{run_id}_{model_type}_lime_values.json")
def assign_ranking(df):
pred_df = df.copy()
@@ -58,7 +83,7 @@ def assign_ranking(df):
def AIM(data_folder, sample_id):
- feature_fn = f"{sample_id}.csv"
+ feature_fn = f"{sample_id}.default_prediction.csv"
if not os.path.exists(feature_fn):
print(f"{feature_fn} does not exist.")
@@ -78,6 +103,16 @@ def AIM(data_folder, sample_id):
df_pred = df_pred.sort_values("confidence", ascending=False)
df_pred = assign_ranking(df_pred)
df_pred.to_csv(f"{out_folder}/{sample_id}_{mn}_predictions.csv")
+
+ # Run model interpretation
+ run_model_interpretation(
+ model=model_dict[mn]["model"],
+ data_df=df_pred,
+ features=model_dict[mn]["features"],
+ output_prefix=mn,
+ run_id=sample_id,
+ model_type=mn
+ )
print(f"### Start processing recessive data and make predictions.")
@@ -109,18 +144,21 @@ def AIM(data_folder, sample_id):
df_pred = df_pred.sort_values("confidence", ascending=False, kind="stable")
df_pred = assign_ranking(df_pred)
df_pred.to_csv(f"{out_folder}/{sample_id}_{mn}_predictions.csv")
+
+ # Run model interpretation
+ run_model_interpretation(
+ model=model_dict[mn]["model"],
+ data_df=df_pred,
+ features=model_dict[mn]["features"],
+ output_prefix=mn,
+ run_id=sample_id,
+ model_type=mn
+ )
else:
# AIM found no recessive variant pair
pass
-
- print(f"Integrating all information for sample {sample_id}...")
- ######### Construction Paused here #########
- integrated_df = integrate_output(out_folder, data_folder, sample_id)
- if not os.path.exists(f"{out_folder}/integrated"):
- os.mkdir(f"{out_folder}/integrated")
- integrated_df.to_csv(f"{out_folder}/integrated/{sample_id}_integrated.csv")
return
# for sample_id in tqdm(sample_folders):
-AIM(out_folder, sample_id)
+AIM(out_folder, sample_id)
\ No newline at end of file
diff --git a/bin/feature.py b/bin/feature.py
index cd60f43..fb81967 100755
--- a/bin/feature.py
+++ b/bin/feature.py
@@ -88,6 +88,13 @@ def main():
parser.add_argument(
"-genomeRef", "--genomeRef", help="Proivde genome ref: hg19, hg38"
)
+ parser.add_argument(
+ "-enableLIT", "--enableLIT",
+ action="store_true",
+ default=False,
+ help="Enable low-impact transcripts"
+ )
+
args = parser.parse_args()
# check the user args
checkUserArgs(args)
@@ -96,6 +103,7 @@ def main():
print("modules:", args.modules)
moduleList = args.modules.split(",")
print("modules list:", moduleList)
+ print("low impact transcripts enabled: ", args.enableLIT)
# start time
start_time = time.time()
@@ -257,28 +265,33 @@ def main():
break
print("input annoatated varFile:", args.varFile)
t1 = time.time()
- varDf = pd.read_csv(
- args.varFile, sep="\t", skiprows=numHeaderSkip, error_bad_lines=False
+ transcriptDf = pd.read_csv(
+ args.varFile, sep="\t", skiprows=numHeaderSkip,
)
- # varDf=varDf[0:10]
+
+ # Ignore Low Impact Transcripts (LIT).
+ if args.enableLIT:
+ transcriptDf = transcriptDf.groupby("#Uploaded_variation", group_keys=False).apply(lambda g: g[g.IMPACT.isin(["HIGH", "MODERATE"])] if g.IMPACT.isin(["HIGH", "MODERATE"]).any() else g).reset_index(drop=True)
+
+ # transcriptDf=transcriptDf[0:10]
# #do this if need to have a small test
- print("shape:", varDf.shape)
+ print("shape:", transcriptDf.shape)
t2 = time.time()
inputReadTime = t2 - t1
- inputNumRows = len(varDf.index)
+ inputNumRows = len(transcriptDf.index)
# TODO: change with `.rename`
- if "GERP++_RS" in varDf.columns:
+ if "GERP++_RS" in transcriptDf.columns:
print("found GERP++RS")
- varDf["GERPpp_RS"] = varDf["GERP++_RS"]
- if "GERP++_NR" in varDf.columns:
+ transcriptDf["GERPpp_RS"] = transcriptDf["GERP++_RS"]
+ if "GERP++_NR" in transcriptDf.columns:
print("found GERP++NR")
- varDf["GERPpp_NR"] = varDf["GERP++_NR"]
+ transcriptDf["GERPpp_NR"] = transcriptDf["GERP++_NR"]
# update column names which have - in it
- if "fathmm-MKL_coding_score" in varDf.columns:
- varDf["fathmm_MKL_coding_score"] = varDf["fathmm-MKL_coding_score"]
- if "M-CAP_score" in varDf.columns:
- varDf["M_CAP_score"] = varDf["M-CAP_score"]
+ if "fathmm-MKL_coding_score" in transcriptDf.columns:
+ transcriptDf["fathmm_MKL_coding_score"] = transcriptDf["fathmm-MKL_coding_score"]
+ if "M-CAP_score" in transcriptDf.columns:
+ transcriptDf["M_CAP_score"] = transcriptDf["M-CAP_score"]
if "conserve" in moduleList:
decipherSortedDf = decipherDf.set_index(['Chr', 'Start', 'Stop']).sort_index()
@@ -290,7 +303,7 @@ def main():
hgmdHPOScoreAccSortedDf = hgmdHPOScoreDf.groupby('acc_num').first().sort_index()
annotateInfoDf = getAnnotateInfoRows_3(
- varDf,
+ transcriptDf,
args.genomeRef,
clinvarGeneDf,
clinvarAlleleDf,
diff --git a/bin/fillna_tier.py b/bin/fillna_tier.py
index 3aecc4a..72f279c 100644
--- a/bin/fillna_tier.py
+++ b/bin/fillna_tier.py
@@ -21,6 +21,7 @@ def getValFromStr(valStr: str, select: str = "min"):
def feature_engineering(score_file, tier_file):
variable_name = [
+ "varId",
"varId_dash",
"hgmdSymptomScore",
"omimSymMatchFlag",
@@ -102,10 +103,11 @@ def feature_engineering(score_file, tier_file):
patient = score_file.copy()
patient = patient[variable_name]
+ patient["varId"] = patient["varId"].str.split("_-").str[0] # intergenic_variant
patient = patient.fillna("-")
indel_index = [
- len(i.split("-")[-1]) != 1 or len(i.split("-")[-2]) != 1
- for i in patient["varId_dash"].to_list()
+ len(i.split("_")[-1]) != 1 or len(i.split("_")[-2]) != 1
+ for i in patient["varId"].to_list()
]
patient.loc[patient["phrank"] == "-", "phrank"] = 0
@@ -921,7 +923,7 @@ def feature_engineering(score_file, tier_file):
"ESP6500_EA_AF",
],
]
- patient = patient.groupby(["varId_dash"], sort=False)[variable_name[1:]].max()
+ patient = patient.groupby(["varId"], sort=False)[variable_name[1:]].max()
patient.loc[
:,
[
@@ -969,6 +971,7 @@ def feature_engineering(score_file, tier_file):
tier.loc[:, ["TierAD", "TierAR", "TierAR.adj"]] = -tier.loc[
:, ["TierAD", "TierAR", "TierAR.adj"]
]
+
tier = tier.groupby(["Uploaded_variation"], sort=False)[tier_vars].max()
tier.loc[:, ["TierAD", "TierAR", "TierAR.adj"]] = -tier.loc[
:, ["TierAD", "TierAR", "TierAR.adj"]
diff --git a/bin/generate_input_vcf.py b/bin/generate_input_vcf.py
new file mode 100755
index 0000000..5393928
--- /dev/null
+++ b/bin/generate_input_vcf.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3.8
+
+import string
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("variants_file", type=str)
+
+args = parser.parse_args()
+
+vcf_header = """
+##fileformat=VCFv4.2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">
+##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information">
+##FILTER=<ID=PASS,Description="All filters passed">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
+""".strip()
+
+vcf_row_template = string.Template("""
+$chrom $pos ${chrom}_${pos}_${ref}_${alt} $ref $alt . . . GT:AD:DP:GQ:PL 0/1:7,5:12:99:142,0,214
+""".strip())
+
+def main(variants_file):
+ with open(variants_file, "r") as f:
+ variants = [line.strip() for line in f if line.strip()]
+
+ vcf_rows = []
+ for variant in variants:
+ chrom, pos, ref, alt = variant.split("_")
+ vcf_row = vcf_row_template.substitute(
+ chrom=chrom,
+ pos=pos,
+ ref=ref,
+ alt=alt,
+ )
+ vcf_rows.append(vcf_row)
+
+ with open("input.vcf", "w", encoding="ascii") as text_file:
+ text_file.write(vcf_header + "\n")
+ text_file.write("\n".join(vcf_rows))
+
+if __name__ == "__main__":
+ main(**vars(args))
diff --git a/bin/generate_new_matrix_2.py b/bin/generate_new_matrix_2.py
index b1685df..783cf52 100755
--- a/bin/generate_new_matrix_2.py
+++ b/bin/generate_new_matrix_2.py
@@ -20,7 +20,6 @@ def main():
### get original coordinates ###
merged["varId"] = merged["varId"].apply(lambda x: x.split("_E")[0])
- path_phrank = sys.argv[1] + ".phrank.txt"
phr = pd.read_csv(path_phrank, sep="\t", names=["ENSG", "phrank"])
merged = merged.merge(phr, left_on="geneEnsId", right_on="ENSG", how="left")
diff --git a/bin/merge_rm.py b/bin/merge_rm.py
index ad3089d..5c267d4 100755
--- a/bin/merge_rm.py
+++ b/bin/merge_rm.py
@@ -2,11 +2,13 @@
import pandas as pd
import sys
-###merge score files with final rank matrix
+feature_path = sys.argv[1]
+annot_path = sys.argv[2]
+output_path = sys.argv[3]
-# read rank matrix and score file
-rm = pd.read_csv(sys.argv[1] + ".csv")
-annot = pd.read_csv("scores.txt.gz", sep="\t", compression="gzip")
+# read feature matrix and score file
+df = pd.read_csv(feature_path)
+annot = pd.read_csv(annot_path, sep="\t", compression="gzip")
# get columns from score file
@@ -30,6 +32,7 @@
# get original coordinates
annot["varId"] = annot["varId"].apply(lambda x: x.split("_E")[0])
+annot["varId"] = annot["varId"].apply(lambda x: x.split("_-")[0])
# rename for final output
annot = annot.rename(
@@ -37,10 +40,10 @@
)
# merge
-test = rm.merge(annot, right_on="varId_dash", left_on="Unnamed: 0", how="left")
+test = df.merge(annot, right_on="origId", left_on="Unnamed: 0", how="left")
test.to_csv(
- "final_matrix_expanded/" + sys.argv[1] + ".expanded.csv.gz",
+ output_path,
index=False,
compression="gzip",
)
diff --git a/bin/mod5_diffusion.py b/bin/mod5_diffusion.py
index ea4fe07..019b027 100755
--- a/bin/mod5_diffusion.py
+++ b/bin/mod5_diffusion.py
@@ -148,7 +148,7 @@ def diffuseSample(ID, Anno_df, Phrank_folder):
scores = stats.rankdata(score_ordered, "max") / len(score_ordered)
m12_df["diffuse_Phrank_STRING"] = scores
- m12_df = m12_df.drop_duplicates(subset=["varId_dash"])
- m12_df = m12_df[["varId_dash", "diffuse_Phrank_STRING"]]
- m12_df = m12_df.set_index("varId_dash", drop=True)
+ m12_df = m12_df.drop_duplicates(subset=["varId"])
+ m12_df = m12_df[["varId", "diffuse_Phrank_STRING"]]
+ m12_df = m12_df.set_index("varId", drop=True)
return m12_df
diff --git a/bin/model_interpreter/__init__.py b/bin/model_interpreter/__init__.py
new file mode 100644
index 0000000..c33cbde
--- /dev/null
+++ b/bin/model_interpreter/__init__.py
@@ -0,0 +1 @@
+from .variant_model_interpreter import ModelInterpreter
\ No newline at end of file
diff --git a/bin/model_interpreter/bin/LIME_TO_JSON.py b/bin/model_interpreter/bin/LIME_TO_JSON.py
new file mode 100644
index 0000000..ea9f673
--- /dev/null
+++ b/bin/model_interpreter/bin/LIME_TO_JSON.py
@@ -0,0 +1,90 @@
+# Import necessary libraries
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).parent.parent))
+from utils.numpy_to_python import convert_numpy_to_python
+
+from lime import lime_tabular
+import numpy as np
+import json
+
+def create_lime_explainer(X_train, feature_names, categorical_features=None, class_names=['negative', 'positive'], mode="classification"):
+ """
+ Create a LIME explainer object
+ """
+ if categorical_features is None:
+ categorical_features = []
+
+ explainer = lime_tabular.LimeTabularExplainer(
+ training_data=np.array(X_train),
+ feature_names=feature_names,
+ class_names=class_names,
+ categorical_features=categorical_features,
+ mode=mode,
+ verbose=False,
+ random_state=42
+ )
+
+ return explainer
+
+def create_lime_json(variant_ids, feature_names, processed_df, model, explainer, output_file="./results/lime_values.json", num_features=20):
+ """
+ Create JSON File from LIME explanations with preserved exact values
+ """
+ json_data = []
+
+ # Get explanations for all instances
+ for idx, instance in enumerate(processed_df.values):
+ # Get explanation for this instance
+ explanation = explainer.explain_instance(
+ instance,
+ model.predict_proba,
+ num_features=num_features
+ )
+
+ # Get the feature explanations as list of (feature, importance) tuples
+ exp_list = explanation.as_list()
+
+ # Handle intercept value (it's a dict with class label as key)
+ intercept_value = explanation.intercept[1] if isinstance(explanation.intercept, dict) else explanation.intercept
+
+ # Handle local prediction (it's a numpy array)
+ local_pred = explanation.local_pred[0] if isinstance(explanation.local_pred, (list, np.ndarray)) else explanation.local_pred
+
+ # Create entry for this instance
+ entry = {
+ "variant_id": variant_ids[idx],
+ "base_value": float(intercept_value),
+ "confidence": float(explanation.score),
+ "prediction_local": float(local_pred),
+ "model_output_score": {},
+ "feature_values": {},
+ "feature_explanations": [] # Add this to store the original explanation format
+ }
+
+ # Store the original explanation format
+ for feature_text, importance in exp_list:
+ entry["feature_explanations"].append({
+ "feature": feature_text, # This preserves the full text with conditions
+ "importance": float(importance)
+ })
+
+ # Extract just the feature name for the model_output_score
+ # Assuming format is "feature_name > value" or similar
+ feature_name = feature_text.split(' >')[0].strip()
+ entry["model_output_score"][feature_name] = float(importance)
+
+ # Add actual feature values
+ for feature_name in feature_names:
+ entry["feature_values"][feature_name] = float(instance[feature_names.index(feature_name)])
+
+ json_data.append(entry)
+
+ # Convert to JSON string with proper formatting
+ json_string = json.dumps(json_data, indent=2, default=convert_numpy_to_python)
+
+ # Save to file
+ with open(output_file, 'w') as f:
+ f.write(json_string)
+
+ return json_string
\ No newline at end of file
diff --git a/bin/model_interpreter/bin/SHAP_to_JSON.py b/bin/model_interpreter/bin/SHAP_to_JSON.py
new file mode 100644
index 0000000..47c8351
--- /dev/null
+++ b/bin/model_interpreter/bin/SHAP_to_JSON.py
@@ -0,0 +1,49 @@
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).parent.parent))
+
+import numpy as np
+import json
+from utils.numpy_to_python import convert_numpy_to_python
+
+def create_shap_json(variant_ids, feature_names, shap_values, output_file="./results/shap_values.json"):
+ """
+ Create JSON file from SHAP values.
+ """
+ if not variant_ids or not feature_names:
+ raise ValueError("variant_ids or feature_names cannot be None")
+
+ json_data = []
+
+ # Determine if base_values is an array and extract the relevant value for binary classification
+ is_multiclass = isinstance(shap_values.base_values[0], (list, np.ndarray))
+
+ for idx in range(len(shap_values.values)):
+ # Handle the case where base_values is an array (e.g., binary classification)
+ if is_multiclass:
+ base_value = float(shap_values.base_values[idx][1]) # Use the base value for class 1
+ else:
+ base_value = float(shap_values.base_values[idx])
+
+ entry = {
+ "variant_id": variant_ids[idx],
+ "base_value": base_value,
+ "model_output_score": {},
+ "feature_values": {}
+ }
+
+ # Add feature contributions and values
+ for feat_idx, feat_name in enumerate(feature_names):
+ entry["model_output_score"][feat_name] = float(shap_values.values[idx][feat_idx])
+ entry["feature_values"][feat_name] = float(shap_values.data[idx][feat_idx])
+
+ json_data.append(entry)
+
+ # Convert to JSON string with proper formatting
+ json_string = json.dumps(json_data, indent=2, default=convert_numpy_to_python)
+
+ # Optionally save to file
+ with open(output_file, 'w') as f:
+ f.write(json_string)
+
+ return json_string
diff --git a/bin/model_interpreter/bin/__init__.py b/bin/model_interpreter/bin/__init__.py
new file mode 100644
index 0000000..8eccd63
--- /dev/null
+++ b/bin/model_interpreter/bin/__init__.py
@@ -0,0 +1,2 @@
+from .SHAP_to_JSON import create_shap_json
+from .LIME_TO_JSON import create_lime_explainer, create_lime_json
\ No newline at end of file
diff --git a/bin/model_interpreter/main.py b/bin/model_interpreter/main.py
new file mode 100644
index 0000000..a368b9b
--- /dev/null
+++ b/bin/model_interpreter/main.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+# bin/model_interpreter/main.py
+
+import argparse
+from variant_model_interpreter import ModelInterpreter
+import pandas as pd
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Run model interpretation (SHAP/LIME) analysis')
+ parser.add_argument('--analysis-type', type=str, default='both', # Set default to 'both'
+ choices=['shap', 'lime', 'both'],
+ help='Type of analysis to run (shap, lime, or both)')
+ parser.add_argument('--model-path', type=str, required=True,
+ help='Path to the trained model .job file')
+ parser.add_argument('--input-path', type=str, required=True,
+ help='Path to the input CSV file')
+ parser.add_argument('--output-path', type=str, required=True,
+ help='Path where to save the analysis JSON')
+ return parser.parse_args()
+
+def main():
+ args = parse_args()
+
+ # Initialize interpreter
+ interpreter = ModelInterpreter()
+
+ # Load model
+ interpreter.load_model(args.model_path)
+
+ # Load and prepare data
+ input_df = pd.read_csv(args.input_path, index_col=0)
+ processed_df = interpreter.prepare_data(input_df)
+
+ # Run specified analysis
+ if args.analysis_type in ['shap', 'both']:
+ interpreter.calculate_shap_values(processed_df)
+ shap_output = args.output_path if args.analysis_type == 'shap' else args.output_path.replace('.json', '_shap.json')
+ interpreter.run_shap_analysis(output_file=shap_output)
+
+ if args.analysis_type in ['lime', 'both']:
+ lime_output = args.output_path if args.analysis_type == 'lime' else args.output_path.replace('.json', '_lime.json')
+ interpreter.run_lime_analysis(processed_df, output_file=lime_output)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/bin/model_interpreter/utils/__init__.py b/bin/model_interpreter/utils/__init__.py
new file mode 100644
index 0000000..5f79acf
--- /dev/null
+++ b/bin/model_interpreter/utils/__init__.py
@@ -0,0 +1 @@
+from .numpy_to_python import convert_numpy_to_python
\ No newline at end of file
diff --git a/bin/model_interpreter/utils/numpy_to_python.py b/bin/model_interpreter/utils/numpy_to_python.py
new file mode 100644
index 0000000..401fa8e
--- /dev/null
+++ b/bin/model_interpreter/utils/numpy_to_python.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+def convert_numpy_to_python(obj):
+ """
+ Convert Numpy to Python object for JSON storage
+
+ Usage: for json.dumps
+ json_string = json.dumps(json_data, indent=2, default=convert_numpy_to_python)
+
+ """
+
+ if isinstance(obj, np.ndarray):
+ return obj.tolist()
+ elif isinstance(obj, np.float32):
+ return float(obj)
+ return obj
diff --git a/bin/model_interpreter/variant_model_interpreter.py b/bin/model_interpreter/variant_model_interpreter.py
new file mode 100644
index 0000000..6ce3675
--- /dev/null
+++ b/bin/model_interpreter/variant_model_interpreter.py
@@ -0,0 +1,210 @@
+from typing import List, Optional, Dict, Any
+import json
+import logging
+
+import joblib
+import pandas as pd
+import shap
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+import xgboost as xgb
+from .bin import create_lime_explainer, create_lime_json, create_shap_json
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class ModelInterpreter:
+ def __init__(self):
+ self.model = None
+ self.model_type = None
+ self.shap_explainer = None
+ self.shap_values = None
+ self.lime_explainer = None
+ self.model_features = None
+ self.is_model_loaded = False
+ self.variant_ids = None
+ self.feature_names = None
+
+ def set_model(self, model) -> None:
+ """
+ Accept a pre-loaded model directly and extract its feature names.
+ """
+ try:
+ # Check the model type and extract feature names
+ if isinstance(model, (RandomForestClassifier, RandomForestRegressor)):
+ self.model_type = "random_forest"
+ self.task_type = "classification" if isinstance(model, RandomForestClassifier) else "regression"
+ self.model_features = list(model.feature_names_in_)
+ elif isinstance(model, xgb.XGBModel):
+ self.model_type = "xgboost"
+ self.task_type = "classification" if model.objective.startswith('binary:') else "regression"
+ self.model_features = list(model.get_booster().feature_names)
+ else:
+ raise ValueError(f"Unsupported model type: {type(model)}")
+
+ self.model = model
+ self.is_model_loaded = True
+ logger.info(f"Model set successfully with {len(self.model_features)} features")
+ logger.info(f"Model features: {self.model_features}")
+
+ except Exception as e:
+ logger.error(f"Error setting model: {str(e)}")
+ raise
+
+ def load_model(self, model_path: str) -> None:
+ """
+ Load a model from a file using joblib.
+ """
+ try:
+ # Load the model from a file
+ loaded_model = joblib.load(model_path)
+
+ # Delegate to set_model to extract features and set properties
+ self.set_model(loaded_model)
+
+ except Exception as e:
+ logger.error(f"Error loading model: {str(e)}")
+ raise
+
+ def get_feature_differences(self, df: pd.DataFrame) -> Dict[str, set]:
+ """
+ Utility method to show feature differences between input data and model
+ """
+ if not self.is_model_loaded:
+ raise ValueError("Model not loaded. Please load model first.")
+
+ df_features = set(df.columns)
+ model_features = set(self.model_features)
+
+ return {
+ 'missing_features': model_features - df_features,
+ 'extra_features': df_features - model_features
+ }
+
+ def prepare_data(self, df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Match feature names between model and feature dataframe
+ """
+ try:
+ if not self.is_model_loaded:
+ raise ValueError("Model not loaded. Please load model first.")
+
+ # Store variant IDs
+ self.variant_ids = list(df.index)
+
+ # Get current features
+ df_features = df.columns
+
+ # Find missing and extra features
+ missing_features = set(self.model_features) - set(df_features)
+ extra_features = set(df_features) - set(self.model_features)
+
+ # Log the differences
+ logger.info(f"Missing features: {missing_features}")
+ logger.info(f"Extra features: {extra_features}")
+
+ # Subset to model features
+ subset_df = df[self.model_features]
+
+ # Store feature names
+ self.feature_names = self.model_features
+
+ return subset_df
+
+ except Exception as e:
+ logger.error(f"Error preparing data: {str(e)}")
+ raise
+
+ def calculate_shap_values(self, df: pd.DataFrame) -> None:
+ """
+ Calculate SHAP values with optimization for speed
+ """
+ try:
+ if not self.is_model_loaded:
+ raise ValueError("Model not loaded. Please load model first.")
+
+ if self.shap_explainer is None:
+ # Use approximate=True for faster computation
+ self.shap_explainer = shap.TreeExplainer(
+ self.model,
+ feature_perturbation='interventional',
+ approximate=True
+ )
+ logger.info(f"Created SHAP explainer for {self.model_type} model")
+
+ self.shap_values = self.shap_explainer(df)
+ logger.info("SHAP values calculated successfully")
+
+ # Conditionally adjust SHAP values for RandomForestClassifier
+ if self.model_type == "random_forest" and self.task_type == "classification":
+ # Extract SHAP values for the positive class
+ self.shap_values.values = self.shap_values.values[:, :, 1]
+ logger.info("Extracted SHAP values for positive class (class 1)")
+
+ # Set variant_ids and feature_names if not already set
+ self.variant_ids = list(df.index)
+ self.feature_names = list(df.columns)
+
+ except Exception as e:
+ logger.error(f"Error calculating SHAP values: {str(e)}")
+ raise
+
+ def run_shap_analysis(self, output_file: str = './results/shap_values.json') -> str:
+ """
+ Generate SHAP analysis JSON
+
+ Parameters:
+ -----------
+ output_file : str, optional
+ Path where to save the JSON file (default: './shap_values.json')
+ """
+ if self.shap_values is None:
+ raise ValueError("SHAP values not calculated. Run calculate_shap_values first.")
+
+ return create_shap_json(
+ variant_ids=self.variant_ids,
+ feature_names=self.feature_names,
+ shap_values=self.shap_values,
+ output_file=output_file # Pass the output path
+ )
+
+ def run_lime_analysis(self, processed_df: pd.DataFrame, num_features = 20,
+ categorical_features: Optional[List[int]] = None,
+ output_file: str = './results/lime_values.json') -> str:
+ """
+ Run LIME analysis on processed data
+
+ Parameters:
+ -----------
+ processed_df : pd.DataFrame
+ Processed feature matrix
+ categorical_features : List[int], optional
+ List of categorical feature indices
+ output_file : str, optional
+ Path where to save the JSON file
+
+ Returns:
+ --------
+ str
+ JSON string containing the LIME explanations
+ """
+ if not self.is_model_loaded:
+ raise ValueError("Model not loaded. Please load model first.")
+
+ if self.lime_explainer is None:
+ self.lime_explainer = create_lime_explainer(
+ X_train=processed_df.values,
+ feature_names=self.feature_names,
+ categorical_features=categorical_features
+ )
+
+ return create_lime_json(
+ variant_ids=self.variant_ids,
+ feature_names=self.feature_names,
+ processed_df=processed_df,
+ num_features = num_features,
+ model=self.model,
+ explainer=self.lime_explainer,
+ output_file=output_file
+ )
+
\ No newline at end of file
diff --git a/bin/post_processing.py b/bin/post_processing.py
index 5880e68..b86efee 100755
--- a/bin/post_processing.py
+++ b/bin/post_processing.py
@@ -37,6 +37,7 @@ def main():
iff = simple_repeat_anno(
sys.argv[1], iff, f"merge_expand/{sys.argv[2]}/simpleRepeats.{sys.argv[2]}.bed"
)
+ iff = iff.drop(columns=["varId_dash"])
### delete intermediate files ###
# os.remove(sys.argv[1]+".bed")
diff --git a/bin/predict_new/utilities.py b/bin/predict_new/utilities.py
index 61a453f..b75ab26 100755
--- a/bin/predict_new/utilities.py
+++ b/bin/predict_new/utilities.py
@@ -1,298 +1,7 @@
-from math import ceil
-import os
import pandas as pd
-import numpy as np
from scipy.stats import rankdata
-import sys
-import xgboost as xgb
-from tqdm import trange
-from time import time
-import json
-from matplotlib import pyplot as plt
-from sklearn.ensemble import RandomForestClassifier as RF
-import itertools
from predict_new.confidence import *
-
-def get_training_data(fns, label_col):
- """Read in training data, drop features with specified mode in the meta['feature_mode'] json.
- Allowed modes are ['all', 'raw', 'var_disase_db_off', 'gene_disase_db_off']
- """
- np.random.seed(0)
- ### READ PARAMETERS
- f = open("meta.json")
- meta = json.load(f)
- feature_mode = meta["feature_mode"]
- feature_meta = pd.read_csv(meta["feature_meta_fn"], sep=",")
- ### READ training samples
- t0 = time()
- print("[TrainVal-Train] Start reading training data.")
- dfs = []
- for i in trange(len(fns)):
- fn = fns[i]
- df = pd.read_csv(fn, index_col=0, sep="\t")
- dfs.append(df)
- dfs = pd.concat(dfs)
-
- t1 = time()
- print(f"[TrainVal-Train] Completed reading training data in {(t1-t0):.2f} seconds.")
- train_Y = dfs[label_col].to_numpy().astype(int)
- neg_pos_ratios = [(train_Y == 0).sum() / (train_Y == 1).sum()]
-
- ## Drop features based on feature mode
- if feature_mode == "all":
- features = feature_meta.Feature.tolist()
- elif feature_mode == "raw":
- features = feature_meta.loc[feature_meta.Engineered != 1, :].Feature.tolist()
- elif feature_mode == "var_disase_db_off":
- features = feature_meta.loc[
- feature_meta.DiseaseDB_VariantLevel != 1, :
- ].Feature.tolist()
- elif feature_mode == "gene_disase_db_off":
- features = feature_meta.loc[
- feature_meta.DiseaseDB_GeneLevel != 1, :
- ].Feature.tolist()
- else:
- print(
- f"Wrong feature mode entered, should be one of\
- ['all', 'raw', 'var_disease_db_off', 'gene_disease_db_off'],\
- your input is {feature_mode}."
- )
- sys.exit(0)
- train_X = dfs[features]
- print("[TrainVal-Train] Training data obtained.")
- return train_X, train_Y, np.mean(neg_pos_ratios)
-
-
-def tune_params(fns, classifier="xgb"):
- ### READ PARAMETERS
- f = open("meta.json")
- meta = json.load(f)
- train_label = meta["train_label"]
- test_label = meta["test_label"]
- train_val_frac = meta["train_val_frac"]
- sep2 = "\t"
-
- train_fns_sampled = list(np.random.choice(fns, int(train_val_frac * len(fns))))
- val_fns_sampled = list(set(fns) - set(train_fns_sampled))
-
- print(
- f"[TrainVal-Train] train {len(train_fns_sampled)} samples with label {train_label},\
- validate {len(val_fns_sampled)} samples with label {test_label}."
- )
-
- para_train_dfs = [
- pd.read_csv(train_fn, usecols=[train_label], sep=sep2)
- for train_fn in train_fns_sampled
- ]
- para_train_dfs = pd.concat(para_train_dfs)
- if classifier == "xgb":
- ## tune parameters
- n_estimators = [200, 300]
- learning_rates = list(np.logspace(0.001, 0.2, num=10, endpoint=True))
- max_depths = [None]
- params = list(itertools.product(n_estimators, learning_rates, max_depths))
- elif classifier == "rf":
- n_estimators = [100, 200, 300, 400]
- max_depths = [None]
- params = list(itertools.product(n_estimators, max_depths))
-
- train_X, train_Y, neg_pos_ratio = get_training_data(train_fns_sampled, train_label)
- perf_dfs = []
- for param in params:
- t0 = time()
- if classifier == "xgb":
- n_est, lr, md = param
- print(
- f"[TrainVal-Train] n_trees: {n_est}, lr: {lr}, max_depth: {md}, n/p ratio: {neg_pos_ratio}."
- )
- clf = train_XGBoost(train_X, train_Y, n_est, lr, md, neg_pos_ratio)
-
- elif classifier == "rf":
- n_est, md = param
- print(
- f"[TrainVal-Train] n_trees: {n_est}, max_depth: {md}, n/p ratio: {neg_pos_ratio}."
- )
- clf, _, _ = train_minibatch_RF(train_X, train_Y, n_est, md, neg_pos_ratio)
- t1 = time()
- print(f"[TrainVal-Train] done in {(t1-t0):.2f} seconds.")
- rank_dfs = rank_test_patients(
- clf, val_fns_sampled, train_X.columns.tolist(), test_label
- )
- obj_df = objective(rank_dfs)
- if classifier == "xgb":
- perf_df = pd.DataFrame(
- data=[
- [
- n_est,
- md,
- lr,
- obj_df["objective"],
- obj_df["var_top1_acc"],
- obj_df["var_top5_acc"],
- obj_df["var_top10_acc"],
- ]
- ],
- columns=[
- "n_estimator",
- "max_depth",
- "learning_rate",
- "objective",
- "var_top1",
- "var_top5",
- "var_top10",
- ],
- )
- else:
- perf_df = pd.DataFrame(
- data=[
- [
- n_est,
- md,
- obj_df["objective"],
- obj_df["var_top1_acc"],
- obj_df["var_top5_acc"],
- obj_df["var_top10_acc"],
- ]
- ],
- columns=[
- "n_estimator",
- "max_depth",
- "objective",
- "var_top1",
- "var_top5",
- "var_top10",
- ],
- )
- perf_dfs.append(perf_df)
- t2 = time()
- print(
- f"[TrainVal-Validate] done in {(t2-t1):.2f} seconds, objective = {obj_df['objective']}."
- )
- perf_dfs = pd.concat(perf_dfs).sort_values(
- ["objective", "var_top1", "var_top5", "var_top10"], ascending=False
- )
- return perf_dfs
-
-
-def quantify_perf(test_perfs, topK=5, mode="variant_level"):
- if mode == "sample_level":
- sample_perfs = test_perfs.groupby("identifier")[["ranking"]].min()
- else:
- sample_perfs = test_perfs.copy()
-
- topK_acc = (sample_perfs["ranking"] <= topK).sum() / sample_perfs.shape[0]
- return topK_acc
-
-
-def plot_topK_performance(rank_dfs):
- f = plt.figure(figsize=(10, 4))
- topks = range(1, 101)
- var_accs = []
- sample_accs = []
-
- for i in topks:
- var_accs.append(quantify_perf(rank_dfs, topK=i))
- sample_accs.append(quantify_perf(rank_dfs, topK=i, mode="sample_level"))
- df = pd.DataFrame(
- {"top_k": topks, "var_accuracy": var_accs, "sample_accuracy": sample_accs}
- )
- ax1 = plt.subplot(1, 2, 1)
- ax1.plot(topks, var_accs, linestyle="--", marker="o", color="blue")
- ax1.set_xlabel("Top K", fontsize=14)
- ax1.set_ylabel("Variant Accuracy", fontsize=14)
- ax2 = plt.subplot(1, 2, 2)
- ax2.plot(topks, sample_accs, linestyle="--", marker="o", color="green")
- ax2.set_xlabel("Top K", fontsize=14)
- ax2.set_ylabel("Sample Accuracy", fontsize=14)
- plt.subplots_adjust(wspace=0.3)
- return f, df
-
-
-def objective(rank_dfs):
- ### READ PARAMETERS
- f = open("meta.json")
- meta = json.load(f)
- alpha = meta["objective_params"]["alpha"]
- beta = meta["objective_params"]["beta"]
- gamma = meta["objective_params"]["gamma"]
- f.close()
- ### Get accuracy and calculate objective
- vl1 = quantify_perf(rank_dfs, topK=1)
- vl5 = quantify_perf(rank_dfs, topK=5)
- vl10 = quantify_perf(rank_dfs, topK=10)
- sl1 = quantify_perf(rank_dfs, topK=1, mode="sample_level")
- sl5 = quantify_perf(rank_dfs, topK=5, mode="sample_level")
- sl10 = quantify_perf(rank_dfs, topK=10, mode="sample_level")
- obj_score = alpha * vl1 + beta * (vl5 - vl1) + gamma * (vl10 - vl5)
- result = {
- "var_top1_acc": vl1,
- "var_top5_acc": vl5,
- "var_top10_acc": vl10,
- "sample_top1_acc": sl1,
- "sample_top5_acc": sl5,
- "sample_top10_acc": sl10,
- "objective": obj_score,
- }
- return result
-
-
-def train_XGBoost(X, Y, n_estimator, lr, max_depth, ratio):
- xgb.config_context(verbosity=0)
- bst = xgb.XGBClassifier(
- max_depth=max_depth,
- n_estimators=n_estimator,
- learning_rate=lr,
- scale_pos_weight=ratio,
- verbosity=0,
- n_jobs=-1,
- )
- bst.fit(X, Y)
- return bst
-
-
-def train_RF(X, Y, n_estimator=None, max_depth=None, ratio=None, model=None):
- if model is None:
- model = RF(
- class_weight={0: 1, 1: ratio},
- random_state=2021,
- n_jobs=-1,
- n_estimators=n_estimator,
- max_depth=max_depth,
- warm_start=True,
- )
- model.fit(X, Y)
- return model
-
-
-def train_minibatch_RF(fns, n_estimator, max_depth, label_col, batch=500):
- k = ceil(len(fns) / batch)
- model = RF(
- random_state=2021,
- n_jobs=-1,
- n_estimators=n_estimator,
- max_depth=max_depth,
- warm_start=True,
- )
- train_causals = []
- for i in range(k):
- if i < k - 1:
- batch_fns = fns[i * batch : (i + 1) * batch]
- else:
- batch_fns = fns[i * batch :]
- trainX, trainY, neg_pos_ratio = get_training_data(
- batch_fns, label_col=label_col
- )
- features = trainX.columns.tolist()
- model.set_params(
- **{"class_weight": {0: 1, 1: neg_pos_ratio}, "n_estimators": n_estimator}
- )
- model.fit(trainX, trainY)
- n_estimator += 10
- train_causals += trainX.index[np.where(trainY == 1)[0]].tolist()
- return model, features, train_causals
-
-
def rank_patient(model, fn, features, train_causals=None):
df = pd.read_csv(
fn, sep="\t", index_col=0
@@ -315,49 +24,3 @@ def rank_patient(model, fn, features, train_causals=None):
pred_df["ranking"] = max_rankings
pred_df["identifier"] = identifier
return pred_df
-
-
-def rank_test_patients(
- model,
- fns,
- features,
- label_col,
- causal_distr=None,
- out_folder=None,
- train_causals=None,
-):
- print(out_folder)
- causal_rank_dfs = []
- for i in trange(len(fns)):
- fn = fns[i]
-
- if not os.path.exists(fn):
- continue
-
- identifier = fn.split("/")[-1].split("_")[0]
- rank_df = rank_patient(model, fn, features, train_causals)
-
- if causal_distr is not None:
- rank_df = assign_confidence_score(
- causal_distr, rank_df, label_col="is_strong"
- )
-
- if out_folder is not None:
- if not os.path.exists(out_folder):
- os.mkdir(out_folder)
-
- rank_df.to_csv(f"{out_folder}/{identifier}_variant_rank.csv")
-
- causal_rank = get_causal_rank(rank_df, label_col)
-
- if causal_rank is not None:
- causal_rank_dfs.append(causal_rank)
-
- causal_rank_dfs = pd.concat(causal_rank_dfs)
- return causal_rank_dfs
-
-
-def get_causal_rank(pred_df, label_col):
- if (pred_df[label_col] == 1).sum() == 0:
- return None
- return pred_df.loc[pred_df[label_col] == 1, :]
diff --git a/bin/run_final.py b/bin/run_final.py
index 835aca0..dba3dd2 100755
--- a/bin/run_final.py
+++ b/bin/run_final.py
@@ -9,14 +9,19 @@
from predict_new.confidence import *
import sys
+jobfile_path = sys.argv[1]
+featurenames_path = sys.argv[2]
+matrix_path = sys.argv[3]
+output_path = sys.argv[4]
+
# read in precalulated model
-model = joblib.load("predict_new/hg19/final_model_wo_bg_val.job")
+model = joblib.load(jobfile_path)
# read in features to use
-features = list(pd.read_csv("predict_new/hg19/features.csv"))
+features = list(pd.read_csv(featurenames_path))
# predict with Linhua's code
-out = rank_patient(model, sys.argv[1] + ".matrix.txt", features, train_causals=None)
+out = rank_patient(model, matrix_path, features, train_causals=None)
# write out
-out.to_csv(sys.argv[1] + ".csv")
+out.to_csv(output_path)
diff --git a/bin/simple_repeat_anno.py b/bin/simple_repeat_anno.py
index 52b68b7..59a1c29 100755
--- a/bin/simple_repeat_anno.py
+++ b/bin/simple_repeat_anno.py
@@ -9,11 +9,11 @@ def simple_repeat_anno(sample_id, feature_file, sp_bed_file):
sample = feature_file
sample_bed = pd.DataFrame(
{
- "chr": sample.index.str.split("-").str[0],
- "start": sample.index.str.split("-").str[1],
- "end": sample.index.str.split("-").str[1],
- "ref": sample.index.str.split("-").str[2],
- "alt": sample.index.str.split("-").str[3],
+ "chr": sample.varId_dash.str.split("-").str[0],
+ "start": sample.varId_dash.str.split("-").str[1],
+ "end": sample.varId_dash.str.split("-").str[1],
+ "ref": sample.varId_dash.str.split("-").str[2],
+ "alt": sample.varId_dash.str.split("-").str[3],
}
)
sample_bed.to_csv("%s.bed" % (sample_id), index=False, header=False, sep="\t")
@@ -37,7 +37,8 @@ def simple_repeat_anno(sample_id, feature_file, sp_bed_file):
+ "-"
+ sample_sp[4].astype(str)
)
- sample["simple_repeat"] = sample.index.isin(
+ # NOTE(JL): This takes O(n^2) time complexity
+ sample["simple_repeat"] = sample.varId_dash.isin(
sample_sp["var_dash"].to_numpy()
).astype(int)
diff --git a/bin/trio/inh_utils.py b/bin/trio/inh_utils.py
new file mode 100644
index 0000000..3690cad
--- /dev/null
+++ b/bin/trio/inh_utils.py
@@ -0,0 +1,183 @@
+import pandas as pd
+import numpy as np
+from glob import glob
+import sys
+
+
+def matchID_in_inheritance(score_file, singleton_rank, inheritance_file, mode='test'):
+
+ score_df = pd.read_csv(score_file,
+ sep='\t', compression='gzip')
+ score_df = score_df.loc[~(score_df['chrom'].isin([25,26]))]
+ rank_df = pd.read_csv(singleton_rank, index_col=0)
+ inh_df = pd.read_csv(inheritance_file)
+ print('score_df:', score_df.shape)
+ print('rank_df:', rank_df.shape)
+ print('inh_df:', inh_df.shape)
+
+ inh_df = inh_df.loc[~(inh_df['varid'].str.startswith('GL'))].copy()
+ inh_df['varid'] = inh_df['varid'].str.replace('\.0','', regex=True)
+ inh_df['varid'] = inh_df['varid'].str.replace('X','23')
+ inh_df['varid'] = inh_df['varid'].str.replace('Y','24')
+
+ ### To accommodate the situation where hg38 files have "chr" in the chromosome naming: Remove 'chr' prefix from varId in inheritance file ###
+ inh_df['varid'] = inh_df['varid'].str.replace('^chr', '', regex=True)
+
+
+ score_df_2 = score_df.loc[:,['varId','geneEnsId']].copy()
+ score_df_2['varId'] = score_df_2['varId'].str.split('_').str[0] + '_'+\
+ score_df_2['varId'].str.split('_').str[1] + '_'+\
+ score_df_2['varId'].str.split('_').str[2] + '_'+\
+ score_df_2['varId'].str.split('_').str[3]
+ score_df_2 = score_df_2.loc[~score_df_2.duplicated()]
+ score_df_2 = score_df_2.groupby(by='varId', as_index=False).agg({'geneEnsId' : ','.join})
+ score_df_2['geneEnsId'] = score_df_2['geneEnsId'].str.replace('-','').str.rstrip(',')
+ n_rows = score_df_2.shape[0]
+ n_varid = len(np.unique(score_df_2['varId']))
+
+ # check if varId is unique
+ assert n_rows == n_varid
+ score_df_2.index = score_df_2['varId']
+
+ inh_df = inh_df.loc[inh_df['varid'].isin(score_df_2['varId'])].copy()
+ inh_df['geneEnsId'] = score_df_2.loc[inh_df['varid'], 'geneEnsId'].tolist()
+
+ assert np.all(inh_df['varid'].isin(rank_df.index))
+
+
+ if mode == 'test':
+ return inh_df
+
+ else:
+ if np.all(rank_df.loc[rank_df['is_causal']==1].index.isin(inh_df['varid'])):
+ return inh_df
+ else:
+ sys.exit('%s: not all causal variants in inheritance file!'%(ID))
+
+
+def matrix_inh(inh_df, rank_file):
+
+ # Load in rank file
+ rank_df = pd.read_csv(rank_file, index_col=0)
+
+ if 'confidence' in rank_df.columns.tolist():
+ rank_df = rank_df.drop(columns=['confidence','confidence level'])
+ features = rank_df.columns[:-4].tolist() + ['ranking']
+ features_name = features[:-2] + ['rf_score','rf_rank']
+ rank_df = rank_df.loc[:,features].copy()
+ rank_df.columns = features_name
+
+ # Load in inheritance file
+ inh_df.index = inh_df['varid']
+ # feature engineering
+ inh_df.loc[inh_df['GATK']=='Inherited','GATK'] = 0
+ inh_df.loc[inh_df['GATK']=='loConfDeNovo','GATK'] = 1
+ inh_df.loc[inh_df['GATK']=='hiConfDeNovo','GATK'] = 2
+ inh_df['GATK'] = inh_df['GATK'].astype(int)
+ inh_df['DeNovo'] = (inh_df['GATK']==2).astype(int)
+ inh_df['Father'] = ((inh_df['Pattern'].isin(['F','MF','NF','MU','UF','UU'])) | (inh_df['Pattern'].isin(['MN','NN']) & (inh_df['GATK']!=2))).astype(int)
+ inh_df['Mother'] = ((inh_df['Pattern'].isin(['M','MF','MN','MU','UF','UU'])) | (inh_df['Pattern'].isin(['NF','NN']) & (inh_df['GATK']!=2))).astype(int)
+ inh_df['Unknown'] = ((inh_df['DeNovo']==0) & (inh_df['Father']==0) & (inh_df['Mother']==0)).astype(int)
+
+ # Merge the two
+ rank_df = rank_df.loc[rank_df.index.isin(inh_df.index)].copy()
+ added_features = ['DeNovo','Father','Mother','Unknown','geneEnsId']
+ rank_df[added_features] = inh_df[added_features]
+ rank_df.loc[rank_df['geneEnsId'].isna(), 'geneEnsId'] = [str(i) for i in range(np.sum(rank_df['geneEnsId'].isna()))]
+
+ # Gene level variant summary
+ genes = list(set((','.join(rank_df['geneEnsId'].tolist())).split(','))) ### Changed from set to list ###
+ gene_vars_df = pd.DataFrame({#'Gene':genes,
+ 'Gene_vars_Father':0,
+ 'Gene_vars_Mother':0,
+ 'Gene_vars_DeNovo':0,
+ 'Gene_vars_Unknown':0}, index=genes)
+
+ genes_HMImpact = list(set((','.join(rank_df.loc[rank_df['IMPACT.from.Tier']>=3,'geneEnsId'].tolist())).split(',')))
+
+ for g in genes_HMImpact:
+ gene_df = rank_df.loc[rank_df['geneEnsId'].str.contains(g)].copy()
+ gene_df = gene_df.loc[gene_df['IMPACT.from.Tier']>=3].copy()
+ gene_vars_df.loc[g, 'Gene_vars_Father'] = gene_df['Father'].sum()
+ gene_vars_df.loc[g, 'Gene_vars_Mother'] = gene_df['Mother'].sum()
+ gene_vars_df.loc[g, 'Gene_vars_DeNovo'] = gene_df['DeNovo'].sum()
+ gene_vars_df.loc[g, 'Gene_vars_Unknown'] = gene_df['Unknown'].sum()
+
+ rank_df['varId'] = rank_df.index
+ rank_df_new = pd.DataFrame(np.ones((0,2)), columns=['varId','geneEnsId'])
+
+ for i in range(rank_df.shape[0]):
+ n_lines = len(rank_df.loc[rank_df.index[i],'geneEnsId'].split(','))
+ rank_df_new_tmp = rank_df.loc[np.repeat(rank_df.index[i],n_lines),['varId','geneEnsId']]
+ rank_df_new_tmp['geneEnsId'] = rank_df.loc[rank_df.index[i],'geneEnsId'].split(',')
+
+ rank_df_new = pd.concat([rank_df_new, rank_df_new_tmp], axis=0)
+ rank_df_new.index = np.arange(rank_df_new.shape[0])
+
+ rank_df_new['Gene_vars_Father'] = 0
+ rank_df_new['Gene_vars_Mother'] = 0
+ rank_df_new['Gene_vars_DeNovo'] = 0
+ rank_df_new['Gene_vars_Unknown'] = 0
+ rank_df_new.loc[:,['Gene_vars_Father','Gene_vars_Mother','Gene_vars_DeNovo','Gene_vars_Unknown']] = gene_vars_df.loc[rank_df_new['geneEnsId'].tolist(),
+ gene_vars_df.columns.tolist()].to_numpy()
+ rank_df_new = rank_df_new.groupby(by='varId')[['Gene_vars_Father','Gene_vars_Mother','Gene_vars_DeNovo','Gene_vars_Unknown']].max()
+ rank_df_new = rank_df_new.loc[rank_df.index]
+
+ assert np.all(rank_df.index == rank_df_new.index)
+
+ rank_df['Gene_vars_Father'] = rank_df_new['Gene_vars_Father']
+ rank_df['Gene_vars_Mother'] = rank_df_new['Gene_vars_Mother']
+ rank_df['Gene_vars_DeNovo'] = rank_df_new['Gene_vars_DeNovo']
+ rank_df['Gene_vars_Unknown'] = rank_df_new['Gene_vars_Unknown']
+
+ rank_df['No.Risk.Var'] = rank_df['No.Var.H'] + rank_df['No.Var.M']
+ rank_df['chrom'] = rank_df.index.str.split('_').str[0]
+ rank_df['chrom'] = rank_df['chrom'].astype(int)
+
+ rank_df['Homozygous_Recessive'] = ((rank_df['zyg']==2) & (rank_df['chrom']!=23)).astype(int)
+ rank_df['De_Novo'] = ((rank_df['DeNovo']==1) & (rank_df['zyg']==1)).astype(int)
+ rank_df['X_Linked'] = (rank_df['chrom']==23).astype(int)
+
+ rank_df['Compound_heterozygous'] = (
+ (rank_df['zyg']==1) &\
+ (rank_df['No.Risk.Var']>0) &\
+ (((rank_df['Father']==1) & ((rank_df['Gene_vars_Mother']+rank_df['Gene_vars_DeNovo'])>0)) |\
+ ((rank_df['Mother']==1) & ((rank_df['Gene_vars_Father']+rank_df['Gene_vars_DeNovo'])>0)))
+ ).astype(int)
+
+
+ rank_df['Inherited_dominant'] = (
+ (rank_df['zyg']==1) &\
+ (rank_df['DeNovo']==0) &\
+ (rank_df['X_Linked']==0) &\
+ (rank_df['Compound_heterozygous']==0)
+ ).astype(int)
+
+
+ rank_df['Homozygous_Recessive_Risk'] = rank_df['Homozygous_Recessive'].copy()
+ rank_df.loc[rank_df['IMPACT.from.Tier']<3,'Homozygous_Recessive_Risk'] = 0
+
+ rank_df['De_Novo_Risk'] = rank_df['De_Novo'].copy()
+ rank_df.loc[rank_df['IMPACT.from.Tier']<3,'De_Novo_Risk'] = 0
+
+ rank_df['X_Linked_Risk'] = rank_df['X_Linked'].copy()
+ rank_df.loc[rank_df['IMPACT.from.Tier']<3,'X_Linked_Risk'] = 0
+
+ rank_df['Compound_heterozygous_Risk'] = rank_df['Compound_heterozygous'].copy()
+ rank_df.loc[rank_df['IMPACT.from.Tier']<3,'Compound_heterozygous_Risk'] = 0
+
+ rank_df['Inherited_dominant_Risk'] = rank_df['Inherited_dominant'].copy()
+ rank_df.loc[rank_df['IMPACT.from.Tier']<3,'Inherited_dominant_Risk'] = 0
+
+ return rank_df #.to_csv('matrix_inh/%s.txt'%ID)
+
+
+
+
+
+
+
+
+
+
+
diff --git a/bin/trio_check_inheritance.GATK.py b/bin/trio_check_inheritance.GATK.py
new file mode 100755
index 0000000..9832edf
--- /dev/null
+++ b/bin/trio_check_inheritance.GATK.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import numpy as np
+import sys
+
+
+sampleID = sys.argv[1]
+vcf_file = sys.argv[2]
+ref_genome = sys.argv[3]
+proband_id = sys.argv[4]
+paternal_id = sys.argv[5]
+maternal_id = sys.argv[6]
+
+
+Y_autosomal = {'hg19':{'cut1':10001,
+ 'cut2':2649520,
+ 'cut3':59034050,
+ 'cut4':59363566},
+ 'hg38':{'cut1':10001,
+ 'cut2':2781479,
+ 'cut3':56887903,
+ 'cut4':57217415}}
+
+
+n = 0
+with open(vcf_file,'r') as F:
+ for line in F:
+ if line.startswith('##'):
+ n += 1
+ else:
+ break
+
+vcf_df = pd.read_csv(vcf_file, sep='\t', skiprows=n)
+##I removed this flag because the vcfs we're dealing with don't have the PASS flag...
+#vcf_df = vcf_df.loc[vcf_df['FILTER']=='PASS',:]
+
+probandID = [coln for coln in vcf_df.columns if proband_id in coln][0]
+paternalID = [coln for coln in vcf_df.columns if paternal_id in coln][0]
+maternalID = [coln for coln in vcf_df.columns if maternal_id in coln][0]
+
+
+# Preprocessing
+vcf_df['varid'] = vcf_df['#CHROM'].astype(str) + '_' + vcf_df['POS'].astype(str) + '_' + vcf_df['REF'] + '_' + vcf_df['ALT']
+vcf_df['Patient'] = vcf_df.loc[:,probandID].str.split(':').str[0].str.replace('|','/')
+vcf_df['Mother'] = vcf_df.loc[:,maternalID].str.split(':').str[0].str.replace('|','/')
+vcf_df['Father'] = vcf_df.loc[:,paternalID].str.split(':').str[0].str.replace('|','/')
+vcf_df = vcf_df.loc[~vcf_df['Patient'].isin(['./.','0/0']),:]
+vcf_df['Pattern'] = 'Left'
+#print(vcf_df)
+#print(vcf_df['Patient'])
+#print(vcf_df['Patient'].str.split('/',expand=True))
+#print(vcf_df[['Patient_0','Patient_1']])
+#quit()
+vcf_df[['Patient_0','Patient_1']] = vcf_df['Patient'].str.split('/',expand=True)
+vcf_df[['Mother_0','Mother_1']] = vcf_df['Mother'].str.split('/',expand=True)
+vcf_df[['Father_0','Father_1']] = vcf_df['Father'].str.split('/',expand=True)
+
+
+# make GT like 1/2, 0/4 to several lines with only 0 or 1
+'''
+multivar_df = vcf_df.loc[vcf_df['ALT'].str.contains(','),:]
+vcf_df = vcf_df.loc[~vcf_df['ALT'].str.contains(','),vcf_df.columns[0:16]]
+multivar_df.loc[:,'ALT_list'] = multivar_df['ALT'].str.split(',')
+multivar_df.loc[:,'ALT_int'] = multivar_df['ALT_list'].apply(lambda x: list(range(1,len(x)+1)))
+
+multivar_df.loc[:,'PGT_0_list'] = multivar_df.apply(lambda x: np.isin(np.array(x.ALT_int).astype(str),x.Patient_0).astype(int), axis=1)
+multivar_df.loc[:,'PGT_1_list'] = multivar_df.apply(lambda x: np.isin(np.array(x.ALT_int).astype(str),x.Patient_1).astype(int), axis=1)
+
+multivar_df.loc[:,'FGT_0_list'] = multivar_df.apply(lambda x: np.isin(np.array(x.ALT_int).astype(str),x.Father_0).astype(int), axis=1)
+multivar_df.loc[:,'FGT_1_list'] = multivar_df.apply(lambda x: np.isin(np.array(x.ALT_int).astype(str),x.Father_1).astype(int), axis=1)
+
+multivar_df.loc[:,'MGT_0_list'] = multivar_df.apply(lambda x: np.isin(np.array(x.ALT_int).astype(str),x.Mother_0).astype(int), axis=1)
+multivar_df.loc[:,'MGT_1_list'] = multivar_df.apply(lambda x: np.isin(np.array(x.ALT_int).astype(str),x.Mother_1).astype(int), axis=1)
+
+multivar_df.loc[:,'PGT_list'] = multivar_df.apply(lambda x: ['%s/%s'%(x.PGT_0_list[i],x.PGT_1_list[i]) for i in range(len(x.ALT_int))], axis=1)
+multivar_df.loc[:,'FGT_list'] = multivar_df.apply(lambda x: ['%s/%s'%(x.FGT_0_list[i],x.FGT_1_list[i]) for i in range(len(x.ALT_int))], axis=1)
+multivar_df.loc[:,'MGT_list'] = multivar_df.apply(lambda x: ['%s/%s'%(x.MGT_0_list[i],x.MGT_1_list[i]) for i in range(len(x.ALT_int))], axis=1)
+
+multivar_new = pd.DataFrame(np.ones((0,16)), columns=multivar_df.columns[0:16])
+
+for i in range(multivar_df.shape[0]):
+ n_lines = len(multivar_df.loc[multivar_df.index[i],'ALT_list'])
+ multivar_new_tmp = multivar_df.iloc[np.repeat(i,n_lines),0:16]
+ multivar_new_tmp['ALT'] = multivar_df.loc[multivar_df.index[i],'ALT_list']
+ multivar_new_tmp['Patient'] = multivar_df.loc[multivar_df.index[i],'PGT_list']
+ multivar_new_tmp['Mother'] = multivar_df.loc[multivar_df.index[i],'MGT_list']
+ multivar_new_tmp['Father'] = multivar_df.loc[multivar_df.index[i],'FGT_list']
+ multivar_new = pd.concat([multivar_new, multivar_new_tmp], axis=0)
+ multivar_new.index = np.arange(multivar_new.shape[0])
+
+multivar_new['varid'] = multivar_new['#CHROM'].astype(str) + '_' + multivar_new['POS'].astype(str) + '_' + multivar_new['REF'] + '_' + multivar_new['ALT']
+
+vcf_df = pd.concat([vcf_df, multivar_new], axis=0)
+'''
+
+vcf_df.index = np.arange(vcf_df.shape[0])
+vcf_df = vcf_df.loc[~vcf_df['Patient'].isin(['./.','0/0']),:]
+
+
+print('Total number of variants:', vcf_df.shape[0])
+
+
+# het condition
+bool_M = vcf_df['Patient'].isin(['0/1','1/0']) & (vcf_df['Mother'].str.find('1') != -1) & (vcf_df['Father'].str.find('1') == -1)
+bool_F = vcf_df['Patient'].isin(['0/1','1/0']) & (vcf_df['Father'].str.find('1') != -1) & (vcf_df['Mother'].str.find('1') == -1)
+
+# more specific if one parent is 1/1
+bool_UI = vcf_df['Patient'].isin(['0/1','1/0']) & (vcf_df['Father'].str.find('1') != -1) & (vcf_df['Mother'].str.find('1') != -1)
+bool_M = bool_M | (bool_UI & (vcf_df['Father']!='1/1') & (vcf_df['Mother']=='1/1'))
+bool_F = bool_F | (bool_UI & (vcf_df['Father']=='1/1') & (vcf_df['Mother']!='1/1'))
+
+bool_N = vcf_df['Patient'].isin(['0/1','1/0']) & (vcf_df['Father'].str.find('1') == -1) & (vcf_df['Mother'].str.find('1') == -1)
+
+vcf_df.loc[bool_UI,'Pattern'] = 'UI'
+vcf_df.loc[bool_M,'Pattern'] = 'M'
+vcf_df.loc[bool_F,'Pattern'] = 'F'
+vcf_df.loc[bool_N,'Pattern'] = 'N'
+
+# unknown cases
+vcf_df.loc[(vcf_df['Father']=='./.') & (vcf_df['Mother']=='./.') & bool_N, 'Pattern'] = 'U'
+
+
+
+# homo condition
+bool_NM = vcf_df['Patient'].isin(['1/1']) & (vcf_df['Mother'].str.find('1') != -1) & (vcf_df['Father'].str.find('1') == -1)
+bool_NF = vcf_df['Patient'].isin(['1/1']) & (vcf_df['Mother'].str.find('1') == -1) & (vcf_df['Father'].str.find('1') != -1)
+bool_B = vcf_df['Patient'].isin(['1/1']) & (vcf_df['Mother'].str.find('1') != -1) & (vcf_df['Father'].str.find('1') != -1)
+bool_N = vcf_df['Patient'].isin(['1/1']) & (vcf_df['Mother'].str.find('1') == -1) & (vcf_df['Father'].str.find('1') == -1)
+
+vcf_df.loc[bool_NM,'Pattern'] = 'MN'
+vcf_df.loc[bool_NF,'Pattern'] = 'NF'
+vcf_df.loc[bool_B,'Pattern'] = 'MF'
+vcf_df.loc[bool_N,'Pattern'] = 'NN'
+
+# Unknown cases - fix: if the other parent is 0/1 or 1/1
+vcf_df.loc[(vcf_df['Mother']=='./.') & bool_NF, 'Pattern'] = 'UF'
+vcf_df.loc[(vcf_df['Father']=='./.') & bool_NM, 'Pattern'] = 'MU'
+vcf_df.loc[(vcf_df['Father']=='./.') & (vcf_df['Mother']=='./.') & bool_N, 'Pattern'] = 'UU'
+
+
+
+
+# Y chromosome non-pseudoautosomal region
+# https://en.wikipedia.org/wiki/Pseudoautosomal_region
+bool_non_pseudo = (vcf_df['#CHROM'] == 'Y') & ((vcf_df['POS'] < Y_autosomal[ref_genome]['cut1']) | ((vcf_df['POS']>Y_autosomal[ref_genome]['cut2']) & (vcf_df['POS'] < Y_autosomal[ref_genome]['cut3'])) | (vcf_df['POS']>Y_autosomal[ref_genome]['cut4']))
+bool_F = vcf_df['Patient'].isin(['0/1','1/0']) & (vcf_df['Father'].str.find('1') != -1) & bool_non_pseudo
+bool_N = vcf_df['Patient'].isin(['0/1','1/0']) & (vcf_df['Father'].str.find('1') == -1) & bool_non_pseudo
+
+vcf_df.loc[bool_F,'Pattern'] = 'F'
+vcf_df.loc[bool_N,'Pattern'] = 'N'
+
+bool_F = vcf_df['Patient'].isin(['1/1']) & (vcf_df['Father'].str.find('1') != -1) & bool_non_pseudo
+bool_N = vcf_df['Patient'].isin(['1/1']) & (vcf_df['Father'].str.find('1') == -1) & bool_non_pseudo
+
+vcf_df.loc[bool_F,'Pattern'] = 'F'
+vcf_df.loc[bool_N,'Pattern'] = 'N'
+
+
+
+# GATK results
+bool_hidenovo = vcf_df['INFO'].str.find('hiConfDeNovo=%s'%(proband_id)) != -1
+bool_lowdenovo = vcf_df['INFO'].str.find('loConfDeNovo=%s'%(proband_id)) != -1
+
+vcf_df['GATK'] = 'Inherited'
+vcf_df.loc[bool_hidenovo, 'GATK'] = 'hiConfDeNovo'
+vcf_df.loc[bool_lowdenovo, 'GATK'] = 'loConfDeNovo'
+
+inheritance = vcf_df.loc[:,['varid','Patient','Mother','Father','Pattern','GATK']].copy()
+inheritance.to_csv('./%s.inheritance.txt'%(sampleID), index=False)
+
+inheritance_summary = (inheritance['GATK'] + '_' + inheritance['Pattern']).value_counts()
+inheritance_summary.to_csv('./%s.summary.txt'%(sampleID))
+print((inheritance['GATK'] + '_' + inheritance['Pattern']).value_counts())
diff --git a/bin/trio_merge_inheritance.py b/bin/trio_merge_inheritance.py
new file mode 100755
index 0000000..21324ed
--- /dev/null
+++ b/bin/trio_merge_inheritance.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3.8
+
+import sys
+
+from trio.inh_utils import *
+
+score_file = sys.argv[1]
+singleton_rank = sys.argv[2]
+inheritance_file = sys.argv[3]
+out_path = sys.argv[4]
+
+
+inh_df = matchID_in_inheritance(score_file, singleton_rank, inheritance_file)
+inh_df = matrix_inh(inh_df, singleton_rank)
+inh_df.to_csv(out_path, sep="\t")
+
+
diff --git a/bin/trio_merge_rm.py b/bin/trio_merge_rm.py
new file mode 100755
index 0000000..640ae7c
--- /dev/null
+++ b/bin/trio_merge_rm.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3.8
+
+import pandas as pd
+import sys
+
+###merge score files with final rank matrix
+
+#read rank matrix and score file
+feature=pd.read_csv("./"+sys.argv[1]+".trio.csv")
+ndg = pd.read_csv("./"+sys.argv[1]+".trio.NDG.csv")
+ndg = ndg.loc[:,["Unnamed: 0","predict","ranking"]]
+ndg.columns = ["Unnamed: 0","predict (nd)","ranking (nd)"]
+
+feature = feature.merge(ndg, on="Unnamed: 0", how="left")
+feature.to_csv("./"+sys.argv[1]+".trio.prediction.csv",index=False)
\ No newline at end of file
diff --git a/bin/trio_rename_VCFheader.py b/bin/trio_rename_VCFheader.py
new file mode 100755
index 0000000..b2e71f4
--- /dev/null
+++ b/bin/trio_rename_VCFheader.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+import sys
+
+proband_id = sys.argv[1]
+paternal_id = sys.argv[2]
+maternal_id = sys.argv[3]
+header_file = sys.argv[4]
+
+sample_list = [proband_id, paternal_id, maternal_id]
+
+header = open(header_file, 'r').readlines()[0]
+header = header.strip('\n').split('\t')[-3:]
+newheader = []
+
+for sample in header:
+ newheader += [i for i in sample_list if i in sample]
+
+assert len(newheader) == 3
+
+sample_name = open("./sample_ids.txt", 'w')
+sample_name.write('\n'.join(newheader))
+sample_name.close()
diff --git a/conf/base.config b/conf/base.config
new file mode 100644
index 0000000..0459025
--- /dev/null
+++ b/conf/base.config
@@ -0,0 +1,15 @@
+docker.enabled = true
+docker.fixOwnership = true
+docker.runOptions = '-u $(id -u):$(id -g)'
+
+validation.lenientMode = true
+
+process {
+ cpus = 2
+ memory = { 25.GB * task.attempt }
+ errorStrategy = { task.exitStatus in [143,137,104,134,139,140] ? 'retry' : 'finish' }
+ maxRetries = 5
+
+ container = "zhandongliulab/aim-lite:1.2"
+ cache = "lenient"
+}
diff --git a/conf/modules.config b/conf/modules.config
new file mode 100644
index 0000000..1d2f354
--- /dev/null
+++ b/conf/modules.config
@@ -0,0 +1,11 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Available keys to override module options:
+ ext.args = Additional arguments appended to command in module.
+ ext.args2 = Second set of arguments appended to command in module (multi-tool modules).
+ ext.args3 = Third set of arguments appended to command in module (multi-tool modules).
+ ext.prefix = File name prefix for output files.
+----------------------------------------------------------------------------------------
+*/
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2c71d24..7b29fdb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -15,13 +15,13 @@
# -- Project information -----------------------------------------------------
project = 'AI-MARRVEL'
-copyright = "2024, Zhandong Liu's Lab at Baylor College of Medicine"
+copyright = "2025, Zhandong Liu's Lab at Baylor College of Medicine"
author = 'Chaozhong Liu, Dongxue Mao'
# The short X.Y version
-version = 'v1.0'
+version = 'v1.1'
# The full version, including alpha/beta/rc tags
-release = '1.0.0'
+release = '1.1.0'
# -- General configuration ---------------------------------------------------
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 8c8f0a1..6d9b1f9 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -30,14 +30,15 @@ Running AIM
Use following command-line to run AIM.
.. code-block:: bash
-
- nextflow run Liuzlab/AI_MARRVEL -r nextflow_conversion \
- --ref_dir \
- --ref_ver [Reference genome: hg19/hg38] \
- --input_vcf \
- --input_hpo \
- --outdir \
- --run_id [Sample Id]
+ nextflow run /path/to/AI_MARRVEL/main.nf \\
+ -profile \\
+ --ref_dir /path/to/dependencies/ \\
+ --input_vcf /path/to/input.vcf \\
+ --input_hpo /path/to/input.hpos.txt \\
+ --outdir /path/to/output \\
+ --storedir /path/to/store \\
+ --run_id \\
+ --ref_ver
@@ -60,7 +61,7 @@ AIM utilizes various databases for variant annotation, all of which have been co
.. code-block:: bash
- aws s3 sync s3://aim-data-dependencies-2.0-public . --no-sign-request
+ aws s3 sync s3://aim-data-dependencies-2.3-public . --no-sign-request
.. warning::
diff --git a/docs/source/nf_usage.txt b/docs/source/nf_usage.txt
deleted file mode 100644
index f5b0129..0000000
--- a/docs/source/nf_usage.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-Usage:
-
- nextflow run main.nf [All Args] [Metadata Flags]
-
-Example:
-
- nextflow run main.nf --ref_dir path/to/aim_data_dependencies \
- --input_vcf path/to/sample.vcf.gz \
- --input_hpo path/to/sample/hpos.txt \
- --outdir path/to/sample/Output \
- --run_id [Sample ID] \
- --ref_ver [hg19/hg38]
-
-Args:
- --input_vcf Path to input VCF file.
- --input_hpo Path to input HPO file.
- --ref_dir Path to aim pipeline dependencies directory.
- --outdir Output directory.
- --run_id Unique identifier for this run. (default: 1)
- --ref_ver Reference genome version [hg38 or hg19] (default: hg19)
- --bed_filter Path to a BED file to perform an analysis only for regions of interest (optional)
- --exome_filter Enable exonic filter. Addition will filter out variants outside of exonic region (default: false)
-
-Metadata Flags:
- --help Display the usage information.
- --version Display the version of the AI Marrvel pipeline.
-
-Reference Files:
- --ref_loc Path to reference location file
- --ref_to_sym Path to reference to symbol file
- --ref_sorted_sym Path to reference sorted symbol file
- --ref_exonic_filter_bed Path to exonic filter BED file
-
-VEP Annotation:
- --vep_dir_cache Path to VEP cache directory
- --vep_dir_plugins Path to VEP plugins directory
- --vep_custom_gnomad Path to custom gnomAD file for VEP
- --vep_custom_clinvar Path to custom ClinVar file for VEP
- --vep_custom_hgmd Path to custom HGMD file for VEP
-
-For detailed information about each process,
-please refer to the documentation. https://ai-marrvel.readthedocs.io/en/latest/
diff --git a/docs/source/output.rst b/docs/source/output.rst
index cdd0226..ff37f29 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -23,8 +23,6 @@ Following the execution of AIM, you will receive five output files, which are sa
- Prediction results from AIM Novel Disease Gene mode.
* - \*_nd_recessive_predictions.csv
- Prediction results from AIM Novel Disease Gene + Recessive mode.
- * - \*_integrated.csv
- - Combining all 4 prediction results.
Output Explanation
======================
diff --git a/main.nf b/main.nf
index 6b10486..3137df8 100644
--- a/main.nf
+++ b/main.nf
@@ -1,726 +1,72 @@
nextflow.enable.dsl = 2
+include { validateParameters } from 'plugin/nf-schema'
-def showUsage() {
- if (params.help) {
- def helpFile = file(params.usage_file) // Specify your Markdown file path here
- if (helpFile.exists()) {
- println helpFile.text
- } else {
- println """
- Sorry something went wrong, usage file not found!
- Please vist our website for more info : https://ai-marrvel.readthedocs.io/en/latest/
- """
- }
- exit 0
- }
-}
-
-def showVersion() {
- if (!params.version) {
- return
- }
-
- println "1.0.0"
- exit 0
-}
+include {
+ showVersion
+} from "./modules/local/utils"
-def validateInputParams() {
- def checkPathParamMap = [
- "input_vcf": params.input_vcf,
- "input_hpo": params.input_hpo,
- "ref_dir" : params.ref_dir,
- "ref_ver" : params.ref_ver,
- ]
+include {
+ VCF_PRE_PROCESS_TRIO; GENERATE_TRIO_FEATURES; PREDICTION_TRIO
+} from "./modules/local/trio"
- checkPathParamMap.each { paramName, paramValue ->
- if (paramValue) {
- // Check if the file exists
- if(!(paramName == "ref_ver")) {
- def fileObj = file(paramValue, checkIfExists: true)
- // println("Check file: '--${paramName}' value '${paramValue}' ")
+include {
+ PREPARE_DATA
+} from "./subworkflows/local/prepare_data"
- // Check the file type based on the parameter name
- if (paramName == "input_vcf" && !(paramValue.endsWith(".vcf") || paramValue.endsWith(".vcf.gz"))) {
- println("Error: '--${paramName}' value '${paramValue}' should be a VCF file (.vcf) or (.vcf.gz)")
- println("To see usage and available parameters run `nextflow run main.nf --help`")
- exit 1
- } else if (paramName == "input_hpo" && !(paramValue.endsWith(".hpo") || paramValue.endsWith(".txt"))) {
- println("Error: '--${paramName}' value '${paramValue}' should be an HPO file (.hpo) or (.txt)")
- println("To see usage and available parameters run `nextflow run main.nf --help`")
- exit 1
- } else if (paramName == "ref_dir" && !fileObj.isDirectory()) {
- println("Error: '--${paramName}' value '${paramValue}' should be an directory.")
- println("To see usage and available parameters run `nextflow run main.nf --help`")
- exit 1
- }
- }
+include {
+ HANDLE_INPUT
+} from "./subworkflows/local/handle_input"
- if (paramName == "ref_ver" && !(paramValue.equals("hg19") || paramValue.equals("hg38")) ) {
- println("Error: '--${paramName}' value ${paramValue} should be either set to 'hg19' or 'hg38'.")
- println("To see usage and available parameters run `nextflow run main.nf --help`")
- exit 1
- }
+include {
+ VCF_PRE_PROCESS; GENERATE_SINGLETON_FEATURES; PREDICTION
+} from "./subworkflows/local/singleton"
- } else {
- println("Input parameter '${paramName}' not specified or is null!")
- println("To see usage and available parameters run `nextflow run main.nf --help`")
- exit 1
- }
- }
-}
-
-showUsage()
showVersion()
-validateInputParams()
-
-
-// Process to handle the VCF file
-process NORMALIZE_VCF {
- input:
- path vcf
-
- output:
- path "input.vcf.gz", emit: vcf
- path "input.vcf.gz.tbi", emit: tbi
-
- script:
- """
- # Determine the file type
- INPUT_VCF_TYPE="\$(file -b $vcf)"
-
- if echo "\${INPUT_VCF_TYPE}" | grep -q 'symbolic link to'; then
- SYM_LINK="\$(readlink -f $vcf)"
- INPUT_VCF_TYPE="\$(file -b \${SYM_LINK})"
- fi
-
-
- if echo "\${INPUT_VCF_TYPE}" | grep -q 'BGZF'; then
- echo "The file is in BGZF format, ready for tabix."
- cp $vcf input.vcf.gz
- elif echo "\${INPUT_VCF_TYPE}" | grep -q 'gzip compressed data'; then
- echo "GZIP format detected, converting to BGZF."
- gunzip -c $vcf | bgzip > input.vcf.gz
- elif echo "\${INPUT_VCF_TYPE}" | grep -q 'ASCII text'; then
- echo "Plain VCF file detected, compressing and indexing."
- bgzip -c $vcf > input.vcf.gz
- else
- echo "The file $vcf does not exist or is not a recognized format."
- exit 1
- fi
-
- tabix -p vcf input.vcf.gz
- """
-}
-
-process FILTER_BED {
- input:
- path vcf
- path tbi
- path ref_filter_bed
-
- output:
- path "${params.run_id}.recode.vcf.gz", emit: vcf
- path "${params.run_id}.recode.vcf.gz.tbi", emit: tbi
-
- script:
- """
- if [ -f ${ref_filter_bed} ]; then
- awk '{gsub(/^chr/, ""); print}' ${ref_filter_bed} > bed
- bcftools filter --regions-file bed ${vcf} -Oz -o "${params.run_id}.recode.vcf.gz"
- tabix -p vcf "${params.run_id}.recode.vcf.gz"
- else
- cp ${vcf} "${params.run_id}.recode.vcf.gz"
- cp ${tbi} "${params.run_id}.recode.vcf.gz.tbi"
- fi
- """
-}
-
-process BUILD_REFERENCE_INDEX {
- container "broadinstitute/gatk"
- storeDir "${params.outdir}/general/reference_index/"
-
- output:
- path "final_${params.ref_ver}.fa", emit: fasta
- path "final_${params.ref_ver}.fa.fai", emit: fasta_index
- path "final_${params.ref_ver}.dict", emit: fasta_dict
-
- script:
- """
- wget --quiet http://hgdownload.soe.ucsc.edu/goldenPath/${params.ref_ver}/bigZips/${params.ref_ver}.fa.gz
- gunzip ${params.ref_ver}.fa.gz
- sed 's/>chr/>/g' ${params.ref_ver}.fa > num_prefix_${params.ref_ver}.fa
- samtools faidx num_prefix_${params.ref_ver}.fa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M > final_${params.ref_ver}.fa
- samtools faidx final_${params.ref_ver}.fa
- gatk CreateSequenceDictionary -R final_${params.ref_ver}.fa
- """
-}
-
-process CONVERT_GVCF {
- container "broadinstitute/gatk"
-
- input:
- path vcf
- path tbi
- path fasta
- path fasta_index
- path fasta_dict
- path chrmap_file
-
- output:
- path "${params.run_id}.nog.vcf.gz", emit: vcf
- path "${params.run_id}.nog.vcf.gz.tbi", emit: tbi
-
-
- script:
- """
- # Define input/output paths and reference genome
- reference_genome="$fasta"
- input_file="$vcf"
- output_file="${params.run_id}.nog.vcf.gz"
-
- # Step 0: Check for
- zcat "\$input_file" | head -n 10000 | grep -q "" || { echo "It's not gVCF"; cp $vcf ${params.run_id}.nog.vcf.gz; cp $tbi ${params.run_id}.nog.vcf.gz.tbi; exit 0; }
-
- # Step 1: Annotate and remove ID field
- echo "Step 1: Annotating and removing ID field..."
- bcftools annotate --rename-chrs "$chrmap_file" -x ID "\$input_file" -Oz -o step1.vcf.gz
- tabix step1.vcf.gz
- bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y step1.vcf.gz -Oz -o step1_1.vcf.gz
-
- # Step 2: Sort the VCF file
- echo "Step 2: Sorting the VCF file..."
- bcftools sort step1_1.vcf.gz -Oz -o step2.vcf.gz
-
- # Step 2.1: Index step2.vcf.gz with tabix
- echo "Indexing step2.vcf.gz with tabix..."
- tabix -p vcf step2.vcf.gz
-
- # Step 3: Genotype GVCFs with GATK
- echo "Step 3: Running GenotypeGVCFs with GATK..."
- gatk GenotypeGVCFs -R "\$reference_genome" -V step2.vcf.gz -O step3.vcf.gz --allow-old-rms-mapping-quality-annotation-data
-
- # Step 4: Filter based on quality with GATK
- echo "Step 4: Running VariantFiltration with GATK..."
- gatk VariantFiltration -V step3.vcf.gz -O step4.vcf.gz --filter-expression "QUAL < 30.0" --filter-name "LowQual"
-
- # Rename the final output file
- echo "Renaming the final output file..."
- mv step4.vcf.gz "\$output_file"
-
- # Index the final output using tabix
- echo "Indexing the final output with tabix..."
- tabix -p vcf "\$output_file"
-
- # Display the number of non-comment lines (ignore lines starting with #)
- lines=\$(zcat "\$output_file" | grep -v '^#' | wc -l)
- echo "File: \$output_file has \$lines lines (excluding comment lines)."
-
- # Clean up intermediate files
- echo "Cleaning up intermediate files..."
- rm -f step*.vcf.gz*
- """
-}
-
-
-process FILTER_UNPASSED {
- input:
- path vcf
- path tbi
- path chrmap
-
- output:
- path "${params.run_id}.filt.vcf.gz", emit: vcf
- path "${params.run_id}.filt.vcf.gz.tbi", emit: tbi
-
- script:
- """
- # Annotate with new chromosomes and preserve original coordinates in ID
- bcftools annotate --rename-chrs $chrmap -x ID $vcf -Oz -o ${params.run_id}-annot
-
- # Annotate with new IDs based on CHROM, POS, REF, ALT
- bcftools annotate --set-id +'%CHROM\\_%POS\\_%REF\\_%FIRST_ALT' ${params.run_id}-annot -Oz -o ${params.run_id}-add-id.vcf.gz
-
- #run quality filters
- bcftools filter ${params.run_id}-add-id.vcf.gz -i'FILTER == "PASS"' -Oz -o ${params.run_id}.filt.vcf.gz
-
- #check number of variants left
- variant_count=\$(bcftools view -H ${params.run_id}.filt.vcf.gz | wc -l)
- if [ "\${variant_count}" -gt 0 ]; then
- echo "Quality filtering completed successfully. Variants passing the filters: \${variant_count}"
- else
- echo "The VCF file doesn't have FILTER annotation, or all variants filtered."
- echo "Pipeline will proceed with unfiltered VCF file."
- cp ${params.run_id}-add-id.vcf.gz ${params.run_id}.filt.vcf.gz
- fi
- tabix -p vcf ${params.run_id}.filt.vcf.gz
- """
-}
-
-
-process FILTER_MITO_AND_UNKOWN_CHR {
- publishDir "${params.outdir}/${params.run_id}/vcf/", mode: 'copy'
- input:
- path vcf
- path tbi
-
- output:
- path "${params.run_id}.filt.rmMT.vcf.gz", emit: vcf
- path "${params.run_id}.filt.rmMT.vcf.gz.tbi", emit: tbi
-
-
- script:
- """
- bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y $vcf -o ${params.run_id}.filt.rmMT.vcf
-
- bgzip ${params.run_id}.filt.rmMT.vcf
- tabix ${params.run_id}.filt.rmMT.vcf.gz
- """
-}
-
-
-process VCF_TO_VARIANTS {
- input:
- path vcf
-
- output:
- path "${params.run_id}-var-filt.txt", emit: var
-
- script:
- """
- zcat $vcf | awk 'substr(\$0, 1, 1) != "#"' | cut -f1,2,4,5 | sed 's/\t/:/g' > ${params.run_id}-var.txt
- cat ${params.run_id}-var.txt | sed 's/chr//g' | sort -u > ${params.run_id}-var-filt.txt
- """
-}
-
-
-process VARIANTS_TO_ENSEMBL {
- input:
- path var
- path ref
-
- output:
- path "${params.run_id}-ensmbl.txt"
-
- script:
- """
- location_to_gene.py $var $ref | \\
- sed 's/:/\\t/g' | sed 's/X\\t/23\\t/g' | sed 's/Y\\t/24\\t/g' | \\
- sed 's/MT\\t/25\\t/g' > ${params.run_id}-ensmbl.txt
- """
-}
-
-
-process ENSEMBL_TO_GENESYM {
- input:
- path ensmbl
- path ref_to_sym
- path ref_sorted_sym
-
- output:
- path "${params.run_id}-gene.txt", emit: gene
-
- script:
- """
- cat $ensmbl | sort -k5,5 | join -1 5 -2 1 - $ref_to_sym | sed 's/ /\\t/g' | cut -f2- > genesym.txt
- cat genesym.txt | cut -f5 | sort -u | join -t\$'\\t' -1 1 -2 2 - $ref_sorted_sym | cut -f2 | sort -u > ${params.run_id}-gene.txt
- """
-}
-
-
-process GENESYM_TO_PHRANK {
- publishDir "${params.outdir}/${params.run_id}/phrank/", mode: 'copy'
-
- input:
- path gene
- path hpo
- path dagfile
- path disease_annotation
- path gene_annotation
- path disease_gene
-
- output:
- path "${params.run_id}.phrank.txt", emit: phrank
-
- script:
- """
- run_phrank.py \\
- $gene $hpo $dagfile $disease_annotation $gene_annotation $disease_gene > ${params.run_id}.phrank.txt
- """
-}
-
-
-process HPO_SIM {
- input:
- path hpo
- path omim_hgmd_phen
- path omim_obo
- path omim_genemap2
- path omim_pheno
-
- output:
- path "${params.run_id}-cz", emit: hgmd_sim
- path "${params.run_id}-dx", emit: omim_sim
-
- script:
- """
- if [[ -z \$(egrep 'HP:[0-9]{7}' $hpo) ]] ; then
- echo "HP:0000001" > $hpo
- fi
-
- phenoSim.R $hpo $omim_hgmd_phen $omim_obo $omim_genemap2 $omim_pheno \\
- ${params.run_id}-cz ${params.run_id}-dx
- """
-
-}
-
-process FILTER_PROBAND {
- publishDir "${params.outdir}/${params.run_id}/vcf/", mode: 'copy'
-
- input:
- path vcf
- path tbi
- path ref_gnomad_genome
- path ref_gnomad_genome_idx
- path ref_gnomad_exome
- path ref_gnomad_exome_idx
-
- output:
- path "${params.run_id}.filt.rmBL.vcf", emit: vcf
-
- script:
- """
- mkdir -m777 isec_tmp1
- bcftools isec -p isec_tmp1 -w 1 -Oz \\
- $vcf $ref_gnomad_genome
- # tabix isec_tmp1/0000.vcf.gz
-
- mkdir -m777 isec_tmp2
- bcftools isec -p isec_tmp2 -w 1 -Oz \\
- $vcf $ref_gnomad_exome
- # tabix isec_tmp2/0000.vcf.gz
-
- mkdir -m777 isec_tmp3
- bcftools isec -p isec_tmp3 -Ov \\
- isec_tmp1/0000.vcf.gz isec_tmp2/0000.vcf.gz
-
- mv isec_tmp3/0002.vcf ${params.run_id}.filt.rmBL.vcf
- """
-
-}
-
-process SPLIT_VCF_BY_CHROMOSOME {
- input:
- path vcf
-
- output:
- path "chr*.vcf.gz", emit: chr_vcfs
-
- script:
- """
- # Get the list of chromosomes from the VCF file
-
- bgzip ${vcf}
- bcftools index ${vcf}.gz
-
- bcftools query -f '%CHROM\n' ${vcf}.gz | sort | uniq > chrom_list.txt
- # Split the VCF file by chromosome
- while read chrom; do
- bcftools view -r \${chrom} ${vcf}.gz -Oz -o chr\${chrom}.vcf.gz
- done < chrom_list.txt
- """
-}
-
-process ANNOTATE_BY_VEP {
- tag "${vcf.simpleName}"
- publishDir "${params.outdir}/${params.run_id}/vep/", mode: "copy"
-
- input:
- path vcf
- path vep_dir_cache
- path vep_dir_plugins
- path vep_custom_gnomad
- path vep_custom_clinvar
- path vep_custom_hgmd
- path vep_plugin_revel
- path vep_plugin_spliceai_snv
- path vep_plugin_spliceai_indel
- path vep_plugin_cadd
- path vep_plugin_dbnsfp
- path vep_idx
-
- output:
- path "${vcf.baseName}-vep.txt", emit: vep_output
-
- script:
- def ref_assembly = (params.ref_ver == 'hg38') ? 'GRCh38' : 'GRCh37'
- """
- /opt/vep/src/ensembl-vep/vep \\
- --dir_cache ${vep_dir_cache} \\
- --dir_plugins ${vep_dir_plugins} \\
- --fork ${task.cpus} --everything --format vcf \\
- --cache --offline --tab --force_overwrite \\
- --species homo_sapiens --assembly ${ref_assembly} \\
- --custom ${vep_custom_gnomad},gnomADg,vcf,exact,0,AF,AF_popmax,controls_nhomalt \\
- --custom ${vep_custom_clinvar},clinvar,vcf,exact,0,CLNREVSTAT,CLNSIG,CLNSIGCONF \\
- --custom ${vep_custom_hgmd},hgmd,vcf,exact,0,CLASS,GENE,PHEN,RANKSCORE \\
- --af_gnomad --plugin REVEL,${vep_plugin_revel},ALL \\
- --plugin SpliceAI,snv=${vep_plugin_spliceai_snv},indel=${vep_plugin_spliceai_indel},cutoff=0.5 \\
- --plugin CADD,${vep_plugin_cadd},ALL \\
- --plugin dbNSFP,${vep_plugin_dbnsfp},ALL \\
- --individual all --output_file ${vcf.baseName}-vep.txt --input_file $vcf \\
- --buffer_size 50
- """
-}
-
-process ANNOTATE_BY_MODULES {
- tag "${vep.simpleName}"
-
- input:
- path vep
- path hgmd_sim, stageAs: "hgmd_sim.tsv"
- path omim_sim, stageAs: "omim_sim.tsv"
- path ref_annot_dir
-
- output:
- path "${vep.baseName}_scores.csv", emit: scores
-
- script:
- """
- feature.py \\
- -patientHPOsimiOMIM $omim_sim \\
- -patientHPOsimiHGMD $hgmd_sim \\
- -varFile ${vep} \\
- -inFileType vepAnnotTab \\
- -patientFileType one \\
- -genomeRef ${params.ref_ver} \\
- -diseaseInh AD \\
- -modules curate,conserve
-
- mv scores.csv ${vep.baseName}_scores.csv
- """
-}
-
-process JOIN_TIER_PHRANK {
- tag "${scores.simpleName}"
-
- input:
- path scores
- path phrank
-
- path ref_annot_dir
- path ref_var_tier_dir
- path ref_merge_expand_dir
-
- output:
- path "${scores.simpleName}_scores.txt.gz", emit: compressed_scores
- path "${scores.simpleName}_Tier.v2.tsv", emit: tier
-
- script:
- """
- mv $scores scores.csv
- VarTierDiseaseDBFalse.R ${params.ref_ver}
- generate_new_matrix_2.py ${params.run_id} ${params.ref_ver}
- mv scores.txt.gz ${scores.simpleName}_scores.txt.gz
- mv Tier.v2.tsv ${scores.simpleName}_Tier.v2.tsv
- """
-}
-
-process MERGE_SCORES_BY_CHROMOSOME {
- publishDir "${params.outdir}/${params.run_id}/merged", mode: "copy"
-
- input:
- path phrank
- path tier, stageAs: "*_Tier.v2.tsv"
- path compressed_scores, stageAs: "*_scores.txt.gz"
-
- path ref_annot_dir
- path ref_mod5_diffusion_dir
- path ref_merge_expand_dir
-
- output:
- path "${params.run_id}.matrix.txt", emit: merged_matrix
- path "scores.txt.gz", emit: merged_compressed_scores
-
- script:
- """
- # Merge matrices
- awk 'FNR==1 && NR!=1{next;}{print}' ${tier} > Tier.v2.tsv
-
- # Merge compressed scores
- header_written=false
-
- for file in ${compressed_scores}; do
- if [ "\$header_written" = false ]; then
- # Write the first file's content, including the header
- zcat \${file} | gzip > scores.txt.gz
- header_written=true
- else
- # Skip the header and append the rest of the data
- zcat \${file} | tail -n +2 | gzip >> scores.txt.gz
- fi
- done
-
- post_processing.py ${params.run_id} ${params.ref_ver}
- """
-}
-
-process PREDICTION {
- publishDir "${params.outdir}/${params.run_id}/prediction/", mode: "copy"
-
- input:
- path merged_matrix
- path merged_compressed_scores
-
- path ref_predict_new_dir
- path ref_model_inputs_dir
-
- output:
- path "conf_4Model/*.csv"
- path "conf_4Model/integrated/*.csv"
-
- script:
- """
- mkdir final_matrix_expanded
- mkdir conf_4Model
-
- run_final.py ${params.run_id}
- merge_rm.py ${params.run_id}
- extraModel_main.py -id ${params.run_id}
- """
-}
-
-workflow VCF_PRE_PROCESS {
- take:
- vcf
- tbi
- fasta
- fasta_index
- fasta_dict
-
- main:
- CONVERT_GVCF(
- vcf,
- tbi,
- fasta,
- fasta_index,
- fasta_dict,
- params.chrmap
- )
- FILTER_UNPASSED(
- CONVERT_GVCF.out.vcf,
- CONVERT_GVCF.out.tbi,
- params.chrmap
- )
- FILTER_MITO_AND_UNKOWN_CHR(
- FILTER_UNPASSED.out.vcf,
- FILTER_UNPASSED.out.tbi,
- )
- FILTER_BED(
- FILTER_MITO_AND_UNKOWN_CHR.out.vcf,
- FILTER_MITO_AND_UNKOWN_CHR.out.tbi,
- moduleDir.resolve(params.ref_filter_bed),
- )
- FILTER_PROBAND(
- FILTER_BED.out.vcf,
- FILTER_BED.out.tbi,
- params.ref_gnomad_genome,
- params.ref_gnomad_genome_idx,
- params.ref_gnomad_exome,
- params.ref_gnomad_exome_idx
- )
-
- emit:
- vcf = FILTER_PROBAND.out.vcf
-}
-
-workflow PHRANK_SCORING {
- take:
- vcf
-
- main:
- VCF_TO_VARIANTS(vcf)
- VARIANTS_TO_ENSEMBL(VCF_TO_VARIANTS.out, params.ref_loc)
- ENSEMBL_TO_GENESYM(VARIANTS_TO_ENSEMBL.out, params.ref_to_sym, params.ref_sorted_sym)
- GENESYM_TO_PHRANK(ENSEMBL_TO_GENESYM.out,
- params.input_hpo,
- params.phrank_dagfile,
- params.phrank_disease_annotation,
- params.phrank_gene_annotation,
- params.phrank_disease_gene)
-
- emit:
- phrank = GENESYM_TO_PHRANK.out
-}
+validateParameters()
workflow {
- NORMALIZE_VCF(params.input_vcf)
- BUILD_REFERENCE_INDEX()
-
- VCF_PRE_PROCESS(
- NORMALIZE_VCF.out.vcf,
- NORMALIZE_VCF.out.tbi,
- BUILD_REFERENCE_INDEX.out.fasta,
- BUILD_REFERENCE_INDEX.out.fasta_index,
- BUILD_REFERENCE_INDEX.out.fasta_dict,
- )
-
- SPLIT_VCF_BY_CHROMOSOME(VCF_PRE_PROCESS.out.vcf)
- ANNOTATE_BY_VEP(
- SPLIT_VCF_BY_CHROMOSOME.out.chr_vcfs.flatten(),
- params.vep_dir_cache,
- params.vep_dir_plugins,
- params.vep_custom_gnomad,
- params.vep_custom_clinvar,
- params.vep_custom_hgmd,
- params.vep_plugin_revel,
- params.vep_plugin_spliceai_snv,
- params.vep_plugin_spliceai_indel,
- params.vep_plugin_cadd,
- params.vep_plugin_dbnsfp,
- file(params.vep_idx)
- )
-
- HPO_SIM(params.input_hpo,
- params.omim_hgmd_phen,
- params.omim_obo,
- params.omim_genemap2,
- params.omim_pheno)
-
- ANNOTATE_BY_MODULES (
- ANNOTATE_BY_VEP.out.vep_output,
- HPO_SIM.out.hgmd_sim,
- HPO_SIM.out.omim_sim,
- file(params.ref_annot_dir)
- )
-
- PHRANK_SCORING(
- NORMALIZE_VCF.out.vcf,
- )
-
- JOIN_TIER_PHRANK (
- ANNOTATE_BY_MODULES.out.scores,
- PHRANK_SCORING.out,
-
- file(params.ref_annot_dir),
- file(params.ref_var_tier_dir),
- file(params.ref_merge_expand_dir)
- )
+ data = PREPARE_DATA()
+ chrmap_file = data.map { it.chrmap_file }
+ ref_model_inputs_dir = data.map { it.ref_model_inputs_dir }
+ fasta_tuple = data.map { it.fasta_tuple }
+
+ (vcf, hpo) = HANDLE_INPUT()
+
+ if (params.input_ped) {
+ vcf = VCF_PRE_PROCESS_TRIO(
+ vcf,
+ file(params.input_ped),
+ fasta_tuple.map { it[0] },
+ fasta_tuple.map { it[1] },
+ fasta_tuple.map { it[2] },
+ chrmap_file,
+ )
+ }
- MERGE_SCORES_BY_CHROMOSOME(
- PHRANK_SCORING.out,
- JOIN_TIER_PHRANK.out.tier.collect(),
- JOIN_TIER_PHRANK.out.compressed_scores.collect(),
- file(params.ref_annot_dir),
- file(params.ref_mod5_diffusion_dir),
- file(params.ref_merge_expand_dir)
- )
+ if (!params.input_variant && !params.input_phenopacket) {
+ vcf = VCF_PRE_PROCESS(
+ vcf,
+ data,
+ )
+ }
- // Run Prediction on the final merged output
+ GENERATE_SINGLETON_FEATURES(vcf, hpo, data)
PREDICTION(
- MERGE_SCORES_BY_CHROMOSOME.out.merged_matrix,
- MERGE_SCORES_BY_CHROMOSOME.out.merged_compressed_scores,
- file(params.ref_predict_new_dir),
- file(params.ref_model_inputs_dir)
+ GENERATE_SINGLETON_FEATURES.out.merged_matrix,
+ GENERATE_SINGLETON_FEATURES.out.merged_compressed_scores,
+ ref_model_inputs_dir,
)
+
+ if (params.input_ped) {
+ GENERATE_TRIO_FEATURES(
+ GENERATE_SINGLETON_FEATURES.out.merged_compressed_scores,
+ PREDICTION.out.default_predictions,
+ VCF_PRE_PROCESS_TRIO.out.inheritance,
+ )
+ PREDICTION_TRIO(
+ GENERATE_SINGLETON_FEATURES.out.merged_compressed_scores,
+ GENERATE_TRIO_FEATURES.out.triomatrix,
+ )
+ }
}
diff --git a/modules.json b/modules.json
new file mode 100644
index 0000000..547c8d9
--- /dev/null
+++ b/modules.json
@@ -0,0 +1,5 @@
+{
+ "name": "LiuzLab/AI_MARRVEL",
+ "homePage": "https://github.com/LiuzLab/AI_MARRVEL",
+ "repos": {}
+}
diff --git a/modules/local/prepare_data/main.nf b/modules/local/prepare_data/main.nf
new file mode 100644
index 0000000..9b12ee9
--- /dev/null
+++ b/modules/local/prepare_data/main.nf
@@ -0,0 +1,173 @@
+process RESTRUCTURE_LOCAL_DATA_EXCEPT_VEP {
+ input:
+ path local_data_path
+
+ output:
+ path "*"
+
+ """
+ find $local_data_path -follow -mindepth 1 -maxdepth 1 -not -name "vep" -exec ln -s {} . \\;
+ """
+}
+
+process RESTRUCTURE_LOCAL_DATA_ONLY_VEP {
+ input:
+ path local_data_path
+
+ output:
+ path "vep"
+
+ """
+ ln -s $local_data_path/vep .
+ """
+}
+
+process STORE_S3_BUCKET_DATA_EXCEPT_VEP {
+ tag "Takes 5 minutes to run"
+ storeDir "./${params.storedir}/s3_bucket_data_except_vep/${params.s3_bucket_data_name}"
+
+ output:
+ path "*"
+
+ """
+ aws s3 sync s3://${params.s3_bucket_data_name} . --exclude "vep/*"
+ """
+}
+
+process STORE_S3_BUCKET_DATA_ONLY_VEP {
+ storeDir "./${params.storedir}/s3_bucket_data_only_vep/v0"
+
+ input:
+ val s3_bucket_data_name
+
+ output:
+ path "vep"
+
+ // NOTE(JL): The path may be changed to s3://aim-data-dependencies-public-vep-v0 in future
+ """
+ aws s3 sync s3://$s3_bucket_data_name/vep ./vep
+ """
+}
+
+process SPLIT_DATA {
+ input:
+ path "data_except_vep/*"
+ path "data_only_vep/*"
+ path bed_filter_file // NOTE(JL): Not tested yet
+ val exome_filter // NOTE(JL): Not tested yet
+
+ output:
+ path "data_except_vep/bcf_annotate/chrmap.txt", emit: chrmap_file
+ path "ref_filter.bed", emit: ref_filter_bed
+
+ path "data_except_vep/annotate", emit: ref_annot_dir
+ path "data_except_vep/var_tier", emit: ref_var_tier_dir
+ path "data_except_vep/merge_expand", emit: ref_merge_expand_dir
+ path "data_except_vep/mod5_diffusion", emit: ref_mod5_diffusion_dir
+ path "data_except_vep/predict_new", emit: ref_predict_new_dir
+ path "data_except_vep/model_inputs", emit: ref_model_inputs_dir
+
+ // for phrank
+ tuple(
+ path("ref_loc.txt"), // ref_loc
+ path("data_except_vep/phrank/${params.ref_ver}/ensembl_to_symbol.txt"), // ref_to_sym
+
+ path("data_except_vep/phrank/${params.ref_ver}/child_to_parent.txt"), // dagfile
+ path("data_except_vep/phrank/${params.ref_ver}/disease_to_pheno.txt"), // disease_annotation
+ path("data_except_vep/phrank/${params.ref_ver}/gene_to_phenotype.txt"), // gene_annotation
+ path("data_except_vep/phrank/${params.ref_ver}/disease_to_gene.txt"), // disease_gene
+ emit: phrank_tuple
+ )
+
+ // OMIM and HPO
+ tuple(
+ path("data_except_vep/omim_annotate/${params.ref_ver}/HGMD_phen.tsv"), // omim_hgmd_phen
+ path("data_except_vep/omim_annotate/hp.obo"), // omim_obo
+ path("data_except_vep/omim_annotate/${params.ref_ver}/genemap2_v2022.rds"), // omim_genemap2
+ path("data_except_vep/omim_annotate/${params.ref_ver}/HPO_OMIM.tsv"), // omim_pheno
+ emit: omim_tuple
+ )
+
+ // GNOMAD VCF
+ tuple(
+ path("data_except_vep/filter_vep/${params.ref_ver}/gnomad.${params.ref_ver}.blacklist.genomes.vcf.gz"), // ref_gnomad_genome
+ path("data_except_vep/filter_vep/${params.ref_ver}/gnomad.${params.ref_ver}.blacklist.genomes.vcf.gz.tbi"), // ref_gnomad_genome_idx
+ path("data_except_vep/filter_vep/${params.ref_ver}/gnomad.${params.ref_ver}.blacklist.exomes.vcf.gz"), // ref_gnomad_exome
+ path("data_except_vep/filter_vep/${params.ref_ver}/gnomad.${params.ref_ver}.blacklist.exomes.vcf.gz.tbi"), // ref_gnomad_exome_idx
+ emit: gnomad_tuple
+ )
+
+ // VEP
+ tuple(
+ path("data_only_vep/vep/${params.ref_ver}/"), // vep_dir_cache
+ path("data_only_vep/vep/${params.ref_ver}/Plugins/"), // vep_dir_plugins
+ path("gnomad.genomes*.vcf.gz", arity: 1), // vep_custom_gnomad
+ path("data_only_vep/vep/${params.ref_ver}/clinvar_20220730.vcf.gz"), // vep_custom_clinvar
+ path("data_only_vep/vep/${params.ref_ver}/HGMD_Pro_2022.2_${params.ref_ver}.vcf.gz"), // vep_custom_hgmd
+ path("new_tabbed_revel*.tsv.gz", arity: 1), // vep_plugin_revel
+ path("data_only_vep/vep/${params.ref_ver}/spliceai_scores.masked.snv.${params.ref_ver}.vcf.gz"), // vep_plugin_spliceai_snv
+ path("data_only_vep/vep/${params.ref_ver}/spliceai_scores.masked.indel.${params.ref_ver}.vcf.gz"), // vep_plugin_spliceai_indel
+ path("*_whole_genome_*.tsv.gz", arity: 1), // vep_plugin_cadd
+ path("dbNSFP*.gz", arity: 1), // vep_plugin_dbnsfp
+ path("data_only_vep/vep/${params.ref_ver}/*.tbi"), // vep_idx
+ emit: vep_tuple
+ )
+
+ script:
+ """
+ # tree .
+
+ ref_assembly=\$( [ "${params.ref_ver}" = "hg19" ] && echo "grch37" || echo "grch38" )
+ ln -s data_except_vep/phrank/${params.ref_ver}/\${ref_assembly}_symbol_to_location.txt ./ref_loc.txt
+
+ if [[ -s "${bed_filter_file}" ]]; then # Check if the bed filter file is not empty.
+ ln -s "${bed_filter_file}" ref_filter.bed
+ elif [[ "${exome_filter}" == "true" ]]; then # Check if the exome filter is true.
+ ln -s "./data_except_vep/filter_exonic/${params.ref_ver}.bed" ref_filter.bed
+ else
+ touch ref_filter.bed
+ fi
+
+ # VEP
+ if [[ "${params.ref_ver}" == "hg19" ]]; then
+ vep_gnomad_name="gnomad.genomes.r2.1.sites.grch37_noVEP.vcf.gz"
+ vep_cadd_name="hg19_whole_genome_SNVs.tsv.gz"
+ vep_dbnsfp_name="dbNSFP4.3a_grch37.gz"
+ vep_revel_name="new_tabbed_revel.tsv.gz"
+ else
+ vep_gnomad_name="gnomad.genomes.GRCh38.v3.1.2.sites.vcf.gz"
+ vep_cadd_name="hg38_whole_genome_SNV.tsv.gz"
+ vep_dbnsfp_name="dbNSFP4.1a_grch38.gz"
+ vep_revel_name="new_tabbed_revel_grch38.tsv.gz"
+ fi
+
+ ln -s "./data_only_vep/vep/${params.ref_ver}/\${vep_gnomad_name}" .
+ ln -s "./data_only_vep/vep/${params.ref_ver}/\${vep_cadd_name}" .
+ ln -s "./data_only_vep/vep/${params.ref_ver}/\${vep_dbnsfp_name}" .
+ ln -s "./data_only_vep/vep/${params.ref_ver}/\${vep_revel_name}" .
+
+ """
+}
+
+process BUILD_REFERENCE_INDEX {
+ container "broadinstitute/gatk"
+ storeDir "${params.storedir}/general/reference_index/"
+
+ output:
+ tuple(
+ path("final_${params.ref_ver}.fa"),
+ path("final_${params.ref_ver}.fa.fai"),
+ path("final_${params.ref_ver}.dict"),
+ emit: fasta_tuple
+ )
+
+ script:
+ """
+ wget --quiet http://hgdownload.soe.ucsc.edu/goldenPath/${params.ref_ver}/bigZips/${params.ref_ver}.fa.gz
+ gunzip ${params.ref_ver}.fa.gz
+ sed 's/>chr/>/g' ${params.ref_ver}.fa > num_prefix_${params.ref_ver}.fa
+ samtools faidx num_prefix_${params.ref_ver}.fa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M > final_${params.ref_ver}.fa
+ samtools faidx final_${params.ref_ver}.fa
+ gatk CreateSequenceDictionary -R final_${params.ref_ver}.fa
+ """
+}
diff --git a/modules/local/singleton/main.nf b/modules/local/singleton/main.nf
new file mode 100644
index 0000000..2727458
--- /dev/null
+++ b/modules/local/singleton/main.nf
@@ -0,0 +1,612 @@
+import groovy.json.JsonOutput
+
+process GENERATE_MANIFEST_JSON {
+ publishDir "${params.outdir}/${params.run_id}/manifest/", mode: 'copy'
+
+ input:
+ path ref_dir
+
+ output:
+ path "manifest.json"
+ path "git.hash"
+ path "data_dependencies_tree.hash"
+ path "data_dependencies_tree.txt"
+
+ script:
+
+ """
+cat <<EOF >> manifest.json
+${JsonOutput.prettyPrint(JsonOutput.toJson(workflow.manifest))}
+EOF
+
+ (cd $ref_dir && find . -type f -printf '%p\\t%s\\n') \\
+ | sort \\
+ | tee data_dependencies_tree.txt \\
+ | md5sum >> data_dependencies_tree.hash
+
+ git -C ${projectDir} rev-parse HEAD >> git.hash
+ """
+}
+
+process PHENOPACKET_TO_VARIANTS_AND_HPOS {
+ input:
+ path phenopacket_json
+
+ output:
+ path "${params.run_id}.variants.txt", emit: variants
+ path "${params.run_id}.hpos.txt", emit: hpo
+
+ """
+ jq -r '.phenotypicFeatures[] | select(.excluded != true) | .type.id' $phenopacket_json > ${params.run_id}.hpos.txt
+ jq -r '.interpretations[].diagnosis.genomicInterpretations[].variantInterpretation.variationDescriptor.vcfRecord | "\\(.chrom | sub("^chr";""))_\\(.pos)_\\(.ref)_\\(.alt)"' $phenopacket_json > ${params.run_id}.variants.unsorted.txt
+ sort -t'_' -k1,1V -k2,2n -k3,3 -k4,4 ${params.run_id}.variants.unsorted.txt > ${params.run_id}.variants.txt
+ """
+}
+
+process GENERATE_INPUT_VARIANTS {
+ input:
+ val input_variant
+
+ output:
+ path "${params.run_id}.variants.txt", emit: variants
+
+ """
+ echo $input_variant > ${params.run_id}.variants.txt
+ """
+}
+
+process GENERATE_INPUT_VCF {
+ input:
+ val variants
+
+ output:
+ path "input.vcf.gz", emit: vcf
+
+ """
+ generate_input_vcf.py $variants
+ bgzip input.vcf
+ """
+}
+
+process VALIDATE_VCF {
+ container 'quay.io/biocontainers/vcftools:0.1.16--pl5321hdcf5f25_11'
+
+ input:
+ path vcf
+
+ output:
+ path "$vcf", emit: vcf
+
+ """
+ vcf-validator $vcf
+ """
+}
+
+// Process to handle the VCF file
+process NORMALIZE_VCF {
+ input:
+ path vcf
+
+ output:
+ path "input.vcf.gz", emit: vcf
+ path "input.vcf.gz.tbi", emit: tbi
+
+ script:
+ """
+ # Determine the file type
+ INPUT_VCF_TYPE="\$(file -b $vcf)"
+
+ if echo "\${INPUT_VCF_TYPE}" | grep -q 'symbolic link to'; then
+ SYM_LINK="\$(readlink -f $vcf)"
+ INPUT_VCF_TYPE="\$(file -b \${SYM_LINK})"
+ fi
+
+ if echo "\${INPUT_VCF_TYPE}" | grep -q 'BGZF'; then
+ echo "The file is in BGZF format, ready for tabix."
+ cp $vcf input.vcf.gz
+ elif echo "\${INPUT_VCF_TYPE}" | grep -q 'gzip compressed data'; then
+ echo "GZIP format detected, converting to BGZF."
+ gunzip -c $vcf | bgzip > input.vcf.gz
+ elif echo "\${INPUT_VCF_TYPE}" | grep -q -e 'ASCII text' -e 'Unicode text' -e 'Variant Call Format'; then
+ echo "Plain VCF file detected, compressing and indexing."
+ bgzip -c $vcf > input.vcf.gz
+ else
+ echo "The file $vcf does not exist or is not a recognized format."
+ exit 1
+ fi
+
+ tabix -p vcf input.vcf.gz
+ """
+}
+
+process FILTER_BED {
+ input:
+ path vcf
+ path tbi
+ path ref_filter_bed
+
+ output:
+ path "${params.run_id}.recode.vcf.gz", emit: vcf
+ path "${params.run_id}.recode.vcf.gz.tbi", emit: tbi
+
+ script:
+ """
+ if [[ -s ${ref_filter_bed} ]]; then # Check if the bed filter file is not empty.
+ awk '{gsub(/^chr/, ""); print}' ${ref_filter_bed} > bed
+ bcftools filter --regions-file bed ${vcf} -Oz -o "${params.run_id}.recode.vcf.gz"
+ tabix -p vcf "${params.run_id}.recode.vcf.gz"
+ else
+ cp ${vcf} "${params.run_id}.recode.vcf.gz"
+ cp ${tbi} "${params.run_id}.recode.vcf.gz.tbi"
+ fi
+ """
+}
+
+process CONVERT_GVCF {
+ container "broadinstitute/gatk"
+
+ input:
+ path vcf
+ path tbi
+ path fasta
+ path fasta_index
+ path fasta_dict
+ path chrmap_file
+
+ output:
+ path "${params.run_id}.nog.vcf.gz", emit: vcf
+ path "${params.run_id}.nog.vcf.gz.tbi", emit: tbi
+
+
+ script:
+ """
+ # Define input/output paths and reference genome
+ reference_genome="$fasta"
+ input_file="$vcf"
+ output_file="${params.run_id}.nog.vcf.gz"
+
+    # Step 0: Check for <NON_REF>
+    zcat "\$input_file" | head -n 10000 | grep -q "<NON_REF>" || { echo "It's not gVCF"; cp $vcf ${params.run_id}.nog.vcf.gz; cp $tbi ${params.run_id}.nog.vcf.gz.tbi; exit 0; }
+
+ # Step 1: Annotate and remove ID field
+ echo "Step 1: Annotating and removing ID field..."
+ bcftools annotate --rename-chrs "$chrmap_file" -x ID "\$input_file" -Oz -o step1.vcf.gz
+ tabix step1.vcf.gz
+ bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y step1.vcf.gz -Oz -o step1_1.vcf.gz
+
+ # Step 2: Sort the VCF file
+ echo "Step 2: Sorting the VCF file..."
+ bcftools sort step1_1.vcf.gz -Oz -o step2.vcf.gz
+
+ # Step 2.1: Index step2.vcf.gz with tabix
+ echo "Indexing step2.vcf.gz with tabix..."
+ tabix -p vcf step2.vcf.gz
+
+ # Step 3: Genotype GVCFs with GATK
+ echo "Step 3: Running GenotypeGVCFs with GATK..."
+ gatk GenotypeGVCFs -R "\$reference_genome" -V step2.vcf.gz -O step3.vcf.gz --allow-old-rms-mapping-quality-annotation-data
+
+ # Step 4: Filter based on quality with GATK
+ echo "Step 4: Running VariantFiltration with GATK..."
+ gatk VariantFiltration -V step3.vcf.gz -O step4.vcf.gz --filter-expression "QUAL < 30.0" --filter-name "LowQual"
+
+ # Rename the final output file
+ echo "Renaming the final output file..."
+ mv step4.vcf.gz "\$output_file"
+
+ # Index the final output using tabix
+ echo "Indexing the final output with tabix..."
+ tabix -p vcf "\$output_file"
+
+ # Display the number of non-comment lines (ignore lines starting with #)
+ lines=\$(zcat "\$output_file" | grep -v '^#' | wc -l)
+ echo "File: \$output_file has \$lines lines (excluding comment lines)."
+
+ # Clean up intermediate files
+ echo "Cleaning up intermediate files..."
+ rm -f step*.vcf.gz*
+ """
+}
+
+
+process FILTER_UNPASSED {
+ input:
+ path vcf
+ path tbi
+ path chrmap
+
+ output:
+ path "${params.run_id}.filt.vcf.gz", emit: vcf
+ path "${params.run_id}.filt.vcf.gz.tbi", emit: tbi
+
+ script:
+ """
+ # Annotate with new chromosomes and preserve original coordinates in ID
+ bcftools annotate --rename-chrs $chrmap -x ID $vcf -Oz -o ${params.run_id}-annot
+
+ # Annotate with new IDs based on CHROM, POS, REF, ALT
+ bcftools annotate --set-id +'%CHROM\\_%POS\\_%REF\\_%FIRST_ALT' ${params.run_id}-annot -Oz -o ${params.run_id}-add-id.vcf.gz
+
+ #run quality filters
+ bcftools filter ${params.run_id}-add-id.vcf.gz -i'FILTER == "PASS"' -Oz -o ${params.run_id}.filt.vcf.gz
+
+ #check number of variants left
+ variant_count=\$(bcftools view -H ${params.run_id}.filt.vcf.gz | wc -l)
+ if [ "\${variant_count}" -gt 0 ]; then
+ echo "Quality filtering completed successfully. Variants passing the filters: \${variant_count}"
+ else
+ echo "The VCF file doesn't have FILTER annotation, or all variants filtered."
+ echo "Pipeline will proceed with unfiltered VCF file."
+ cp ${params.run_id}-add-id.vcf.gz ${params.run_id}.filt.vcf.gz
+ fi
+ tabix -p vcf ${params.run_id}.filt.vcf.gz
+ """
+}
+
+
+process FILTER_MITO_AND_UNKOWN_CHR {
+ publishDir "${params.outdir}/${params.run_id}/vcf/", mode: 'copy'
+ input:
+ path vcf
+ path tbi
+
+ output:
+ path "${params.run_id}.filt.rmMT.vcf.gz", emit: vcf
+ path "${params.run_id}.filt.rmMT.vcf.gz.tbi", emit: tbi
+
+
+ script:
+ """
+ bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y $vcf -o ${params.run_id}.filt.rmMT.vcf
+
+ bgzip ${params.run_id}.filt.rmMT.vcf
+ tabix ${params.run_id}.filt.rmMT.vcf.gz
+ """
+}
+
+
+process VCF_TO_VARIANTS {
+ input:
+ path vcf
+
+ output:
+ path "${params.run_id}-var-filt.txt", emit: var
+
+ script:
+ """
+ zcat $vcf | awk 'substr(\$0, 1, 1) != "#"' | cut -f1,2,4,5 | sed 's/\t/:/g' > ${params.run_id}-var.txt
+ cat ${params.run_id}-var.txt | sed 's/chr//g' | sort -u > ${params.run_id}-var-filt.txt
+ """
+}
+
+
+process VARIANTS_TO_ENSEMBL {
+ input:
+ path var
+ path ref_loc
+
+ output:
+ path "${params.run_id}-ensmbl.txt"
+
+ script:
+ """
+ location_to_gene.py $var $ref_loc | \\
+ sed 's/:/\\t/g' | sed 's/X\\t/23\\t/g' | sed 's/Y\\t/24\\t/g' | \\
+ sed 's/MT\\t/25\\t/g' > ${params.run_id}-ensmbl.txt
+ """
+}
+
+
+process ENSEMBL_TO_GENESYM {
+ input:
+ path ensmbl
+ path ref_to_sym
+
+ output:
+ path "${params.run_id}-gene.txt", emit: gene
+
+ script:
+ """
+ # Generate sorted gene to symbol file
+ sort -t \$'\t' -k2,2 $ref_to_sym > sorted_ref_to_sym.txt
+
+ cat $ensmbl | sort -k5,5 | join -1 5 -2 1 - $ref_to_sym | sed 's/ /\\t/g' | cut -f2- > genesym.txt
+ cat genesym.txt | cut -f5 | sort -u | join -t\$'\\t' -1 1 -2 2 - sorted_ref_to_sym.txt | cut -f2 | sort -u > ${params.run_id}-gene.txt
+ """
+}
+
+
+process GENESYM_TO_PHRANK {
+ publishDir "${params.outdir}/${params.run_id}/phrank/", mode: 'copy'
+
+ input:
+ path gene
+ path hpo
+ path dagfile
+ path disease_annotation
+ path gene_annotation
+ path disease_gene
+
+ output:
+ path "${params.run_id}.phrank.txt", emit: phrank
+
+ script:
+ """
+ run_phrank.py \\
+ $gene $hpo $dagfile $disease_annotation $gene_annotation $disease_gene > ${params.run_id}.phrank.txt
+ """
+}
+
+
+process HPO_SIM {
+ container 'zhandongliulab/aim-lite-r'
+
+ input:
+ path hpo, name: "input.hpos.txt"
+ path omim_hgmd_phen
+ path omim_obo
+ path omim_genemap2
+ path omim_pheno
+
+ output:
+ path "${params.run_id}-cz", emit: hgmd_sim
+ path "${params.run_id}-dx", emit: omim_sim
+
+ script:
+ """
+ cp $hpo input.copied.hpos.txt
+ if [[ -z \$(egrep 'HP:[0-9]{7}' $hpo) ]] ; then
+ echo "HP:0000001" > input.copied.hpos.txt
+ fi
+
+ phenoSim.R input.copied.hpos.txt $omim_hgmd_phen $omim_obo $omim_genemap2 $omim_pheno \\
+ ${params.run_id}-cz ${params.run_id}-dx
+ """
+
+}
+
+process FILTER_PROBAND {
+ publishDir "${params.outdir}/${params.run_id}/vcf/", mode: 'copy'
+
+ input:
+ path vcf
+ path tbi
+ path ref_gnomad_genome
+ path ref_gnomad_genome_idx
+ path ref_gnomad_exome
+ path ref_gnomad_exome_idx
+
+ output:
+ path "${params.run_id}.filt.rmBL.vcf.gz", emit: vcf
+
+ script:
+ """
+ mkdir -m777 isec_tmp1
+ bcftools isec -p isec_tmp1 -w 1 -Oz \\
+ $vcf $ref_gnomad_genome
+ # tabix isec_tmp1/0000.vcf.gz
+
+ mkdir -m777 isec_tmp2
+ bcftools isec -p isec_tmp2 -w 1 -Oz \\
+ $vcf $ref_gnomad_exome
+ # tabix isec_tmp2/0000.vcf.gz
+
+ mkdir -m777 isec_tmp3
+ bcftools isec -p isec_tmp3 -Ov \\
+ isec_tmp1/0000.vcf.gz isec_tmp2/0000.vcf.gz
+
+ mv isec_tmp3/0002.vcf ${params.run_id}.filt.rmBL.vcf
+ bgzip ${params.run_id}.filt.rmBL.vcf
+ """
+
+}
+
+process SPLIT_VCF_BY_CHROMOSOME {
+ input:
+ path vcf
+
+ output:
+ path "chr*.vcf.gz", emit: chr_vcfs
+
+ script:
+ """
+ # Get the list of chromosomes from the VCF file
+
+ bcftools index ${vcf}
+
+ bcftools query -f '%CHROM\n' ${vcf} | sort | uniq > chrom_list.txt
+ # Split the VCF file by chromosome
+ while read chrom; do
+ bcftools view -r \${chrom} ${vcf} -Oz -o chr\${chrom}.vcf.gz
+ done < chrom_list.txt
+ """
+}
+
+process ANNOTATE_BY_VEP {
+ container 'ensemblorg/ensembl-vep:release_104.3'
+ tag "${vcf.simpleName}"
+ publishDir "${params.outdir}/${params.run_id}/vep/", mode: "copy"
+
+ input:
+ path vcf
+ path vep_dir_cache
+ path vep_dir_plugins
+ path vep_custom_gnomad
+ path vep_custom_clinvar
+ path vep_custom_hgmd
+ path vep_plugin_revel
+ path vep_plugin_spliceai_snv
+ path vep_plugin_spliceai_indel
+ path vep_plugin_cadd
+ path vep_plugin_dbnsfp
+ path vep_idx
+
+ output:
+ path "${vcf.baseName}-vep.txt", emit: vep_output
+
+ script:
+ def ref_assembly = (params.ref_ver == 'hg38') ? 'GRCh38' : 'GRCh37'
+ """
+ /opt/vep/src/ensembl-vep/vep \\
+ --dir_cache ${vep_dir_cache} \\
+ --dir_plugins ${vep_dir_plugins} \\
+ --fork ${task.cpus} --everything --format vcf \\
+ --cache --offline --tab --force_overwrite \\
+ --species homo_sapiens --assembly ${ref_assembly} \\
+ --custom ${vep_custom_gnomad},gnomADg,vcf,exact,0,AF,AF_popmax,controls_nhomalt \\
+ --custom ${vep_custom_clinvar},clinvar,vcf,exact,0,CLNREVSTAT,CLNSIG,CLNSIGCONF \\
+ --custom ${vep_custom_hgmd},hgmd,vcf,exact,0,CLASS,GENE,PHEN,RANKSCORE \\
+ --af_gnomad --plugin REVEL,${vep_plugin_revel},ALL \\
+ --plugin SpliceAI,snv=${vep_plugin_spliceai_snv},indel=${vep_plugin_spliceai_indel},cutoff=0.5 \\
+ --plugin CADD,${vep_plugin_cadd},ALL \\
+ --plugin dbNSFP,${vep_plugin_dbnsfp},ALL \\
+ --individual all --output_file ${vcf.baseName}-vep.txt --input_file $vcf \\
+ --buffer_size 50
+ """
+}
+
+process ANNOTATE_BY_MODULES {
+ tag "${vep.simpleName}"
+
+ input:
+ path vep
+ path hgmd_sim, stageAs: "hgmd_sim.tsv"
+ path omim_sim, stageAs: "omim_sim.tsv"
+ path ref_annot_dir
+
+ output:
+ path "${vep.baseName}_scores.csv", emit: scores
+
+ script:
+ """
+ feature.py \\
+ ${params.impact_filter ? "-enableLIT" : ""} \\
+ -diseaseInh AD \\
+ -modules curate,conserve \\
+ -inFileType vepAnnotTab \\
+ -patientFileType one \\
+ -patientHPOsimiOMIM ${omim_sim} \\
+ -patientHPOsimiHGMD ${hgmd_sim} \\
+ -varFile ${vep} \\
+ -genomeRef ${params.ref_ver}
+
+ mv scores.csv ${vep.baseName}_scores.csv
+ """
+}
+
+process ANNOTATE_TIER {
+ container 'zhandongliulab/aim-lite-r'
+ tag "${scores.simpleName}"
+
+ input:
+ path scores
+ path phrank
+ path ref_annot_dir
+ path ref_var_tier_dir
+
+ output:
+ path "${scores.simpleName}_Tier.v2.tsv", emit: tier
+
+ script:
+ """
+ mv $scores scores.csv
+ VarTierDiseaseDBFalse.R ${params.ref_ver}
+ mv Tier.v2.tsv ${scores.simpleName}_Tier.v2.tsv
+ """
+}
+
+process JOIN_PHRANK {
+ tag "${scores.simpleName}"
+
+ input:
+ path scores
+ path phrank
+ path ref_merge_expand_dir
+
+ output:
+ path "${scores.simpleName}_scores.txt.gz", emit: compressed_scores
+
+ script:
+ """
+ mv $scores scores.csv
+ generate_new_matrix_2.py ${params.run_id} ${params.ref_ver}
+ mv scores.txt.gz ${scores.simpleName}_scores.txt.gz
+ """
+}
+
+process MERGE_SCORES_BY_CHROMOSOME {
+ publishDir "${params.outdir}/${params.run_id}/merged", mode: "copy"
+
+ input:
+ path phrank
+ path tier, stageAs: "*_Tier.v2.tsv"
+ path compressed_scores, stageAs: "*_scores.txt.gz"
+
+ path ref_annot_dir
+ path ref_mod5_diffusion_dir
+ path ref_merge_expand_dir
+
+ output:
+ path "${params.run_id}.matrix.txt", emit: merged_matrix
+ path "scores.txt.gz", emit: merged_compressed_scores
+
+ script:
+ """
+ # Merge matrices
+ awk 'FNR==1 && NR!=1{next;}{print}' ${tier} > Tier.v2.tsv
+
+ # Merge compressed scores
+ header_written=false
+
+ for file in ${compressed_scores}; do
+ if [ "\$header_written" = false ]; then
+ # Write the first file's content, including the header
+ zcat \${file} | gzip > scores.txt.gz
+ header_written=true
+ else
+ # Skip the header and append the rest of the data
+ zcat \${file} | tail -n +2 | gzip >> scores.txt.gz
+ fi
+ done
+
+ post_processing.py ${params.run_id} ${params.ref_ver}
+ """
+}
+
+process PREDICTION {
+ publishDir "${params.outdir}/${params.run_id}/prediction/", mode: "copy"
+
+ input:
+ path merged_matrix
+ path merged_compressed_scores
+ path ref_model_inputs_dir
+
+ output:
+ path "conf_4Model/${params.run_id}_default_predictions.csv", emit: "default_predictions"
+ path "conf_4Model/${params.run_id}_recessive_predictions.csv", optional: true
+ path "conf_4Model/${params.run_id}_nd_predictions.csv", optional: true
+ path "conf_4Model/${params.run_id}_nd_recessive_predictions.csv", optional: true
+ path "final_matrix_expanded/*.csv.gz"
+ path "shap_outputs"
+
+ script:
+ """
+ mkdir final_matrix_expanded
+ mkdir conf_4Model
+
+ run_final.py \
+ $ref_model_inputs_dir/default/final_model.job \
+ $ref_model_inputs_dir/default/features.csv \
+ $merged_matrix \
+ ${params.run_id}.default_prediction.csv
+
+ # Generate final matrix expanded
+ merge_rm.py \
+ ${params.run_id}.default_prediction.csv \
+ $merged_compressed_scores \
+ final_matrix_expanded/${params.run_id}.expanded.csv.gz
+
+ # Generate conf_4Model
+ extraModel_main.py -id ${params.run_id}
+ """
+}
diff --git a/modules/local/trio/main.nf b/modules/local/trio/main.nf
new file mode 100644
index 0000000..e3631a5
--- /dev/null
+++ b/modules/local/trio/main.nf
@@ -0,0 +1,180 @@
+process VCF_PRE_PROCESS_TRIO {
+ container "broadinstitute/gatk"
+ publishDir "${params.outdir}/${params.run_id}/vcf_trio", mode: 'copy'
+
+ input:
+ path vcf
+ path ped
+ path fasta
+ path fasta_index
+ path fasta_dict
+ path chrmap_file
+
+ output:
+ path "${params.run_id}_fixed.vcf.gz", emit: "vcf"
+ path "${params.run_id}.inheritance.txt", emit: "inheritance"
+
+ script:
+ """
+ # Extract proband, maternal, and paternal id from ped file, sample IDs in ped should be the same as in the vcf file
+
+ # 1) Make sure there are exactly 3 non-empty lines
+ row_count=\$(awk 'NF' "$ped" | wc -l)
+ if [ "\$row_count" -ne 3 ]; then
+ echo "ERROR: Expected 3 rows in '$ped' (found \$row_count)" >&2
+ exit 1
+ fi
+
+ # 2) Find the proband row (both parent fields != 0)
+ # PED format: 1.FAMID 2.INDIV 3.PID 4.MID 5.SEX 6.PHENOTYPE
+ proband_line=\$(awk '\$3 != "0" && \$4 != "0" { print; exit }' "$ped")
+ if [ -z "\$proband_line" ]; then
+ echo "ERROR: No row with both paternal (field 3) and maternal (field 4) IDs found" >&2
+ exit 1
+ fi
+
+ # 3) Parse out the IDs
+ # We only care about fields 2,3,4 here.
+ read -r _ SAMPLE_P SAMPLE_F SAMPLE_M _ <<< "\$proband_line"
+
+ echo "Normalize joint VCF file, split multiallelics rows"
+ bcftools norm --multiallelics -both \
+ -Oz -o ./${params.run_id}.tmp.vcf.gz \
+ $vcf
+ tabix -p vcf ./${params.run_id}.tmp.vcf.gz
+
+ bcftools norm --rm-dup none \
+ -Oz -o ./${params.run_id}.joint.vcf.gz \
+ ./${params.run_id}.tmp.vcf.gz
+
+ tabix -p vcf ./${params.run_id}.joint.vcf.gz
+ zcat ./${params.run_id}.joint.vcf.gz | grep '^#CHROM' > ./vcfheaders.txt #* extract header from vcf file
+
+ # create ./sample_ids.txt
+ trio_rename_VCFheader.py \
+ \$SAMPLE_P \
+ \$SAMPLE_F \
+ \$SAMPLE_M \
+ ./vcfheaders.txt
+
+ bcftools reheader --samples ./sample_ids.txt \
+ -o ./${params.run_id}.joint2.vcf.gz \
+ ./${params.run_id}.joint.vcf.gz
+
+ mv ./${params.run_id}.joint2.vcf.gz ./${params.run_id}.joint.vcf.gz
+ tabix -f -p vcf ./${params.run_id}.joint.vcf.gz
+
+ bcftools annotate --rename-chrs "$chrmap_file" -x ID ./${params.run_id}.joint.vcf.gz -Oz -o step1.vcf.gz
+ tabix step1.vcf.gz
+ bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y step1.vcf.gz -Oz -o step1_1.vcf.gz
+
+ mv step1_1.vcf.gz ./${params.run_id}.joint.vcf.gz
+
+ # Index the final output using tabix
+ echo "Indexing the final output with tabix..."
+ tabix -f -p vcf ./${params.run_id}.joint.vcf.gz
+
+ # De Novo Calling using VariantAnnotator
+ #* GATK 4.2.6.0
+ echo "GATK De Novo Calling..."
+ gatk VariantAnnotator \
+ -A PossibleDeNovo \
+ -R ${fasta} \
+ --pedigree $ped \
+ -V ./${params.run_id}.joint.vcf.gz \
+ -O ./${params.run_id}.DeNovo.g.vcf
+
+ echo "Check variants inheritance..."
+
+ # Extract patient VCF from joint VCF
+ #=======================================================
+ # TODOs
+ #=======================================================
+
+ echo "Extract proband VCF..."
+ #* extract the proband variants from the joint vcf file
+ bcftools view -c1 -Oz -o./${params.run_id}.vcf.gz -s \${SAMPLE_P} ${params.run_id}.joint.vcf.gz
+
+ echo "Fix VCF formatting..."
+ cat ${projectDir}/resources/trio_pipeline/acceptable_header.txt | grep -v '#CHROM' > header.txt
+ zcat ./${params.run_id}.vcf.gz | grep '#CHROM' | head -1 >> header.txt
+ zcat ./${params.run_id}.vcf.gz | grep -v '#' | awk '
+ BEGIN {FS = "\\t"} {
+ \$7 = "PASS";
+ \$8 = ".";
+ gsub(":.*", "", \$9);
+ gsub(":.*", "", \$10);
+ gsub("\\|", "/", \$10);
+ print \$1 "\\t" \$2 "\\t" \$3 "\\t" \$4 "\\t" \$5 "\\t" \$6 "\\t" \$7 "\\t" \$8 "\\t" \$9 "\\t" \$10
+ }' > body.txt
+ cat header.txt body.txt | bgzip > ./${params.run_id}_fixed.vcf.gz
+
+ # create ${params.run_id}.inheritance.txt and ${params.run_id}.summary.txt
+ trio_check_inheritance.GATK.py \
+ ${params.run_id} \
+ ./${params.run_id}.DeNovo.g.vcf \
+ ${params.ref_ver} \
+ \$SAMPLE_P \
+ \$SAMPLE_F \
+ \$SAMPLE_M
+
+
+ """
+}
+
+process GENERATE_TRIO_FEATURES {
+ publishDir "${params.outdir}/${params.run_id}/features_trio", mode: 'copy'
+
+ input:
+ path merged_compressed_scores
+ path merged_matrix
+ path inheritance
+
+ output:
+ path "${params.run_id}.triomatrix.tsv", emit: triomatrix
+
+ script:
+ """
+ trio_merge_inheritance.py \
+ $merged_compressed_scores \
+ $merged_matrix \
+ $inheritance \
+ ./${params.run_id}.triomatrix.tsv
+ """
+}
+
+process PREDICTION_TRIO {
+ container "zhandongliulab/aim-lite-oldpython"
+ publishDir "${params.outdir}/${params.run_id}/prediction_trio", mode: 'copy'
+
+ input:
+ path merged_compressed_scores
+ path triomatrix
+
+ output:
+ path "*.trio.expanded.csv.gz"
+ path "*.trio.prediction.csv"
+
+ script:
+ """
+ run_final.py \
+ ${projectDir}/resources/trio_pipeline/rf_trio_base.job \
+ ${projectDir}/resources/trio_pipeline/features.csv \
+ $triomatrix \
+ ./${params.run_id}.trio.csv
+ run_final.py \
+ ${projectDir}/resources/trio_pipeline/rf_trio_NDG.job \
+ ${projectDir}/resources/trio_pipeline/features_NDG.csv \
+ $triomatrix \
+ ./${params.run_id}.trio.NDG.csv
+
+
+ # Generate ${params.run_id}.trio.prediction.csv
+ trio_merge_rm.py ${params.run_id}
+
+ merge_rm.py \
+ ${params.run_id}.trio.prediction.csv \
+ $merged_compressed_scores \
+ ${params.run_id}.trio.expanded.csv.gz
+ """
+}
diff --git a/modules/local/utils/main.nf b/modules/local/utils/main.nf
new file mode 100644
index 0000000..1074741
--- /dev/null
+++ b/modules/local/utils/main.nf
@@ -0,0 +1,8 @@
+def showVersion() {
+ if (!params.version) {
+ return
+ }
+
+ println workflow.manifest.version
+ exit 0
+}
diff --git a/nextflow.config b/nextflow.config
index a70ecc0..254ad6b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,92 +1,74 @@
params {
- run_id = 1
+ input_vcf = null
+ input_phenopacket = null
+ input_variant = null
+ input_ped = null
+ input_hpo = null
+ outdir = "./out"
+ storedir = "./store"
+
+ run_id = "run1"
ref_ver = "hg19"
- ref_dir = " "
- bed_filter = " "
+ ref_dir = null
+ s3_bucket_data_name = "aim-data-dependencies-2.3-public"
+ bed_filter = null
exome_filter = false
+ impact_filter = false
help = false
version = false
+}
- ref_assembly = params.ref_ver == "hg19" ? "grch37" : "grch38"
-
- // for data dependency
- chrmap = "${ref_dir}/bcf_annotate/chrmap.txt"
-
- ref_loc = "${ref_dir}/phrank/${ref_ver}/${ref_assembly}_symbol_to_location.txt"
- ref_to_sym = "${ref_dir}/phrank/${ref_ver}/ensembl_to_symbol.txt"
- ref_sorted_sym = "${ref_dir}/phrank/${ref_ver}/gene_to_symbol_sorted.txt"
+includeConfig 'conf/base.config'
- // FILTER BED
- // EXONIC FILTER BED
- ref_exonic_filter_bed = "${ref_dir}/filter_exonic/${ref_ver}.bed"
- if (params.bed_filter != "") {
- ref_filter_bed = bed_filter
- } else if (params.exome_filter) {
- ref_filter_bed = ref_exonic_filter_bed
- } else {
- ref_filter_bed = "/dev/null"
+profiles {
+ debug {
+ timeline.enabled = true
+ trace.enabled = true
+ report.enabled = true
+ }
+ docker {
+ process.beforeScript = 'echo "Running Docker Profile."'
+ singularity.enabled = false
+ docker.enabled = true
+ }
+ singularity {
+ process.beforeScript = 'echo "Running Singularity Profile."'
+ docker.enabled = false
+ singularity.enabled = true
}
-
- // for phrank
- phrank_dagfile = "${ref_dir}/phrank/${ref_ver}/child_to_parent.txt"
- phrank_disease_annotation = "${ref_dir}/phrank/${ref_ver}/disease_to_pheno.txt"
- phrank_gene_annotation = "${ref_dir}/phrank/${ref_ver}/gene_to_phenotype.txt"
- phrank_disease_gene = "${ref_dir}/phrank/${ref_ver}/disease_to_gene.txt"
-
- // OMIM and HPO
- omim_hgmd_phen = "${ref_dir}/omim_annotate/${ref_ver}/HGMD_phen.tsv"
- omim_obo = "${ref_dir}/omim_annotate/hp.obo"
- omim_genemap2 = "${ref_dir}/omim_annotate/${ref_ver}/genemap2_v2022.rds"
- omim_pheno = "${ref_dir}/omim_annotate/${ref_ver}/HPO_OMIM.tsv"
-
- // GNOMAD VCF
- ref_gnomad_genome = "${ref_dir}/filter_vep/${ref_ver}/gnomad.${ref_ver}.blacklist.genomes.vcf.gz"
- ref_gnomad_genome_idx = "${ref_gnomad_genome}.tbi"
- ref_gnomad_exome = "${ref_dir}/filter_vep/${ref_ver}/gnomad.${ref_ver}.blacklist.exomes.vcf.gz"
- ref_gnomad_exome_idx = "${ref_gnomad_exome}.tbi"
-
- // VEP
- vep_dbnsfp_name = params.ref_ver == "hg19" ? "dbNSFP4.3a_grch37.gz" : "dbNSFP4.1a_grch38.gz"
- vep_gnomad_name = params.ref_ver == "hg19" ? "gnomad.genomes.r2.1.sites.grch37_noVEP.vcf.gz" : "gnomad.genomes.GRCh38.v3.1.2.sites.vcf.gz"
- vep_cadd_name = params.ref_ver == "hg19" ? "hg19_whole_genome_SNVs.tsv.gz" : "hg38_whole_genome_SNV.tsv.gz"
-
- vep_dir_cache = "${ref_dir}/vep/${ref_ver}/"
- vep_dir_plugins = "${ref_dir}/vep/${ref_ver}/Plugins/"
- vep_custom_gnomad = "${ref_dir}/vep/${ref_ver}/${vep_gnomad_name}"
- vep_custom_clinvar = "${ref_dir}/vep/${ref_ver}/clinvar_20220730.vcf.gz"
- vep_custom_hgmd = "${ref_dir}/vep/${ref_ver}/HGMD_Pro_2022.2_${ref_ver}.vcf.gz"
- vep_plugin_revel = "${ref_dir}/vep/${ref_ver}/new_tabbed_revel_${ref_assembly}.tsv.gz" // changed for hg19
- vep_plugin_spliceai_snv = "${ref_dir}/vep/${ref_ver}/spliceai_scores.masked.snv.${ref_ver}.vcf.gz"
- vep_plugin_spliceai_indel = "${ref_dir}/vep/${ref_ver}/spliceai_scores.masked.indel.${ref_ver}.vcf.gz"
- vep_plugin_cadd = "${ref_dir}/vep/${ref_ver}/${vep_cadd_name}" // changed for hg19
- vep_plugin_dbnsfp = "${ref_dir}/vep/${ref_ver}/${vep_dbnsfp_name}"
- vep_idx = "${ref_dir}/vep/${ref_ver}/*.tbi"
-
- // Documentation
- usage_file = "${projectDir}/docs/source/nf_usage.txt"
-
- script_chunking = "${projectDir}/scripts/split_chunks.py"
- script_annot = "${projectDir}/scripts/annotation/*.py"
-
- ref_annot_dir = "${ref_dir}/annotate"
- ref_var_tier_dir = "${ref_dir}/var_tier"
- ref_merge_expand_dir = "${ref_dir}/merge_expand"
- ref_mod5_diffusion_dir = "${ref_dir}/mod5_diffusion"
- ref_predict_new_dir = "${ref_dir}/predict_new"
- ref_model_inputs_dir = "${ref_dir}/model_inputs"
-
}
-docker.enabled = true
+manifest {
+ author = 'LiuzLab'
+ name = 'LiuzLab/AI_MARRVEL'
+ version = '1.1.0'
-process {
- cpus = 2
- memory = { 25.GB * task.attempt }
- errorStrategy = { task.exitStatus in [143,137,104,134,139,140] ? 'retry' : 'finish' }
- maxRetries = 5
-
- container = "chaozhongliu/aim-lite:latest"
- cache = "lenient"
+ description = """AI-MARRVEL (AIM) is an AI system for rare genetic disease diagnosis."""
+ homePage = 'https://github.com/LiuzLab/AI_MARRVEL'
+ defaultBranch = 'main'
+ nextflowVersion = '>=23.10.1'
+ doi = '10.1056/AIoa2300009'
}
+plugins {
+ id 'nf-schema@2.3.0'
+}
+validation {
+ help {
+ enabled = true
+ command = """
+ nextflow run /path/to/AI_MARRVEL/main.nf \\
+ -profile \\
+ --ref_dir /path/to/dependencies/ \\
+ --input_vcf /path/to/input.vcf \\
+ --input_hpo /path/to/input.hpos.txt \\
+ --outdir /path/to/output \\
+ --storedir /path/to/store \\
+ --run_id \\
+ --ref_ver
+ """
+ fullParameter = "help_full"
+ showHiddenParameter = "show_hidden"
+ }
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
new file mode 100644
index 0000000..a98eb45
--- /dev/null
+++ b/nextflow_schema.json
@@ -0,0 +1,113 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://raw.githubusercontent.com/LiuzLab/AI_MARRVEL/main/nextflow_schema.json",
+ "title": "LiuzLab/AI_MARRVEL pipeline parameters",
+ "description": "AI-MARRVEL (AIM) is an AI system for rare genetic disease diagnosis.",
+ "type": "object",
+ "$defs": {
+ "input": {
+ "title": "Input",
+ "type": "object",
+ "description": "",
+ "default": "",
+ "properties": {
+ "input_vcf": {
+ "type": "string",
+ "description": "Path to input VCF file.",
+ "pattern": "^\\S+\\.vcf(.gz)?$",
+ "format": "file-path"
+ },
+ "input_phenopacket": {
+ "type": "string",
+ "description": "Path to input phenopacket JSON file.",
+ "pattern": "^\\S+\\.json?$",
+ "format": "file-path"
+ },
+ "input_variant": {
+ "type": "string",
+ "description": "Input variant in the format 'chrom-pos-ref-alt', e.g., '1-123456-A-T'.",
+ "pattern": "^(\\S+)_(\\S+)_(\\S+)_(\\S+)$"
+ },
+ "input_hpo": {
+ "type": "string",
+ "description": "Path to input HPO file.",
+ "format": "file-path"
+ },
+ "input_ped": {
+ "type": "string",
+ "description": "Path to the pedgree file for trio mode",
+ "format": "file-path"
+ },
+ "outdir": {
+ "type": "string",
+ "default": "./out",
+ "description": "Output directory."
+ },
+ "storedir": {
+ "type": "string",
+ "default": "./store",
+ "description": "Output directory."
+ },
+ "ref_dir": {
+ "type": "string",
+ "description": "Path to aim pipeline dependencies directory.",
+ "format": "directory-path"
+ },
+ "s3_bucket_data_name": {
+ "type": "string",
+ "default": "aim-data-dependencies-2.3-public",
+ "description": "[Not Implemented] S3 bucket name to aim pipeline dependencies directory.",
+ "format": "path",
+ "hidden": true
+ },
+ "run_id": {
+ "type": "string",
+ "default": "1",
+ "description": "Unique identifier for this run."
+ },
+ "ref_ver": {
+ "type": "string",
+ "default": "hg19",
+ "enum": ["hg19", "hg38"],
+ "description": "Reference genome version"
+ },
+ "bed_filter": {
+ "type": "string",
+ "description": "Path to a BED file to perform an analysis only for regions of interest"
+ },
+ "exome_filter": {
+ "type": "boolean",
+ "description": "Enable exonic filter. Addition will filter out variants outside of exonic region"
+ },
+ "impact_filter": {
+ "type": "boolean",
+ "description": "Enable low impact. Addition will enable low impact transcripts."
+ }
+ },
+ "required": ["outdir", "ref_dir", "run_id", "ref_ver"]
+ },
+ "generic_options": {
+ "title": "Generic options",
+ "type": "object",
+ "fa_icon": "fas fa-file-import",
+ "description": "Less common options for the pipeline, typically set in a config file.",
+ "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.",
+ "properties": {
+ "version": {
+ "type": "boolean",
+ "description": "Display version and exit.",
+ "fa_icon": "fas fa-question-circle",
+ "hidden": true
+ }
+ }
+ }
+ },
+ "allOf": [
+ {
+ "$ref": "#/$defs/input"
+ },
+ {
+ "$ref": "#/$defs/generic_options"
+ }
+ ]
+}
diff --git a/params.yaml.example b/params.yaml.example
deleted file mode 100644
index bf3f876..0000000
--- a/params.yaml.example
+++ /dev/null
@@ -1,9 +0,0 @@
-# Please Copy This File to `params.yaml`
-# And Complete Placeholder Marked With ,
-# Examples Are In Comment
-# NOTE: It's intended that params.yaml file is not tracked by git.
-
-ref_dir: # Example: "/home/jaeyeon/Data/aim_data_dependencies/"
-input_vcf: # Example: "/home/jaeyeon/Data/aim_test/demo.vcf.gz"
-input_hpo: # Example: "/home/jaeyeon/Data/aim_test/demo.hpos.txt"
-outdir: # Example: "./out"
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 90fa59d..587cfad 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -1,12 +1,10 @@
{
- "include": [
- "bin"
- ],
-
- "defineConstant": {
- "DEBUG": true
- },
-
- "reportMissingImports": "error",
- "reportMissingTypeStubs": false,
+ "include": ["bin"],
+
+ "defineConstant": {
+ "DEBUG": true
+ },
+
+ "reportMissingImports": "error",
+ "reportMissingTypeStubs": false
}
diff --git a/requirements.txt b/requirements.txt
index 4db1b60..006e382 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,11 @@
pandas==1.4.3
phrank
scipy
-xgboost
+xgboost<=2.1.4
tqdm
matplotlib
requests
tables
-scikit-learn==1.1.2
+shap
+lime
+scikit-learn>=1.3.0
diff --git a/resources/trio_pipeline/acceptable_header.txt b/resources/trio_pipeline/acceptable_header.txt
new file mode 100644
index 0000000..ce32661
--- /dev/null
+++ b/resources/trio_pipeline/acceptable_header.txt
@@ -0,0 +1,198 @@
+##fileformat=VCFv4.2
+##FORMAT=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PATIENT
diff --git a/resources/trio_pipeline/features.csv b/resources/trio_pipeline/features.csv
new file mode 100755
index 0000000..8ba592e
--- /dev/null
+++ b/resources/trio_pipeline/features.csv
@@ -0,0 +1 @@
+diffuse_Phrank_STRING,hgmdSymptomScore,omimSymMatchFlag,hgmdSymMatchFlag,clinVarSymMatchFlag,omimGeneFound,omimVarFound,hgmdGeneFound,hgmdVarFound,clinVarVarFound,clinVarGeneFound,clinvarNumP,clinvarNumLP,clinvarNumLB,clinvarNumB,dgvVarFound,decipherVarFound,curationScoreHGMD,curationScoreOMIM,curationScoreClinVar,conservationScoreDGV,omimSymptomSimScore,hgmdSymptomSimScore,GERPpp_RS,gnomadAF,gnomadAFg,LRT_score,LRT_Omega,phyloP100way_vertebrate,gnomadGeneZscore,gnomadGenePLI,gnomadGeneOELof,gnomadGeneOELofUpper,IMPACT,CADD_phred,CADD_PHRED,DANN_score,REVEL_score,fathmm_MKL_coding_score,conservationScoreGnomad,conservationScoreOELof,Polyphen2_HDIV_score,Polyphen2_HVAR_score,SIFT_score,zyg,FATHMM_score,M_CAP_score,MutationAssessor_score,ESP6500_AA_AF,ESP6500_EA_AF,hom,hgmd_rs,spliceAImax,nc_ClinVar_Exp,nc_HGMD_Exp,nc_isPLP,nc_isBLB,c_isPLP,c_isBLB,nc_CLNREVSTAT,c_CLNREVSTAT,nc_RANKSCORE,c_RANKSCORE,CLASS,phrank,isB/LB,isP/LP,cons_transcript_ablation,cons_splice_acceptor_variant,cons_splice_donor_variant,cons_stop_gained,cons_frameshift_variant,cons_stop_lost,cons_start_lost,cons_transcript_amplification,cons_inframe_insertion,cons_inframe_deletion,cons_missense_variant,cons_protein_altering_variant,cons_splice_region_variant,cons_splice_donor_5th_base_variant,cons_splice_donor_region_variant,c_ClinVar_Exp_Del_to_Missense,c_ClinVar_Exp_Different_pChange,c_ClinVar_Exp_Same_pChange,c_HGMD_Exp_Del_to_Missense,c_HGMD_Exp_Different_pChange,c_HGMD_Exp_Same_pChange,c_HGMD_Exp_Stop_Loss,c_HGMD_Exp_Start_Loss,IMPACT.from.Tier,TierAD,TierAR,TierAR.adj,No.Var.HM,No.Var.H,No.Var.M,No.Var.L,AD.matched,AR.matched,recessive,dominant,simple_repeat,Homozygous_Recessive_Risk,De_Novo_Risk,X_Linked_Risk,Compound_heterozygous_Risk,Inherited_dominant_Risk
diff --git a/resources/trio_pipeline/features_NDG.csv b/resources/trio_pipeline/features_NDG.csv
new file mode 100644
index 0000000..b8731d5
--- /dev/null
+++ b/resources/trio_pipeline/features_NDG.csv
@@ -0,0 +1 @@
+conservationScoreDGV,gnomadAF,gnomadAFg,LRT_score,LRT_Omega,gnomadGeneZscore,gnomadGenePLI,gnomadGeneOELofUpper,IMPACT,CADD_PHRED,conservationScoreGnomad,conservationScoreOELof,zyg,ESP6500_AA_AF,ESP6500_EA_AF,hom,spliceAImax,cons_transcript_ablation,cons_splice_acceptor_variant,cons_splice_donor_variant,cons_stop_gained,cons_frameshift_variant,cons_stop_lost,cons_start_lost,cons_transcript_amplification,cons_inframe_insertion,cons_inframe_deletion,cons_missense_variant,cons_protein_altering_variant,cons_splice_region_variant,cons_splice_donor_5th_base_variant,cons_splice_donor_region_variant,IMPACT.from.Tier,TierAD,TierAR,TierAR.adj,No.Var.HM,No.Var.H,No.Var.M,No.Var.L,simple_repeat,Homozygous_Recessive_Risk,De_Novo_Risk,X_Linked_Risk,Compound_heterozygous_Risk,Inherited_dominant_Risk
diff --git a/resources/trio_pipeline/rf_trio_NDG.job b/resources/trio_pipeline/rf_trio_NDG.job
new file mode 100644
index 0000000..e7dcc2e
Binary files /dev/null and b/resources/trio_pipeline/rf_trio_NDG.job differ
diff --git a/resources/trio_pipeline/rf_trio_base.job b/resources/trio_pipeline/rf_trio_base.job
new file mode 100644
index 0000000..aec0190
Binary files /dev/null and b/resources/trio_pipeline/rf_trio_base.job differ
diff --git a/subworkflows/local/handle_input/main.nf b/subworkflows/local/handle_input/main.nf
new file mode 100644
index 0000000..299926d
--- /dev/null
+++ b/subworkflows/local/handle_input/main.nf
@@ -0,0 +1,32 @@
+
+
+include {
+ PHENOPACKET_TO_VARIANTS_AND_HPOS; GENERATE_INPUT_VCF; GENERATE_INPUT_VARIANTS
+} from "../../../modules/local/singleton"
+
+workflow HANDLE_INPUT {
+ if (params.input_phenopacket) {
+ PHENOPACKET_TO_VARIANTS_AND_HPOS(params.input_phenopacket)
+ GENERATE_INPUT_VCF(PHENOPACKET_TO_VARIANTS_AND_HPOS.out.variants)
+ vcf = GENERATE_INPUT_VCF.out.vcf
+ hpo = PHENOPACKET_TO_VARIANTS_AND_HPOS.out.hpo
+ } else if (params.input_variant) {
+ GENERATE_INPUT_VARIANTS(params.input_variant)
+ GENERATE_INPUT_VCF(GENERATE_INPUT_VARIANTS.out.variants)
+ vcf = GENERATE_INPUT_VCF.out.vcf
+ hpo = params.input_hpo
+ } else if (params.input_vcf) {
+ vcf = file(params.input_vcf)
+ hpo = params.input_hpo
+ } else {
+ error "No input VCF or phenopacket or variant provided."
+ }
+
+ if (!hpo) {
+ hpo = file(moduleDir.resolve("../../../assets/NO_FILE"))
+ }
+
+ emit:
+ vcf = vcf
+ hpo = hpo
+}
\ No newline at end of file
diff --git a/subworkflows/local/prepare_data/main.nf b/subworkflows/local/prepare_data/main.nf
new file mode 100644
index 0000000..eddf110
--- /dev/null
+++ b/subworkflows/local/prepare_data/main.nf
@@ -0,0 +1,63 @@
+include {
+ RESTRUCTURE_LOCAL_DATA_EXCEPT_VEP;
+ RESTRUCTURE_LOCAL_DATA_ONLY_VEP;
+ STORE_S3_BUCKET_DATA_EXCEPT_VEP;
+ STORE_S3_BUCKET_DATA_ONLY_VEP;
+ SPLIT_DATA;
+ BUILD_REFERENCE_INDEX;
+} from "../../../modules/local/prepare_data"
+
+include {
+ GENERATE_MANIFEST_JSON
+} from "../../../modules/local/singleton"
+
+workflow PREPARE_DATA {
+ if (params.ref_dir) {
+ GENERATE_MANIFEST_JSON(file(params.ref_dir))
+ data_except_vep = RESTRUCTURE_LOCAL_DATA_EXCEPT_VEP(file(params.ref_dir))
+ data_only_vep = RESTRUCTURE_LOCAL_DATA_ONLY_VEP(file(params.ref_dir))
+ } else {
+ error "Not Implemented"
+ // data_except_vep = STORE_S3_BUCKET_DATA_EXCEPT_VEP()
+ // data_only_vep = STORE_S3_BUCKET_DATA_ONLY_VEP()
+ }
+
+ (
+ chrmap_file,
+ ref_filter_bed,
+ ref_annot_dir,
+ ref_var_tier_dir,
+ ref_merge_expand_dir,
+ ref_mod5_diffusion_dir,
+ ref_predict_new_dir,
+ ref_model_inputs_dir,
+ phrank_tuple,
+ omim_tuple,
+ gnomad_tuple,
+ vep_tuple
+ ) = SPLIT_DATA(
+ data_except_vep,
+ data_only_vep,
+ params.bed_filter ? file(params.bed_filter) : moduleDir.resolve("../../../assets/NO_FILE"),
+ params.exome_filter,
+ )
+
+ fasta_tuple = BUILD_REFERENCE_INDEX()
+
+ emit:
+ data = chrmap_file.map{["chrmap_file", it]}.concat(
+ ref_filter_bed.map{["ref_filter_bed", it]},
+ ref_annot_dir.map{["ref_annot_dir", it]},
+ ref_var_tier_dir.map{["ref_var_tier_dir", it]},
+ ref_merge_expand_dir.map{["ref_merge_expand_dir", it]},
+ ref_mod5_diffusion_dir.map{["ref_mod5_diffusion_dir", it]},
+ ref_predict_new_dir.map{["ref_predict_new_dir", it]},
+ ref_model_inputs_dir.map{["ref_model_inputs_dir", it]},
+
+ fasta_tuple.map{["fasta_tuple", it]},
+ gnomad_tuple.map{["gnomad_tuple", it]},
+ omim_tuple.map{["omim_tuple", it]},
+ phrank_tuple.map{["phrank_tuple", it]},
+ vep_tuple.map{["vep_tuple", it]}
+ ).toList().map{ it.collectEntries { k, v -> [(k): v]} }
+}
\ No newline at end of file
diff --git a/subworkflows/local/singleton/main.nf b/subworkflows/local/singleton/main.nf
new file mode 100644
index 0000000..a9d8493
--- /dev/null
+++ b/subworkflows/local/singleton/main.nf
@@ -0,0 +1,157 @@
+include {
+ VALIDATE_VCF; NORMALIZE_VCF; GENERATE_INPUT_VCF;
+ VCF_TO_VARIANTS; VARIANTS_TO_ENSEMBL; ENSEMBL_TO_GENESYM; GENESYM_TO_PHRANK;
+ CONVERT_GVCF; FILTER_UNPASSED; FILTER_MITO_AND_UNKOWN_CHR; FILTER_BED; FILTER_PROBAND;
+ SPLIT_VCF_BY_CHROMOSOME; ANNOTATE_BY_VEP; HPO_SIM; ANNOTATE_BY_MODULES;
+ ANNOTATE_TIER; JOIN_PHRANK; MERGE_SCORES_BY_CHROMOSOME; PREDICTION
+} from "../../../modules/local/singleton"
+
+workflow VCF_PRE_PROCESS {
+ take:
+ input_vcf
+ data
+
+ main:
+ chrmap_file = data.map { it.chrmap_file }
+ ref_filter_bed = data.map { it.ref_filter_bed }
+ fasta_tuple = data.map { it.fasta_tuple }
+ gnomad_tuple = data.map { it.gnomad_tuple }
+
+ VALIDATE_VCF(input_vcf)
+ NORMALIZE_VCF(VALIDATE_VCF.out.vcf)
+
+ CONVERT_GVCF(
+ NORMALIZE_VCF.out.vcf,
+ NORMALIZE_VCF.out.tbi,
+ fasta_tuple.map { it[0] },
+ fasta_tuple.map { it[1] },
+ fasta_tuple.map { it[2] },
+ chrmap_file
+ )
+ FILTER_UNPASSED(
+ CONVERT_GVCF.out.vcf,
+ CONVERT_GVCF.out.tbi,
+ chrmap_file
+ )
+ FILTER_MITO_AND_UNKOWN_CHR(
+ FILTER_UNPASSED.out.vcf,
+ FILTER_UNPASSED.out.tbi,
+ )
+ FILTER_BED(
+ FILTER_MITO_AND_UNKOWN_CHR.out.vcf,
+ FILTER_MITO_AND_UNKOWN_CHR.out.tbi,
+ ref_filter_bed,
+ )
+ FILTER_PROBAND(
+ FILTER_BED.out.vcf,
+ FILTER_BED.out.tbi,
+ gnomad_tuple.map { it[0] },
+ gnomad_tuple.map { it[1] },
+ gnomad_tuple.map { it[2] },
+ gnomad_tuple.map { it[3] },
+ )
+
+ emit:
+ vcf = FILTER_PROBAND.out.vcf
+}
+
+workflow PHRANK_SCORING {
+ take:
+ vcf
+ hpo
+ data
+
+ main:
+ phrank_tuple = data.map { it.phrank_tuple }
+ VCF_TO_VARIANTS(vcf)
+ VARIANTS_TO_ENSEMBL(
+ VCF_TO_VARIANTS.out,
+ phrank_tuple.map { it[0] },
+ )
+ ENSEMBL_TO_GENESYM(
+ VARIANTS_TO_ENSEMBL.out,
+ phrank_tuple.map { it[1] },
+ )
+ GENESYM_TO_PHRANK(
+ ENSEMBL_TO_GENESYM.out,
+ hpo,
+ phrank_tuple.map { it[2] },
+ phrank_tuple.map { it[3] },
+ phrank_tuple.map { it[4] },
+ phrank_tuple.map { it[5] },
+ )
+
+ emit:
+ phrank = GENESYM_TO_PHRANK.out
+}
+
+workflow GENERATE_SINGLETON_FEATURES {
+ take:
+ vcf
+ hpo
+ data
+
+ main:
+ ref_annot_dir = data.map { it.ref_annot_dir }
+ ref_var_tier_dir = data.map { it.ref_var_tier_dir }
+ ref_merge_expand_dir = data.map { it.ref_merge_expand_dir }
+ ref_mod5_diffusion_dir = data.map { it.ref_mod5_diffusion_dir }
+ omim_tuple = data.map { it.omim_tuple }
+ vep_tuple = data.map { it.vep_tuple }
+
+ SPLIT_VCF_BY_CHROMOSOME(vcf)
+ ANNOTATE_BY_VEP(
+ SPLIT_VCF_BY_CHROMOSOME.out.chr_vcfs.flatten(),
+ vep_tuple.map { it[0] },
+ vep_tuple.map { it[1] },
+ vep_tuple.map { it[2] },
+ vep_tuple.map { it[3] },
+ vep_tuple.map { it[4] },
+ vep_tuple.map { it[5] },
+ vep_tuple.map { it[6] },
+ vep_tuple.map { it[7] },
+ vep_tuple.map { it[8] },
+ vep_tuple.map { it[9] },
+ vep_tuple.map { it[10] },
+ )
+
+ HPO_SIM(
+ hpo,
+ omim_tuple.map { it[0] },
+ omim_tuple.map { it[1] },
+ omim_tuple.map { it[2] },
+ omim_tuple.map { it[3] },
+ )
+ ANNOTATE_BY_MODULES (
+ ANNOTATE_BY_VEP.out.vep_output,
+ HPO_SIM.out.hgmd_sim,
+ HPO_SIM.out.omim_sim,
+ ref_annot_dir,
+ )
+
+ PHRANK_SCORING(vcf, hpo, data)
+ ANNOTATE_TIER (
+ ANNOTATE_BY_MODULES.out.scores,
+ PHRANK_SCORING.out,
+ ref_annot_dir,
+ ref_var_tier_dir,
+ )
+ JOIN_PHRANK (
+ ANNOTATE_BY_MODULES.out.scores,
+ PHRANK_SCORING.out,
+ ref_merge_expand_dir,
+ )
+
+ MERGE_SCORES_BY_CHROMOSOME(
+ PHRANK_SCORING.out,
+ ANNOTATE_TIER.out.tier.collect(),
+ JOIN_PHRANK.out.compressed_scores.collect(),
+ ref_annot_dir,
+ ref_mod5_diffusion_dir,
+ ref_merge_expand_dir,
+ )
+
+ emit:
+ merged_matrix = MERGE_SCORES_BY_CHROMOSOME.out.merged_matrix
+ merged_compressed_scores = MERGE_SCORES_BY_CHROMOSOME.out.merged_compressed_scores
+}
diff --git a/utils/GRI_bed_filter/README.md b/utils/GRI_bed_filter/README.md
index 7038380..f66004c 100644
--- a/utils/GRI_bed_filter/README.md
+++ b/utils/GRI_bed_filter/README.md
@@ -1,13 +1,23 @@
# Genomic Region of Interest filter
+
\
\
-##### Data files for generation: \
-1. Gencode v46 gene annotation \
-2. HGMD 2022 database \
-3. Clinvar 8/6/2024 database \
+##### Data files for generation:
+
+1. Gencode v46 gene annotation\
+2. HGMD 2022 database\
+3. Clinvar 8/6/2024 database\
+4. SpliceAI v1.3 prediction data\
+
+##### BED file description:
+
+1. Version 1 --- gene only: BED \<--- protein_coding gene + HGMD mutations (50 flanking bp) + Clinvar mutations (50 flanking bp)\
+ (Note: Promoter upstream 1kb is dropped as requested by Dr.Liu)\
+2. Version 2 --- exon only: BED \<--- protein_coding gene + HGMD mutations (50 flanking bp) + Clinvar mutations (50 flanking bp) + SpliceAI predicted positions (50 flanking bp)\
+3. Additionally, initial bed files which also contain a source column can be generated, that indicates where each position entry comes from (gencode, hgmd, clinvar, or spliceAI)
-##### BED file description: \
-1. Version 1: BED \<--- protein_coding gene + HGMD mutations (50 flanking bp) + Cinvar mutations (50 flanking bp) \
-(Note: Promoter upstream 1kb is dropped as requested by Dr.Liu)
+##### BED file format:
+1. chr \| start \| end\
+2. The 1st column contains prefix of "chr" for chromosomes, for example, "chr1, chr2, .... chr Y".
\ No newline at end of file
diff --git a/utils/GRI_bed_filter/version1_gene-only/gri_v1.R b/utils/GRI_bed_filter/version1_gene-only/gri_v1.R
index 82c4625..5cd8f63 100644
--- a/utils/GRI_bed_filter/version1_gene-only/gri_v1.R
+++ b/utils/GRI_bed_filter/version1_gene-only/gri_v1.R
@@ -22,7 +22,7 @@ library(readr)
library(data.table)
# Function to generate GRI region based on genome version
-generate_gri_geneonly <- function(genome_version = "hg38") {
+generate_gri_v1 <- function(genome_version = "hg38") {
# data preparation --------------------------------------------------------
## hgmd local file name
@@ -34,7 +34,7 @@ generate_gri_geneonly <- function(genome_version = "hg38") {
hg38 <- BSgenome.Hsapiens.UCSC.hg38
size <- seqlengths(hg38)
chr_size <-
- data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24),]
+ data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24), ]
##load gencode data
url <-
@@ -58,7 +58,7 @@ generate_gri_geneonly <- function(genome_version = "hg38") {
hg19 <- BSgenome.Hsapiens.UCSC.hg19
size <- seqlengths(hg19)
chr_size <-
- data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24),]
+ data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24), ]
##load gencode data
url <-
@@ -141,7 +141,7 @@ generate_gri_geneonly <- function(genome_version = "hg38") {
mutate(source = "HGMD_2022")
## exclude "rejected variants" in HGMD database
- hgmd_noR <- hgmd[-which(hgmd$CLASS == "R"),]
+ hgmd_noR <- hgmd[-which(hgmd$CLASS == "R"), ]
hgmd_noR <- hgmd_noR |>
dplyr::mutate(seqnames = paste0("chr", seqnames))
@@ -149,7 +149,7 @@ generate_gri_geneonly <- function(genome_version = "hg38") {
hgmd_noR <- merge(hgmd_noR, chr_size, by = "seqnames")
hgmd_noR_flank <- hgmd_noR |>
dplyr::mutate(
- final_start = ifelse(start > 50, start - 50, 0),
+ final_start = ifelse(start > 50, start - 50, 1),
final_end = ifelse((end + 50) > chr_end, chr_end, end + 50),
.after = seqnames
)
@@ -201,7 +201,7 @@ generate_gri_geneonly <- function(genome_version = "hg38") {
## add flanking region
clinvar_patho_flank <- clinvar_patho |>
mutate(
- final_start = ifelse(start > 50, start - 50, 0),
+ final_start = ifelse(start > 50, start - 50, 1),
final_end = ifelse((end + 50) > chr_end, chr_end, end + 50),
.after = seqnames
)
@@ -233,6 +233,13 @@ generate_gri_geneonly <- function(genome_version = "hg38") {
## check bed coverage
coverage_bed(bed_gencode_hgmd_clinvar.dt)
+ # convert start position from 1-based into 0-based format of bed file
+ bed_gencode_hgmd_clinvar.dt <- bed_gencode_hgmd_clinvar.dt |>
+ mutate(start = pmax(0, start - 1))
+
+ bed_gencode_hgmd_clinvar <- bed_gencode_hgmd_clinvar |>
+ mutate(start = pmax(0, start - 1))
+
# write the bed file
file_name <- paste0("gene_only.", genome_version, ".bed")
write_tsv(bed_gencode_hgmd_clinvar.dt[, c(1:3)],
@@ -248,5 +255,5 @@ generate_gri_geneonly <- function(genome_version = "hg38") {
}
# Apply generate_gri_geneonly function to generate bed files --------------
-generate_gri_geneonly("hg19")
-generate_gri_geneonly("hg38")
+generate_gri_v1("hg19")
+generate_gri_v1("hg38")
diff --git a/utils/GRI_bed_filter/version2_exon-only/gri_v2.R b/utils/GRI_bed_filter/version2_exon-only/gri_v2.R
new file mode 100644
index 0000000..e607d58
--- /dev/null
+++ b/utils/GRI_bed_filter/version2_exon-only/gri_v2.R
@@ -0,0 +1,301 @@
+# BED region = all exons
+# + HGMD mutations (50 flanking bp)
+# + Clinvar mutations (50 flanking bp)
+# + spliceAI data (50 flanking bp)
+#
+# 1. gencode data:
+# https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/gencode.v46.basic.annotation.gtf.gz
+# https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/GRCh37_mapping/gencode.v46lift37.basic.annotation.gtf.gz
+#
+# 2. HGMD data:
+# HGMD_Pro_2024.3_hg38.vcf HGMD_Pro_2024.3_hg19.vcf
+#
+# 3. Clinvar data:
+# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/
+# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/
+#
+# 4. spliceAI data v1.3 is on skyriver server:
+# /home/zhijiany/data-transfer/AIM/spliceAI-2019/exome_filter
+
+### Notes: 1. Please prepare/download all the data files before running the script
+### 2. Decompress the gz files to avoid error when loading the data
+
+library(VariantAnnotation)
+library(BSgenome.Hsapiens.UCSC.hg38)
+library(BSgenome.Hsapiens.UCSC.hg19)
+library(httr)
+library(dplyr)
+library(readr)
+library(data.table)
+
+generate_gri_v2 <- function(genome_version = "hg38") {
+ # data preparation --------------------------------------------------------
+
+ ## construct splice data file name
+ snv <- paste0("snv.", genome_version, ".vcf")
+ indel <- paste0("indel.", genome_version, ".vcf")
+ columnnames <- unlist(strsplit("CHROM POS ID REF ALT QUAL FILTER INFO", "\t"))
+
+ ## load spliceAI data
+ splice_snv <- read_table(snv,
+ comment = "#",
+ col_names = columnnames,
+ col_types = "cncccccc"
+ )
+
+ splice_indel <- read_table(indel,
+ comment = "#",
+ col_names = columnnames,
+ col_types = "cncccccc"
+ )
+
+
+ ## hgmd local file name
+ hgmd_file <- paste0("HGMD_Pro_2024.3_", genome_version, ".vcf")
+
+ ## use different data based on genome version
+ if (genome_version == "hg38") {
+ ## load chr size
+ hg38 <- BSgenome.Hsapiens.UCSC.hg38
+ size <- seqlengths(hg38)
+ chr_size <-
+ data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24), ]
+
+ ## load gencode data
+ gtf_data <- rtracklayer::import("gencode.v46.basic.annotation.gtf")
+
+ ## load hgmd data
+ hgmd_data <- readVcf(hgmd_file, genome = "hg38")
+
+ ## load clinvar data
+ clinvar_data <- readVcf("clinvar.vcf.gz", genome = "hg38")
+ } else if (genome_version == "hg19") {
+ ## load chr size
+ hg19 <- BSgenome.Hsapiens.UCSC.hg19
+ size <- seqlengths(hg19)
+ chr_size <-
+ data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24), ]
+
+ ## load gencode data
+ gtf_data <- rtracklayer::import("gencode.v46lift37.basic.annotation.gtf")
+
+ ## load hgmd data
+ hgmd_data <- readVcf(hgmd_file, genome = "hg19")
+
+ ## load clinvar data
+ clinvar_data <- readVcf("clinvar.vcf.gz", genome = "hg19")
+ }
+
+
+ ## function to check bed file coverage
+ coverage_bed <- function(bed_file, chromosome_size = chr_size) {
+ ## calculate hg chr bp length
+ seq.size <- sum(chromosome_size[, 2])
+ ## calculate bed size
+ bed.size <- sum(bed_file[, 4])
+ ## get percentage
+ size.per <- bed.size / seq.size
+ print(paste(
+ paste0("bed file coverage is: ", size.per * 100, "% ;"),
+ bed.size,
+ "bp out of",
+ seq.size,
+ "bp"
+ ))
+ }
+
+ # 1. include gencode v46 annotation ----------------------------
+
+ ## convert gtf data to data.table and add chr size
+ gtf_dt <- as.data.table(gtf_data)
+ gtf_dt_size <-
+ merge(gtf_dt, chr_size, by = "seqnames") # no MT chr
+
+ ## select exon only
+ exons <-
+ gtf_dt_size[type == "exon"]
+
+ ## extract gene location info and save into bed format
+ ## add source column
+ bed_coding <- exons |>
+ dplyr::select(
+ chr = seqnames,
+ start = start,
+ end = end,
+ name = gene_name,
+ strand = strand,
+ ) |>
+ mutate(source = "GencodeV46.exon")
+
+ ## merge overlapped region
+ gr <- GRanges(
+ seqnames = bed_coding$chr,
+ ranges = IRanges(start = bed_coding$start, end = bed_coding$end)
+ )
+
+ merged_gr <- reduce(gr)
+ bed_coding.dt <- as.data.table(merged_gr)
+ coverage_bed(bed_coding.dt) # about 4% coverage
+
+
+ # 2. include hgmd 2022 variants -----------------------------------------
+
+ ## extract fields and convert to data.table
+ info_dt <- as.data.table(info(hgmd_data))
+ range_dt <- as.data.table(rowRanges(hgmd_data))
+
+ ## merged useful columns
+ hgmd <- cbind(range_dt, info_dt) |>
+ dplyr::select(seqnames, start, end, GENE, STRAND, CLASS, MUT) |>
+ mutate(source = "HGMD_2024")
+
+ ## exclude "rejected variants" in HGMD database
+ hgmd_noR <- hgmd[-which(hgmd$CLASS == "R"), ]
+ hgmd_noR <- hgmd_noR |>
+ dplyr::mutate(seqnames = paste0("chr", seqnames))
+
+ ## add 50bp flanking region
+ hgmd_noR <- merge(hgmd_noR, chr_size, by = "seqnames")
+ hgmd_noR_flank <- hgmd_noR |>
+ dplyr::mutate(
+ final_start = ifelse(start > 50, start - 50, 1),
+ final_end = ifelse((end + 50) > chr_end, chr_end, end + 50),
+ .after = seqnames
+ )
+
+ ## generate bed
+ bed_hgmd <- hgmd_noR_flank |>
+ dplyr::select(
+ chr = seqnames,
+ start = final_start,
+ end = final_end,
+ name = GENE,
+ strand = STRAND,
+ source
+ )
+
+ ## merge bed files from step1 and step2, and reduce them into non-overlapping regions
+ bed_gencode_hgmd <- rbind(bed_coding, bed_hgmd, fill = TRUE) |>
+ arrange(chr, start, end)
+
+ gr <- GRanges(
+ seqnames = bed_gencode_hgmd$chr,
+ ranges = IRanges(start = bed_gencode_hgmd$start, end = bed_gencode_hgmd$end)
+ )
+
+ merged_gr <- reduce(gr)
+ bed_gencode_hgmd.dt <- as.data.table(merged_gr)
+ coverage_bed(bed_gencode_hgmd.dt) # about 4.16% coverage
+
+ # 3. Include clinvar variants----------------------------------------
+
+ ## obtain pathogenic variants
+ info_df <-
+ as.data.frame(clinvar_data@info) # faster to load as dataframe than datatable
+ range_df <- as.data.frame(clinvar_data@rowRanges)
+ clinvar <- cbind(range_df, info_df)
+ clinvar_patho <- clinvar |>
+ filter(grepl("pathogenic", CLNSIG))
+
+ ## remove MT chromosome
+ ## rename seqnames
+ clinvar_patho <- clinvar_patho |>
+ filter(seqnames != "MT")
+ clinvar_patho <- clinvar_patho |>
+ mutate(seqnames = paste0("chr", seqnames))
+
+ ## add chromosome size column
+ clinvar_patho <- merge(clinvar_patho, chr_size, by = "seqnames")
+
+ ## add flanking region
+ clinvar_patho_flank <- clinvar_patho |>
+ mutate(
+ final_start = ifelse(start > 50, start - 50, 1),
+ final_end = ifelse((end + 50) > chr_end, chr_end, end + 50),
+ .after = seqnames
+ )
+
+ ## generate bed
+ bed_clinvar <- clinvar_patho_flank |>
+ dplyr::select(
+ chr = seqnames,
+ start = final_start,
+ end = final_end,
+ name = GENEINFO,
+ strand = strand
+ ) |>
+ mutate(source = "Clinvar_latest") # the Clinvar link provides latest vcf file
+
+ ## merge and reduce
+ bed_gencode_hgmd_clinvar <-
+ rbind(bed_gencode_hgmd, bed_clinvar) |>
+ arrange(chr, start, end)
+
+ gr <- GRanges(
+ seqnames = bed_gencode_hgmd_clinvar$chr,
+ ranges = IRanges(start = bed_gencode_hgmd_clinvar$start, end = bed_gencode_hgmd_clinvar$end)
+ )
+
+ merged_gr <- reduce(gr)
+ bed_gencode_hgmd_clinvar.dt <- as.data.table(merged_gr)
+
+ ## check bed coverage
+ coverage_bed(bed_gencode_hgmd_clinvar.dt) # coverage about 4.22%
+
+
+
+ # 4. Include spliceAI data ----------------------------------------------------
+ # Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
+
+ # merge snv and indel and add 50 flanking bp
+ splice <- rbind(splice_snv, splice_indel) |>
+ mutate(seqnames = paste0("chr", CHROM))
+ splice <- merge(splice, chr_size, by = "seqnames")
+
+ splice_flank <- splice |>
+ mutate(
+ chr = seqnames,
+ start = ifelse(POS - 50 < 1, 1, POS - 50),
+ end = ifelse(POS + 50 > chr_end, chr_end, POS + 50),
+ source = "spliceAI_v1.3"
+ ) |>
+ dplyr::select(chr, start, end, source)
+
+ ## merge and reduce
+ bed_gencode_hgmd_clinvar_splice <-
+ rbind(bed_gencode_hgmd_clinvar, splice_flank, fill = TRUE) |>
+ arrange(chr, start, end)
+
+ gr <- GRanges(
+ seqnames = bed_gencode_hgmd_clinvar_splice$chr,
+ ranges = IRanges(start = bed_gencode_hgmd_clinvar_splice$start, end = bed_gencode_hgmd_clinvar_splice$end)
+ )
+
+ merged_gr <- reduce(gr)
+ splice.dt <- as.data.table(merged_gr)
+ print("Final coverage is below:")
+ coverage_bed(splice.dt) # coverage is about (4.36% without 50bp flanks) 5.75%
+
+ # convert start position from 1-based to 0-based of bed format ------------
+ splice.dt <- splice.dt |>
+ mutate(start = pmax(0, start - 1))
+
+ bed_gencode_hgmd_clinvar_splice <- bed_gencode_hgmd_clinvar_splice |>
+ mutate(start = pmax(0, start - 1))
+
+ # save the file
+ # write the bed file
+ file_name <- paste0("exon_only.", genome_version, ".bed")
+ write_tsv(splice.dt[, c(1:3)], file = file_name, col_names = FALSE)
+
+ # save the initial bed files which are not reduced, which contains the source column to indicate where that region is from
+ file_name <- paste0("exon_only_source.", genome_version, ".bed")
+ write_tsv(bed_gencode_hgmd_clinvar_splice,
+ file = file_name,
+ col_names = FALSE
+ )
+}
+
+# Apply generate_gri_geneonly function to generate bed files --------------
+# generate_gri_v2("hg19")
+generate_gri_v2("hg38")
\ No newline at end of file
diff --git a/utils/hgmd_update/merge_expand/AA_abbreviations.txt b/utils/hgmd_update/merge_expand/AA_abbreviations.txt
new file mode 100755
index 0000000..03f1743
--- /dev/null
+++ b/utils/hgmd_update/merge_expand/AA_abbreviations.txt
@@ -0,0 +1,23 @@
+Full_Name ThreeLetter OneLetter
+Alanine Ala A
+Arginine Arg R
+Asparagine Asn N
+Aspartate Asp D
+Aspartate/Asparagine Asx B
+Cysteine Cys C
+Glutamate Glu E
+Glutamine Gln Q
+Glutamate/Glutamine Glx Z
+Glycine Gly G
+Histidine His H
+Isoleucine Ile I
+Leucine Leu L
+Lysine Lys K
+Methionine Met M
+Phenylalanine Phe F
+Proline Pro P
+Serine Ser S
+Threonine Thr T
+Tryptophan Trp W
+Tyrosine Tyr Y
+Valine Val V
diff --git a/utils/hgmd_update/merge_expand/expandvar.R b/utils/hgmd_update/merge_expand/expandvar.R
new file mode 100644
index 0000000..c53eccf
--- /dev/null
+++ b/utils/hgmd_update/merge_expand/expandvar.R
@@ -0,0 +1,1188 @@
+# This script is used to expand the variants in the HGMD vcf file to gDNA level.
+# (Originally written by @Dongxue Mao, reorganized by @Zhijian Yu)
+
+
+
+# Install and load libraries ---------------------------------------------
+required_packages <- c("vcfR", "stringr", "stringi", "reshape2",
+ "tidyr", "plyr", "tidyr", "data.table", "tictoc")
+
+missing_packages <- required_packages[!(required_packages %in%
+ rownames(installed.packages()))]
+
+if(length(missing_packages)) {
+ install.packages(missing_packages)
+}
+
+library(vcfR)
+library(stringr)
+library(stringi)
+library(reshape2)
+library(tidyr)
+library(plyr) # could potentially be replaced by dplyr, needs testing
+library("data.table")
+library(tictoc)
+
+
+# Prepare for data processing ---------------------------------------------
+
+# arg1: hgmd input vcf.gz
+# arg2: out file directory
+# arg3: reference file: AA_abbreviations.txt
+# arg4: genome version
+
+## parse the args
+args = commandArgs(trailingOnly = T)
+
+print("Printing out arguments;")
+print(paste("args1:", args[1], sep = " "))
+
+print(paste("args2:", args[2], sep = " "))
+
+print(paste("args3:", args[3], sep = " "))
+
+print(paste("args4:", args[4], sep = " "))
+
+print("")
+
+genome_version <- args[4]
+
+#set working directory
+setwd(args[2])
+
+#create output folders
+system("mkdir -p {Expand_Result,Expand_Result_final,TransVarInput,TransVarOut}")
+
+
+
+# Step 1 - ParseInfo_mc ----------------------------------------------------------------------------------------------------
+
+## step 1: parse the info in the hgmd vcf and save as RDS
+## step 2: dcast infor column and combine with other columns
+
+## parse the info in the hgmd vcf and save as RDS ####
+print("Runnning Step 1 --- ParseInfo_mc")
+tic("step1")
+
+# load the vcf
+vcf <- read.vcfR(args[1], verbose = F)
+vcf.df <- vcf@fix
+vcf.df <- as.data.frame(vcf.df)
+
+
+# separate the info column into key, value
+# rbind to long df
+vcf_dt <- as.data.table(vcf.df) # Convert to data.table for efficiency
+
+# Split the INFO column into fields, handling ";" within quotes
+vcf_dt[, fields := stri_split_regex(INFO, ';(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)', simplify = FALSE)]
+
+# Unnest the fields into long format
+result_dt <- vcf_dt[, .(field = unlist(fields)), by = ID]
+
+# Split each field into key and value based on the first "=" using sub
+result_dt[, key := sub("=.*", "", field)]
+result_dt[, value := sub("^[^=]*=", "", field)]
+
+# Select and reorder the desired columns, including 'field'
+results_df <- result_dt[, .(field, key, value, id = ID)]
+
+# Convert to data.frame
+results_df <- as.data.frame(results_df)
+
+print("saving result")
+saveRDS(results_df, file.path(args[2], "hgmd_mc.rds"))
+toc()
+
+# Step 2 - Combine info ----------------------------------------------------------------------------------------------------
+
+print("Runnning Step 2 --- CombineInfo")
+tic("step2")
+
+## dcast info column and combine with other columns ####
+info.l <- readRDS(file.path(args[2], "hgmd_mc.rds"))
+info.w <- reshape2::dcast(info.l, id ~ key, value.var = "value")
+
+# combine with other columns
+vcf <- read.vcfR(args[1], verbose = FALSE)
+vcf.df <- vcf@fix
+vcf.df <- as.data.frame(vcf.df)
+
+vcf.all <- merge(vcf.df[, 1:7],
+ info.w,
+ by.x = "ID",
+ by.y = "id",
+ all = T)
+
+saveRDS(vcf.all, file.path(args[2], "hgmd_all.rds"))
+toc()
+
+
+# Step 3&4 - pro2genome ----------------------------------------------------------------------------------------------------
+
+tic("step 3 and 4")
+print("Runnning Step 3&4 --- pro2genome")
+
+# parse pro2g result from TransVar
+vcf.all <- readRDS(file.path(args[2], "hgmd_all.rds"))
+
+# keep CLASS == "DM" and "DM?"
+vcf.all <- vcf.all[vcf.all$CLASS %in% c("DM", "DM?"), ]
+write.table(
+ vcf.all,
+ file.path(args[2], "hgmd_DM.txt"),
+ sep = "\t",
+ col.names = T,
+ row.names = F,
+ quote = F
+)
+saveRDS(vcf.all, file.path(args[2], "hgmd_DM.rds"))
+
+# load var in HGMD
+hgmd.orig <- readRDS(file.path(args[2], "hgmd_DM.rds"))
+hgmd <- hgmd.orig
+# parse PROT column
+hgmd$PROT2 <- gsub(".[0-9]+%3A", ":", hgmd$PROT)
+hgmd$PROT2 <- gsub("%3B", ";", hgmd$PROT2)
+hgmd$PROT2 <- gsub("%3D", "=", hgmd$PROT2)
+hgmd$PROT2 <- gsub("\\(|\\)", "", hgmd$PROT2)
+
+## pre-filter the HGMD entries ####
+# parse PROT column
+hgmd.part <- hgmd[, c("ID", "GENE", "PROT", "PROT2")]
+
+# drop fs var (no expansion for fs)
+hgmd.filt <- hgmd.part[!grepl("fs", hgmd.part$PROT), ]
+
+# drop var with multiple protein change (no expansion for them) (there are 48)
+hgmd.filt <- hgmd.filt[!grepl(";", hgmd.filt$PROT2), ]
+
+# drop synonymous var (will parse as non-coding)
+hgmd.filt <- hgmd.filt[!grepl("=", hgmd.filt$PROT2), ]
+
+# drop stop loss or stop gain (with * in the PROT)
+hgmd.filt <- hgmd.filt[!grepl("\\*", hgmd.filt$PROT2), ]
+
+# drop inframe insertions and delins (ins and delins)
+hgmd.filt <- hgmd.filt[!grepl("ins", hgmd.filt$PROT2), ]
+hgmd.filt <- hgmd.filt[!grepl("dup", hgmd.filt$PROT2), ]
+
+# drop inframe deletions that affect more than 1 aa
+del <- hgmd.filt[grepl("del", hgmd.filt$PROT2), ]
+del <- separate(
+ data = del,
+ col = "PROT2",
+ into = c("NP_ID", "pChange"),
+ remove = F,
+ sep = ":"
+)
+del_multi_AA <- del[grepl("_", del$pChange), ]
+hgmd.filt <- hgmd.filt[!(hgmd.filt$ID %in% del_multi_AA$ID), ]
+
+hgmd.non.coding <- hgmd[!(hgmd$ID %in% hgmd.filt$ID), ]
+
+# check No of del with 2 AA del
+del_multi_AA$pChange <- gsub("del", "", del_multi_AA$pChange)
+del_multi_AA$pChange <- gsub("p\\.", "", del_multi_AA$pChange)
+del_multi_AA <- separate(
+ data = del_multi_AA,
+ col = "pChange",
+ into = c("p_start", "p_end"),
+ remove = F,
+ sep = "_"
+)
+del_multi_AA$p_start <- str_extract(del_multi_AA$p_start, "[0-9]+$")
+del_multi_AA$p_end <- str_extract(del_multi_AA$p_end, "[0-9]+$")
+del_multi_AA$del_plength <- as.numeric(del_multi_AA$p_end) - as.numeric(del_multi_AA$p_start) + 1
+
+# drop ones that are NA in PROT
+hgmd.filt <- hgmd.filt[!is.na(hgmd.filt$PROT), ]
+
+# save hgmd.filt
+write.table(
+ hgmd.filt,
+ file.path(args[2], "Expand_Result", "hgmd.filt.txt"),
+ sep = "\t",
+ col.names = T,
+ row.names = F,
+ quote = F
+)
+saveRDS(hgmd.filt, file.path(args[2], "Expand_Result", "hgmd.filt.rds"))
+
+## parse the single AA del ####
+# Add alt of the AA (all possible change of change, except the ref)
+del.1aa <- hgmd.filt[grepl("del", hgmd.filt$PROT2), ]
+del.1aa <- separate(
+ data = del.1aa,
+ col = "PROT2",
+ into = c("NP_ID", "pChange"),
+ remove = F,
+ sep = ":"
+)
+del.1aa$pChange <- gsub("del", "", del.1aa$pChange)
+del.1aa$pChange <- gsub("p\\.", "", del.1aa$pChange)
+del.1aa$pLoc <- str_extract(del.1aa$pChange, "[0-9]+$")
+del.1aa$pRef <- str_extract(del.1aa$pChange, "^[A-z]+")
+
+# load AA table
+AA.table <- read.delim2(args[3], sep = "\t", header = T)
+AA.filt <- AA.table[!grepl("\\/", AA.table$Full_Name), ]
+AA.3letter <- AA.filt$ThreeLetter
+AA.1letter <- AA.filt$OneLetter
+# c('Ile','Phe','Lys','Tyr','Val','Glu','Met','Thr','Leu','Asp','Gly','Gln','Pro','Ser','His','Asn','Cys','Arg','Ala','Trp','Ter')
+
+del.1aa2 <- del.1aa[rep(seq_len(nrow(del.1aa)), each = 20), ]
+del.1aa2$pAlt <- AA.3letter
+
+# remove Synonymous
+del.1aa3 <- del.1aa2[del.1aa2$pRef != del.1aa2$pAlt, ]
+
+# format the expanded AA to PROT2 format
+del.1aa3$PROT2_exp <- paste0(del.1aa3$NP_ID,
+ ":p.",
+ del.1aa3$pRef,
+ del.1aa3$pLoc,
+ del.1aa3$pAlt)
+
+##TransVar with del.1aa ####
+
+var4transVar.Del <- unique(del.1aa3$PROT2_exp)
+write.table(
+ var4transVar.Del,
+ file.path(args[2], "TransVarInput", "del_var.txt"),
+ quote = F,
+ col.names = F,
+ row.names = F
+)
+
+# Warning from TransVar [_annotate_snv_protein] warning: unknown alternative: TER, ignore alternative.
+
+
+## run TransVar step 1 ####
+system(
+ paste(
+ "conda run -n hgmd transvar panno --refseq -l TransVarInput/del_var.txt --idmap protein_id --refversion",
+ genome_version,
+ "> TransVarOut/out_del_1aa.txt"
+ ),
+ intern = FALSE
+)
+
+# load result
+del2g.orig <- read.delim2(
+ file.path(args[2], "TransVarOut", "out_del_1aa.txt"),
+ header = T,
+ stringsAsFactors = F
+) # 59567-2022version 80678-2024version
+del2g <- del2g.orig[del2g.orig$transcript != ".", ]
+
+# del2g <- del2g[grepl("candidate_codons=",del2g$info),] # 50711
+var <- unique(del2g$input)
+failed.del2g <- del.1aa3[!(del.1aa3$PROT2_exp %in% var) &
+ del.1aa3$pAlt != "Ter", ]
+
+length(unique(failed.del2g$ID))
+
+# add the HGMD info
+# 1. add the HGMD ID
+del2g.wID <- merge(del2g,
+ del.1aa3[, c("PROT2_exp", "ID", "PROT2")],
+ by.x = "input",
+ by.y = "PROT2_exp",
+ all.x = T)
+
+# save the result
+write.table(
+ del2g.wID,
+ file.path(args[2], "Expand_Result", "del2g.wID.txt"),
+ sep = "\t",
+ col.names = T,
+ row.names = F,
+ quote = F
+)
+saveRDS(del2g.wID, file.path(args[2], "Expand_Result", "del2g.wID.rds"))
+
+
+##TransVar with NP ####
+
+# select var to run TransVar: non del
+NP_df <- hgmd.filt[!(hgmd.filt$PROT2 %in% del.1aa3$PROT2), ]
+
+var4transVar.p <- unique(NP_df$PROT2)
+table(grepl("\\?", var4transVar.p))
+
+write.table(
+ var4transVar.p[!grepl("\\?", var4transVar.p)],
+ file.path(args[2], "TransVarInput", "NP_var.txt"),
+ quote = F,
+ col.names = F,
+ row.names = F
+)
+
+NP_df_exp <- NP_df
+NP_df_exp <- separate(
+ data = NP_df_exp,
+ col = "PROT2",
+ into = c("NP_ID", "pChange"),
+ remove = F,
+ sep = ":"
+)
+NP_df_exp$pChange <- gsub("p\\.", "", NP_df_exp$pChange)
+NP_df_exp$pAlt <- str_extract(NP_df_exp$pChange, "([A-z]|\\?)+$")
+NP_df_exp$pRef <- str_extract(NP_df_exp$pChange, "^[A-z]+")
+NP_df_exp$pLoc <- str_extract(NP_df_exp$pChange, "[0-9]+")
+
+# AA in different format 1 letter or 3 letter
+# parse them separately
+
+NP_df_exp_3aa <- NP_df_exp[NP_df_exp$pAlt %in% AA.3letter, ]
+NP_df_exp_1aa <- NP_df_exp[NP_df_exp$pAlt %in% LETTERS, ]
+NP_df_exp_qMark <- NP_df_exp[NP_df_exp$pAlt == "?", ]
+
+# 3 letters
+NP_df_exp_3aa <- NP_df_exp_3aa[rep(seq_len(nrow(NP_df_exp_3aa)), each = 20), ]
+NP_df_exp_3aa$pAlt <- AA.3letter
+NP_df_exp_3aa <- NP_df_exp_3aa[NP_df_exp_3aa$pRef != NP_df_exp_3aa$pAlt, ]
+NP_df_exp_3aa$PROT2_exp <- paste0(
+ NP_df_exp_3aa$NP_ID,
+ ":p.",
+ NP_df_exp_3aa$pRef,
+ NP_df_exp_3aa$pLoc,
+ NP_df_exp_3aa$pAlt
+)
+
+# 1 letter
+NP_df_exp_1aa <- NP_df_exp_1aa[rep(seq_len(nrow(NP_df_exp_1aa)), each = 20), ]
+NP_df_exp_1aa$pAlt <- AA.1letter
+NP_df_exp_1aa <- NP_df_exp_1aa[NP_df_exp_1aa$pRef != NP_df_exp_1aa$pAlt, ]
+NP_df_exp_1aa$PROT2_exp <- paste0(
+ NP_df_exp_1aa$NP_ID,
+ ":p.",
+ NP_df_exp_1aa$pRef,
+ NP_df_exp_1aa$pLoc,
+ NP_df_exp_1aa$pAlt
+)
+
+pRef <- as.data.frame(table(NP_df_exp_qMark$pRef))
+pRef <- pRef$Var1
+# check if all are start loss
+length(setdiff(pRef, c("M", "Met"))) == 0
+
+# rename pRef
+NP_df_exp_qMark$pRef = "Met"
+NP_df_exp_qMark <- NP_df_exp_qMark[rep(seq_len(nrow(NP_df_exp_qMark)), each = 19), ]
+AA.nonStart = AA.3letter[AA.3letter != "Met"]
+
+NP_df_exp_qMark$pAlt <- AA.nonStart
+NP_df_exp_qMark$PROT2_exp <- paste0(
+ NP_df_exp_qMark$NP_ID,
+ ":p.",
+ NP_df_exp_qMark$pRef,
+ NP_df_exp_qMark$pLoc,
+ NP_df_exp_qMark$pAlt
+)
+
+NP_df_exp_final <- rbind(NP_df_exp_3aa, NP_df_exp_1aa, NP_df_exp_qMark)
+
+var4transVar.missense <- unique(c(
+ NP_df_exp_3aa$PROT2_exp,
+ NP_df_exp_1aa$PROT2_exp,
+ NP_df_exp_qMark$PROT2_exp
+)) # 2186825
+write.table(
+ var4transVar.missense,
+ file.path(args[2], "TransVarInput", "NP_missense_var.txt"),
+ quote = F,
+ col.names = F,
+ row.names = F
+)
+
+##run TransVar step 2 ####
+system(
+ paste(
+ "conda run -n hgmd transvar panno -l TransVarInput/NP_var.txt --refseq --idmap protein_id --refversion",
+ genome_version,
+ "> TransVarOut/out_NP.txt"
+ ),
+ intern = FALSE
+)
+
+system(
+ paste(
+ "conda run -n hgmd transvar panno -l TransVarInput/NP_missense_var.txt --refseq --idmap protein_id --refversion",
+ genome_version,
+ "> TransVarOut/out_NP_missense.txt"
+ ),
+ intern = FALSE
+)
+
+## load transVar NP result ####
+# NP
+NP2g.orig <- read.delim2(
+ file.path(args[2], "TransVarOut", "out_NP.txt"),
+ header = T,
+ stringsAsFactors = F
+) # 166544
+# write.table(NP2g.orig, file.path(args[2],"TransVarOut","out_NP.txt"), col.names = T, row.names = F, quote = F, sep = "\t")
+
+NP2g <- NP2g.orig[NP2g.orig$transcript != ".", ]
+NP2g <- NP2g[grepl("candidate_codons=", NP2g$info), ] # all failed are M1?, start loss, can be rescued in NP2g_exp
+
+# check failed
+var <- unique(NP2g$input) # 164045
+# failed.NP.var <- var4transVar.p[!(var4transVar.p %in% var)] # 2303
+
+# NP exp
+NP2g_exp.orig <- fread(
+ file.path(args[2], "TransVarOut", "out_NP_missense.txt"),
+ header = T,
+ stringsAsFactors = F,
+ sep = "\t"
+)
+
+NP2g_exp <- NP2g_exp.orig[NP2g_exp.orig$transcript != ".", ]
+# NP2g_exp <- NP2g_exp[grepl("candidate_codons=",NP2g_exp$info),] # 2167881
+
+# check failed
+var_exp <- unique(NP2g_exp$input)
+
+NP2g_exp.wID <- merge(
+ NP2g_exp,
+ NP_df_exp_final[, c("PROT2_exp", "ID", "PROT2")],
+ by.x = "input",
+ by.y = "PROT2_exp",
+ all.x = T
+)
+write.table(
+ NP2g_exp.wID,
+ file.path(args[2], "Expand_Result", "NP2g_exp.wID.txt"),
+ sep = "\t",
+ col.names = T,
+ row.names = F,
+ quote = F
+)
+saveRDS(NP2g_exp.wID,
+ file.path(args[2], "Expand_Result", "NP2g_exp.wID.rds"))
+
+
+## ADD hgmd ID to the NP2g result ####
+NP2g.wID <- merge(NP2g,
+ hgmd.filt,
+ by.x = "input",
+ by.y = "PROT2",
+ all.x = T)
+NP2g.wID$input_type <- "hgmd.NP_ID"
+NP2g.wID$PROT2 <- NP2g.wID$input
+
+# save the result
+write.table(
+ NP2g.wID,
+ file.path(args[2], "Expand_Result", "NP2g.wID.txt"),
+ sep = "\t",
+ col.names = T,
+ row.names = F,
+ quote = F
+)
+saveRDS(NP2g.wID, file.path(args[2], "Expand_Result", "NP2g.wID.rds"))
+
+## with gDNA ####
+
+## step 1: gDNA to protein change. ####
+# most failed due to change in the Gene name/ transcript ID / protein location
+# for those, run TransVar with gDNA -> pro change
+# then run TransVar with pro change -> get the gDNA change
+
+var <- unique(c(NP2g.wID$input, del2g.wID$PROT2, NP2g_exp.wID$PROT2))
+failed.NP2g <- hgmd.filt[!(hgmd.filt$PROT2 %in% var), ]
+failed.ID <- unique(failed.NP2g$ID)
+
+# get the gDNA for the failed.NP2g (failed NP and Gene)
+gDNA.failed.NP2g <- hgmd[hgmd$ID %in% failed.ID, c("ID", "CHROM", "POS", "REF", "ALT", "PROT2")]
+gDNA.failed.NP2g$gDNA <- paste0(
+ "chr",
+ gDNA.failed.NP2g$CHROM,
+ ":g.",
+ gDNA.failed.NP2g$POS,
+ gDNA.failed.NP2g$REF,
+ ">",
+ gDNA.failed.NP2g$ALT
+)
+
+write.table(
+ unique(gDNA.failed.NP2g$gDNA),
+ file.path(args[2], "TransVarInput", "gDNA_var.txt"),
+ quote = F,
+ col.names = F,
+ row.names = F
+)
+
+## run TransVar step 3 ####
+system(
+ paste(
+ "conda run -n hgmd transvar ganno -l TransVarInput/gDNA_var.txt --refseq --idmap protein_id --refversion",
+ genome_version,
+ "> TransVarOut/out_gDNA.txt"
+ ),
+ intern = FALSE
+)
+
+# extract the ref and alt protein for future mapping
+gDNA.failed.NP2g.part <- gDNA.failed.NP2g[, c("ID", "gDNA", "PROT2")]
+gDNA.failed.NP2g.part <- separate(
+ data = gDNA.failed.NP2g.part,
+ col = "PROT2",
+ into = c("NP_ID", "pChange"),
+ remove = F,
+ sep = ":"
+)
+gDNA.failed.NP2g.part$pChange <- gsub("p\\.", "", gDNA.failed.NP2g.part$pChange)
+gDNA.failed.NP2g.part$pAlt <- str_extract(gDNA.failed.NP2g.part$pChange, "([A-z]|\\?)+$")
+gDNA.failed.NP2g.part$pRef <- str_extract(gDNA.failed.NP2g.part$pChange, "^[A-z]+")
+gDNA.failed.NP2g.part$pLoc <- str_extract(gDNA.failed.NP2g.part$pChange, "[0-9]+")
+
+# replace 3 letter AA to 1 letter AA (TransVar using 1 letter AA)
+gDNA.failed.NP2g.part$pAlt <- mapvalues(gDNA.failed.NP2g.part$pAlt,
+ from = AA.table$ThreeLetter,
+ to = AA.table$OneLetter)
+gDNA.failed.NP2g.part$pRef <- mapvalues(gDNA.failed.NP2g.part$pRef,
+ from = AA.table$ThreeLetter,
+ to = AA.table$OneLetter)
+# consider stop: U and Ter
+gDNA.failed.NP2g.part$pRef <- mapvalues(gDNA.failed.NP2g.part$pRef,
+ from = c("U", "Ter"),
+ to = c("*", "*"))
+
+# load result
+pDNA2pro.orig <- read.delim2(
+ file.path(args[2], "TransVarOut", "out_gDNA.txt"),
+ header = T,
+ stringsAsFactors = F
+)
+pDNA2pro <- pDNA2pro.orig[grepl("p.", pDNA2pro.orig$coordinates.gDNA.cDNA.protein.), c("input", "gene", "coordinates.gDNA.cDNA.protein.", "info")]
+
+# parse to info and extract the aliases(NP ID)
+# vectorized transformation
+library(dplyr)
+info.df <- pDNA2pro |>
+ mutate(original_info = as.character(info)) |>
+ separate_rows(info, sep = ";") |>
+ rename(fields = info) |>
+ separate(
+ fields,
+ into = c("key", "value"),
+ sep = "=",
+ fill = "right",
+ extra = "merge",
+ remove = FALSE
+ ) |>
+ rename(coordinates = coordinates.gDNA.cDNA.protein.) |>
+ select(fields, key, value, original_info, coordinates, input) |>
+ rename(info = original_info)
+library(plyr)
+
+info.w.orig <- reshape2::dcast(
+ info.df,
+ formula = input + coordinates + info ~ key,
+ value.var = "value",
+ fill = NULL
+)
+info.w <- info.w.orig[, c("input", "coordinates", "aliases")]
+info.w <- unique(info.w)
+info.w <- info.w %>%
+ separate(
+ col = "coordinates",
+ c("gDNA", "cDNA", "protein"),
+ sep = "/",
+ remove = T
+ )
+info.w <- info.w[, c("input", "aliases", "protein")]
+colnames(info.w) <- c("input", "TransVar_NP_ID", "TransVar_pChange")
+# extract TransVar_pRef, TransVar_pAlt
+info.w$TransVar_pChange <- gsub("p\\.", "", info.w$TransVar_pChange)
+info.w$TransVar_pAlt <- str_extract(info.w$TransVar_pChange, "([A-z]|\\?|\\*)+$")
+info.w$TransVar_pRef <- str_extract(info.w$TransVar_pChange, "^[A-z]+|\\*")
+info.w$TransVar_pLoc <- str_extract(info.w$TransVar_pChange, "[0-9]+")
+
+# compare with original HGMD pRef, pAlt
+pDNA2pro.wID <- merge(
+ gDNA.failed.NP2g.part,
+ info.w,
+ by.x = "gDNA",
+ by.y = "input",
+ all.y = T
+)
+pDNA2pro.wID.all <- merge(
+ gDNA.failed.NP2g.part,
+ info.w,
+ by.x = "gDNA",
+ by.y = "input",
+ all = T
+)
+ID.failed <- gDNA.failed.NP2g.part$ID[!(gDNA.failed.NP2g.part$gDNA %in% info.w$input)] # 150
+# keep the entries where both pRef and pAlt matches
+
+# Some of the cases are difficult
+# 1. inframe del --- parse separately
+# 2. start loss --- parse separately
+# 3. reversed p ref/alt from HGMD (most likely error from hgmd) handled by the for loop below
+
+pDNA2pro.wID.selected <- data.frame()
+
+# Vectorized Transformation with dplyr
+library(dplyr)
+
+pDNA2pro.wID.selected <- pDNA2pro.wID %>%
+ mutate(
+ priority = case_when(
+ pRef == TransVar_pRef & pAlt == TransVar_pAlt ~ 1,
+ pAlt == TransVar_pRef & pRef == TransVar_pAlt ~ 2,
+ pRef == TransVar_pRef &
+ paste0(pAlt, pRef) == TransVar_pAlt ~ 3,
+ pRef == TransVar_pRef & pRef == "M" & pLoc == "1" ~ 4,
+ TRUE ~ NA_real_
+ )
+ ) %>%
+ filter(!is.na(priority)) %>%
+ group_by(ID) %>%
+ slice(1) %>%
+ select(-priority) |>
+ ungroup()
+
+# reload plyr to mask functions for compatibility
+library(plyr)
+
+# pDNA2pro.failed <- gDNA.failed.NP2g.part[gDNA.failed.NP2g.part$ID %in% ID.failed,]
+pDNA2pro.failed <- pDNA2pro.wID.all[pDNA2pro.wID.all$ID %in% ID.failed, ]
+length(unique(pDNA2pro.failed$gDNA))
+# "CD169908" "CD198588", TransVar failed to map to a pChange, therefore, no expansion can be done for this two cases.
+
+# expand the pDNA2pro.wID
+pDNA2pro.wID_exp <- pDNA2pro.wID[rep(seq_len(nrow(pDNA2pro.wID)), each = 20), ]
+pDNA2pro.wID_exp$exp.pAlt <- AA.filt$OneLetter
+
+pDNA2pro.wID_exp <- pDNA2pro.wID_exp[pDNA2pro.wID_exp$TransVar_pRef != pDNA2pro.wID_exp$exp.pAlt, ]
+pDNA2pro.wID_exp$PROT2_exp <- paste0(
+ pDNA2pro.wID_exp$TransVar_NP_ID,
+ ":p.",
+ pDNA2pro.wID_exp$TransVar_pRef,
+ pDNA2pro.wID_exp$TransVar_pLoc,
+ pDNA2pro.wID_exp$exp.pAlt
+)
+
+write.table(
+ unique(pDNA2pro.wID_exp$PROT2_exp),
+ file.path(args[2], "TransVarInput", "gDNA2pro_exp_var.txt"),
+ quote = F,
+ col.names = F,
+ row.names = F
+)
+
+## run transvar step 4 ####
+system(
+ paste(
+ "conda run -n hgmd transvar panno -l TransVarInput/gDNA2pro_exp_var.txt --refseq --idmap protein_id --refversion",
+ genome_version,
+ "> TransVarOut/out_gDNA2pro_exp.txt"
+ ),
+ intern = FALSE
+)
+
+## step 2: then map back to gDNA ####
+
+# load result
+gDNA2pro2g.orig <- read.delim2(
+ file.path(args[2], "TransVarOut", "out_gDNA2pro_exp.txt"),
+ header = T,
+ stringsAsFactors = F
+)
+gDNA2pro2g <- gDNA2pro2g.orig[gDNA2pro2g.orig$transcript != ".", ]
+gDNA2pro2g <- gDNA2pro2g[grepl("candidate_codons=", gDNA2pro2g$info), ] # same nrow as gDNA2pro2g.orig
+gDNA2pro2g$idx <- rownames(gDNA2pro2g)
+g2p2g.info <- gDNA2pro2g[, c("idx", "info")]
+
+colnames(gDNA2pro2g)[1] <- "PROT2_exp"
+gDNA2pro2g.wID <- merge(
+ gDNA2pro2g,
+ pDNA2pro.wID_exp,
+ by.x = "PROT2_exp",
+ by.y = "PROT2_exp",
+ all = T
+)
+
+## save the gDNA2pro2g result ####
+write.table(
+ gDNA2pro2g.wID,
+ file.path(args[2], "Expand_Result", "gDNA2pro2g.wID.txt"),
+ sep = "\t",
+ col.names = T,
+ row.names = F,
+ quote = F
+)
+saveRDS(gDNA2pro2g.wID,
+ file.path(args[2], "Expand_Result", "gDNA2pro2g.wID.rds"))
+toc()
+
+
+# Step 5 - gDNA2VCF ----------------------------------------------------------------------------------------------------
+# step 5: extract the gDNA change and convert to VCF formats
+
+print("Runnning Step 5 --- gDNA2VCF")
+tic("step 5")
+
+g.built <- genome_version
+del.orig <- readRDS(file.path(args[2], "Expand_Result", "del2g.wID.rds"))
+NP_exp.orig <- readRDS(file.path(args[2], "Expand_Result", "NP2g_exp.wID.rds"))
+gDNA.orig <- readRDS(file.path(args[2], "Expand_Result", "gDNA2pro2g.wID.rds"))
+
+# hgmd.orig <- readRDS("hgmd.filt.rds")
+hgmd.orig <- readRDS(file.path(args[2], "hgmd_DM.rds"))
+hgmd <- hgmd.orig[, c("ID", "RANKSCORE", "CLASS")]
+
+# function to parse the info column
+# parse to info and extract the all the possible gDNA changes
+library(dplyr) #use dplyr instead of plyr here
+parse_info <- function(input.df) {
+ input.df$idx <- as.numeric(rownames(input.df))
+ infor.wIdx <- input.df[, c("idx", "info")]
+
+ # Vectorized Transformation
+ info.df <- infor.wIdx %>%
+ separate_rows(info, sep = ";") %>%
+ separate(info,
+ into = c("key", "value"),
+ sep = "=",
+ fill = "right") %>%
+ select(idx, key, value)
+
+ info.df$idx <- as.numeric(info.df$idx)
+ info.df.w <- reshape2::dcast(
+ info.df,
+ formula = idx ~ key,
+ value.var = "value",
+ fill = NULL
+ )
+
+ if (sum(info.df.w$idx != infor.wIdx$idx) == 0) {
+ output.df <- cbind(input.df, info.df.w)
+ output.df$idx <- NULL
+ output.df$coordinates.gDNA <- gsub("\\/.*", "", output.df$coordinates.gDNA.cDNA.protein.)
+
+ return(output.df)
+ } else {
+    stop("parse_info: idx mismatch between parsed info and input rows") # fail loudly instead of printing and silently returning NULL
+ }
+
+}
+
+# parse the info
+del.parse.info <- parse_info(del.orig)
+saveRDS(del.parse.info,
+ file.path(args[2], "Expand_Result", "Step5_del.parse.info.rds"))
+
+
+NP_exp.orig <- as.data.table(NP_exp.orig)
+colnames(NP_exp.orig)[5] <- "coordinates.gDNA.cDNA.protein."
+NP_exp.parse.info <- parse_info(NP_exp.orig)
+saveRDS(
+ NP_exp.parse.info,
+ file.path(args[2], "Expand_Result", "Step5_NP_exp.parse.info.rds")
+)
+
+gDNA.parse.info <- parse_info(gDNA.orig)
+saveRDS(
+ gDNA.parse.info,
+ file.path(args[2], "Expand_Result","Step5_gDNA.parse.info.rds")
+)
+
+del.parse.info <- readRDS(file.path(args[2], "Expand_Result", "Step5_del.parse.info.rds"))
+NP_exp.parse.info <- readRDS(file.path(args[2], "Expand_Result", "Step5_NP_exp.parse.info.rds"))
+gDNA.parse.info <- readRDS(file.path(args[2], "Expand_Result", "Step5_gDNA.parse.info.rds"))
+
+
+library(plyr) #go back with plyr
+if (T) {
+ del.failed.idx <- is.na(del.parse.info$candidate_mnv_variants) &
+ is.na(del.parse.info$candidate_snv_variants) &
+ is.na(del.parse.info$coordinates.gDNA)
+ del.failed <- del.parse.info[del.failed.idx, ]
+ del.filt <- del.parse.info[!del.failed.idx, ]
+
+ del.gDNA <- del.filt[, c(
+ "ID",
+ "PROT2",
+ "input",
+ "candidate_mnv_variants",
+ "candidate_snv_variants",
+ "coordinates.gDNA"
+ )]
+
+ # parse the candidate variants
+
+ del.gDNA.mnv <- del.filt[!is.na(del.filt$candidate_mnv_variants), c("ID", "PROT2", "input", "candidate_mnv_variants")]
+ del.gDNA.snv <- del.filt[!is.na(del.filt$candidate_snv_variants), c("ID", "PROT2", "input", "candidate_snv_variants")]
+ del.gDNA.orig <- del.filt[!is.na(del.filt$coordinates.gDNA), c("ID", "PROT2", "input", "coordinates.gDNA")]
+ # colnames(del.gDNA.mnv) <- colnames(del.gDNA.snv) <- colnames(del.gDNA.orig) <- c("ID","PROT2_exp","input","candidate_variants")
+  colnames(del.gDNA.mnv) <- colnames(del.gDNA.snv) <- colnames(del.gDNA.orig) <- c("ID", "PROT2", "PROT2_exp", "candidate_variants") # edits Jan 24, 2023; NOTE(review): del.gDNA.mnv is renamed here but omitted from the rbind below, unlike the NP_exp and gDNA sections which include their mnv sets -- confirm the exclusion is intentional
+ del.gDNA <- rbind(del.gDNA.snv, del.gDNA.orig)
+
+ del.gDNA <- del.gDNA %>%
+ mutate(candidate_variants = strsplit(candidate_variants, ",")) %>%
+ unnest(candidate_variants) %>%
+ as.data.frame()
+
+ NP_exp.failed.idx <- is.na(NP_exp.parse.info$candidate_mnv_variants) &
+ is.na(NP_exp.parse.info$candidate_snv_variants) &
+ is.na(NP_exp.parse.info$coordinates.gDNA)
+ NP_exp.failed <- NP_exp.parse.info[NP_exp.failed.idx, ]
+ NP_exp.filt <- NP_exp.parse.info[!NP_exp.failed.idx, ]
+
+ NP_exp.gDNA <- NP_exp.filt[, c(
+ "ID",
+ "PROT2",
+ "input",
+ "candidate_mnv_variants",
+ "candidate_snv_variants",
+ "coordinates.gDNA"
+ )]
+
+ # parse the candidate variants
+
+ NP_exp.gDNA.mnv <- NP_exp.filt[!is.na(NP_exp.filt$candidate_mnv_variants), c("ID", "PROT2", "input", "candidate_mnv_variants")]
+ NP_exp.gDNA.snv <- NP_exp.filt[!is.na(NP_exp.filt$candidate_snv_variants), c("ID", "PROT2", "input", "candidate_snv_variants")]
+ NP_exp.gDNA.orig <- NP_exp.filt[!is.na(NP_exp.filt$coordinates.gDNA), c("ID", "PROT2", "input", "coordinates.gDNA")]
+ colnames(NP_exp.gDNA.mnv) <- colnames(NP_exp.gDNA.snv) <- colnames(NP_exp.gDNA.orig) <- c("ID", "PROT2", "PROT2_exp", "candidate_variants")
+ NP_exp.gDNA <- rbind(NP_exp.gDNA.snv, NP_exp.gDNA.mnv, NP_exp.gDNA.orig)
+
+ NP_exp.gDNA <- NP_exp.gDNA %>%
+ mutate(candidate_variants = strsplit(candidate_variants, ",")) %>%
+ unnest(candidate_variants) %>%
+ as.data.frame()
+
+ ## gDNA ####
+
+ gDNA.failed.idx <- is.na(gDNA.parse.info$candidate_mnv_variants) &
+ is.na(gDNA.parse.info$candidate_snv_variants) &
+ is.na(gDNA.parse.info$coordinates.gDNA)
+ gDNA.failed <- gDNA.parse.info[gDNA.failed.idx, ]
+ gDNA.filt <- gDNA.parse.info[!gDNA.failed.idx, ]
+
+ gDNA.gDNA <- gDNA.filt[, c(
+ "ID",
+ "PROT2",
+ "PROT2_exp",
+ "candidate_mnv_variants",
+ "candidate_snv_variants",
+ "coordinates.gDNA"
+ )]
+ # parse the candidate variants
+
+ gDNA.gDNA.mnv <- gDNA.filt[!is.na(gDNA.filt$candidate_mnv_variants), c("ID", "PROT2", "PROT2_exp", "candidate_mnv_variants")]
+ gDNA.gDNA.snv <- gDNA.filt[!is.na(gDNA.filt$candidate_snv_variants), c("ID", "PROT2", "PROT2_exp", "candidate_snv_variants")]
+ gDNA.gDNA.orig <- gDNA.filt[!is.na(gDNA.filt$coordinates.gDNA), c("ID", "PROT2", "PROT2_exp", "coordinates.gDNA")]
+ colnames(gDNA.gDNA.mnv) <- colnames(gDNA.gDNA.snv) <- colnames(gDNA.gDNA.orig) <- c("ID", "PROT2", "PROT2_exp", "candidate_variants")
+ gDNA.gDNA <- rbind(gDNA.gDNA.snv, gDNA.gDNA.mnv, gDNA.gDNA.orig)
+
+ gDNA.gDNA <- gDNA.gDNA %>%
+ mutate(candidate_variants = strsplit(candidate_variants, ",")) %>%
+ unnest(candidate_variants) %>%
+ as.data.frame()
+
+ ## Try convert gDNA to VCF format with code ####
+ # from del
+ del.gDNA$gChr <- str_extract(del.gDNA$candidate_variants,
+ "^chr([:digit:]+)|^(chr[A-Z])")
+ del.gDNA$gChr <- gsub("chr", "", del.gDNA$gChr)
+ del.gDNA$gLoc <- str_extract(del.gDNA$candidate_variants, "g.([:digit:]+)")
+ del.gDNA$gLoc <- gsub("g.", "", del.gDNA$gLoc)
+ # del.gDNA$gRef <- str_extract(del.gDNA$candidate_variants, "g.(([:digit:]|\\_)+|[A-Z]+)")
+ del.gDNA$gRef_tmp <- str_extract(del.gDNA$candidate_variants, "(([A-z]|\\>)+)$")
+ del.gDNA$gRef <- str_extract(del.gDNA$gRef_tmp, "([A-Z]+)")
+ del.gDNA$gRef_tmp <- NULL
+ del.gDNA$gAlt <- str_extract(del.gDNA$candidate_variants, "([A-Z])+$")
+ del.gDNA$HGMD_exp_type <- "Del_to_Missense"
+
+ # from gDNA
+ gDNA.gDNA$gChr <- str_extract(gDNA.gDNA$candidate_variants,
+ "^chr([:digit:]+)|^(chr[A-Z])")
+ gDNA.gDNA$gChr <- gsub("chr", "", gDNA.gDNA$gChr)
+ gDNA.gDNA$gLoc <- str_extract(gDNA.gDNA$candidate_variants, "g.([:digit:]+)")
+ gDNA.gDNA$gLoc <- gsub("g.", "", gDNA.gDNA$gLoc)
+ # gDNA.gDNA$gRef <- str_extract(gDNA.gDNA$candidate_variants, "g.(([:digit:]|\\_)+|[A-Z]+)")
+ gDNA.gDNA$gRef_tmp <- str_extract(gDNA.gDNA$candidate_variants, "(([A-z]|\\>)+)$")
+ gDNA.gDNA$gRef <- str_extract(gDNA.gDNA$gRef_tmp, "([A-Z]+)")
+ gDNA.gDNA$gRef_tmp <- NULL
+ gDNA.gDNA$gAlt <- str_extract(gDNA.gDNA$candidate_variants, "([A-Z])+$")
+
+ # add the HGMD_exp_type
+  PROT2_Alt <- str_extract(gDNA.gDNA$PROT2, "([A-z])+$") # NOTE(review): [A-z] also matches [ \ ] ^ _ ` and does NOT match '?'/'*', so start-loss/stop-loss entries yield NA here -- verify no NA reaches the logical-subscript assignments below
+  PROT2_exp_Alt <- str_extract(gDNA.gDNA$PROT2_exp, "([A-z])+$") # same caveat as above; [A-Za-z] is likely what was intended
+ gDNA.gDNA$HGMD_exp_type[PROT2_Alt == PROT2_exp_Alt] <- "Same_pChange"
+ gDNA.gDNA$HGMD_exp_type[PROT2_Alt != PROT2_exp_Alt] <- "Different_pChange"
+ gDNA.gDNA$HGMD_exp_type[grepl("\\?", gDNA.gDNA$PROT2)] <- "Start_Loss"
+ gDNA.gDNA$HGMD_exp_type[grepl("\\*", gDNA.gDNA$PROT2_exp)] <- "Stop_Loss"
+
+ table(gDNA.gDNA$HGMD_exp_type)
+
+ # from NP_exp.gDNA
+ NP_exp.gDNA$gChr <- str_extract(NP_exp.gDNA$candidate_variants,
+ "^chr([:digit:]+)|^(chr[A-Z])")
+ NP_exp.gDNA$gChr <- gsub("chr", "", NP_exp.gDNA$gChr)
+ NP_exp.gDNA$gLoc <- str_extract(NP_exp.gDNA$candidate_variants, "g.([:digit:]+)")
+ NP_exp.gDNA$gLoc <- gsub("g.", "", NP_exp.gDNA$gLoc)
+ # NP_exp.gDNA$gRef <- str_extract(NP_exp.gDNA$candidate_variants, "g.(([:digit:]|\\_)+|[A-Z]+)")
+ NP_exp.gDNA$gRef_tmp <- str_extract(NP_exp.gDNA$candidate_variants, "(([A-z]|\\>)+)$")
+ NP_exp.gDNA$gRef <- str_extract(NP_exp.gDNA$gRef_tmp, "([A-Z]+)")
+ NP_exp.gDNA$gRef_tmp <- NULL
+ NP_exp.gDNA$gAlt <- str_extract(NP_exp.gDNA$candidate_variants, "([A-Z])+$")
+
+ # add the HGMD_exp_type
+ PROT2_Alt <- str_extract(NP_exp.gDNA$PROT2, "([A-z])+$")
+ PROT2_exp_Alt <- str_extract(NP_exp.gDNA$PROT2_exp, "([A-z])+$")
+ # check if the length of Alt, exp_Alt are the same
+ PROT2_Alt_nchar <- nchar(PROT2_Alt)
+ PROT2_exp_Alt_nchar <- nchar(PROT2_exp_Alt)
+ table(PROT2_Alt_nchar == PROT2_exp_Alt_nchar) # all T
+
+ NP_exp.gDNA$HGMD_exp_type[PROT2_Alt == PROT2_exp_Alt] <- "Same_pChange"
+ NP_exp.gDNA$HGMD_exp_type[PROT2_Alt != PROT2_exp_Alt] <- "Different_pChange"
+ NP_exp.gDNA$HGMD_exp_type[grepl("\\?", NP_exp.gDNA$PROT2)] <- "Start_Loss"
+ NP_exp.gDNA$HGMD_exp_type[grepl("\\*", NP_exp.gDNA$PROT2_exp)] <- "Stop_Loss"
+
+ table(is.na(NP_exp.gDNA$HGMD_exp_type))
+
+ ## add the RankScore ####
+ HGMD_exp.all <- rbind(del.gDNA, NP_exp.gDNA, gDNA.gDNA)
+
+ HGMD_exp.all <- merge(HGMD_exp.all,
+ hgmd,
+ by.x = "ID",
+ by.y = "ID",
+ all.x = T)
+ saveRDS(HGMD_exp.all,
+ file.path(args[2], "Expand_Result_final", "hgmd.Coding.rds"))
+ write.table(
+ HGMD_exp.all,
+ file.path(args[2], "Expand_Result_final", "hgmd.Coding.txt"),
+ col.names = T,
+ row.names = F,
+ quote = F,
+ sep = "\t"
+ )
+
+}
+
+toc()
+
+# Step 6 - nonCoding ----------------------------------------------------------------------------------------------------
+print("Running Step 6 --- nonCoding")
+tic("step 6")
+
+hgmd.orig <- readRDS(file.path(args[2], "hgmd_DM.rds"))
+hgmd.filt <- readRDS(file.path(args[2], "Expand_Result", "hgmd.filt.rds"))
+
+hgmd <- hgmd.orig
+hgmd$PROT2 <- gsub(".[0-9]+%3A", ":", hgmd$PROT)
+hgmd$PROT2 <- gsub("%3B", ";", hgmd$PROT2)
+hgmd$PROT2 <- gsub("%3D", "=", hgmd$PROT2)
+hgmd$PROT2 <- gsub("\\(|\\)", "", hgmd$PROT2)
+
+## pre-filter the HGMD entries ####
+
+# parse PROT column
+hgmd <- hgmd[, c("ID", "CHROM", "POS", "REF", "ALT", "GENE", "PROT", "PROT2")]
+
+## var to drop ####
+# frame shift (drop)
+hgmd$fs <- grepl("fs", hgmd$PROT2)
+
+# drop var with multiple protein change (no expansion for them) (there are 48)
+hgmd$multi_pChange <- grepl(";", hgmd$PROT2)
+
+# drop stop loss or stop gain (with * in the PROT)
+hgmd$StopLossGain <- grepl("\\*", hgmd$PROT2)
+
+# drop inframe insertions and delins (ins and delins), keep del
+hgmd$ins <- grepl("ins", hgmd$PROT2)
+hgmd$dup <- grepl("dup", hgmd$PROT2)
+
+# drop inframe deletions that affect more than 1 aa
+# del2 <- hgmd.filt[grepl("del", hgmd.filt$PROT2),]
+del <- hgmd[grepl("del", hgmd$PROT2), ]
+del <- separate(
+ data = del,
+ col = "PROT2",
+ into = c("NP_ID", "pChange"),
+ remove = F,
+ sep = ":"
+)
+del_multi_AA <- del[grepl("_", del$pChange), ] # dim 2319
+hgmd$del_multi_AA <- hgmd$ID %in% del_multi_AA$ID
+
+# drop ones that are NA in PROT
+hgmd$NA_PROT2 <- is.na(hgmd$PROT2)
+
+# synonymous var (will parse as non-coding)
+hgmd$synonymous <- grepl("=", hgmd$PROT2)
+
+# hgmd drop
+drop_for_pExp <- hgmd$fs + hgmd$multi_pChange + hgmd$StopLossGain + hgmd$ins + hgmd$dup + hgmd$del_multi_AA + hgmd$synonymous + hgmd$NA_PROT2
+table(drop_for_pExp)
+# same number as hgmd.filt, confirmed
+
+# for non-coding
+# first, not in hgmd.filt
+# then drop hgmd$fs + hgmd$multi_pChange + hgmd$StopLossGain + hgmd$ins + hgmd$dup + hgmd$del_multi_AA
+drop_for_noncoding <- hgmd$fs + hgmd$multi_pChange + hgmd$StopLossGain + hgmd$ins + hgmd$dup + hgmd$del_multi_AA + (hgmd$ID %in% hgmd.filt$ID)
+table(drop_for_noncoding) # 32165 remaining for non-coding
+
+hgmd.non.coding <- hgmd[drop_for_noncoding == 0, ]
+
+hgmd.non.coding$RefLength <- nchar(hgmd.non.coding$REF)
+hgmd.non.coding$AltLength <- nchar(hgmd.non.coding$ALT)
+hgmd.non.coding$delLength <- hgmd.non.coding$RefLength - hgmd.non.coding$AltLength
+
+hgmd.non.coding$ExpPos_start <- as.numeric(hgmd.non.coding$POS) - 2
+hgmd.non.coding$ExpPos_stop <- as.numeric(hgmd.non.coding$POS) + hgmd.non.coding$RefLength + 2
+hgmd.non.coding.part <- hgmd.non.coding[, c(
+ "ID",
+ "CHROM",
+ "POS",
+ "REF",
+ "ALT",
+ "GENE",
+ "PROT2",
+ "RefLength",
+ "AltLength",
+ "delLength",
+ "ExpPos_start",
+ "ExpPos_stop"
+)]
+
+Ref.length.df <- as.data.frame(table(hgmd.non.coding.part$RefLength[hgmd.non.coding.part$RefLength <= 10]))
+Ref.length.df$Var1 <- as.character(Ref.length.df$Var1)
+Ref.length.df <- rbind(Ref.length.df, c(">10", sum(hgmd.non.coding.part$RefLength > 10)))
+
+del.length.df <- as.data.frame(table(hgmd.non.coding.part$delLength[hgmd.non.coding.part$delLength <= 10 &
+ hgmd.non.coding.part$delLength >= 0]))
+del.length.df$Var1 <- as.character(del.length.df$Var1)
+del.length.df <- rbind(del.length.df, c(">10", sum(hgmd.non.coding.part$delLength > 10)))
+del.length.df <- rbind(del.length.df, c("<0", sum(hgmd.non.coding.part$delLength < 0)))
+
+hgmd.non.coding.select <- hgmd.non.coding.part[hgmd.non.coding.part$RefLength <= 10 &
+ hgmd.non.coding.part$delLength >= 0, ]
+
+hgmd.non.coding.all <- merge(
+ hgmd.non.coding.select,
+ hgmd.orig[, c("ID", "RANKSCORE")],
+ by.x = "ID",
+ by.y = "ID",
+ all.x = T
+)
+hgmd.non.coding.all$HGMD_Exp <- "nonCoding"
+hgmd.non.coding.final <- hgmd.non.coding.all[, c(
+ "CHROM",
+ "ExpPos_start",
+ "ExpPos_stop",
+ "RANKSCORE",
+ "HGMD_Exp",
+ "RefLength",
+ "delLength"
+)]
+
+saveRDS(
+ hgmd.non.coding.final,
+ file.path(args[2], "Expand_Result_final", "hgmd.nonCoding.rds")
+)
+write.table(
+ hgmd.non.coding.final,
+ file.path(args[2], "Expand_Result_final", "hgmd.nonCoding.txt"),
+ sep = "\t",
+ col.names = T,
+ quote = F,
+ row.names = F
+)
+
+saveRDS(hgmd.non.coding.all,
+ file.path(args[2], "Expand_Result", "hgmd.nonCoding.all.rds"))
+write.table(
+ hgmd.non.coding.all,
+ file.path(args[2], "Expand_Result", "hgmd.nonCoding.all.txt"),
+ sep = "\t",
+ col.names = T,
+ quote = F,
+ row.names = F
+)
+
+
+# Step 7 dropped
+# Step 8 - reformat4AI ----------------------------------------------------------------------------------------------------
+print("Running Step 8 (skipping step7) --- reformat4AI")
+tic("step 8")
+# reformat expansion
+Coding <- fread(file.path(args[2], "Expand_Result_final", "hgmd.Coding.txt"))
+nonCoding <- fread(file.path(args[2], "Expand_Result_final", "hgmd.nonCoding.txt"))
+
+Coding <- Coding[, c("ID",
+ "HGMD_exp_type",
+ "gAlt",
+ "gRef",
+ "RANKSCORE",
+ "CLASS",
+ "gLoc",
+ "gChr")]
+colnames(Coding) <- c("hgmdID",
+ "c_HGMD_Exp",
+ "alt",
+ "ref",
+ 'c_RANKSCORE',
+ 'CLASS',
+ 'new_pos',
+ "new_chr")
+
+Coding$new_chr[Coding$new_chr == "X"] <- "23"
+Coding$new_chr[Coding$new_chr == "Y"] <- "24"
+
+
+
+## One potential issue: this output keeps col.names = T (and also row.names = T) when writing, which could cause issues when reading this tsv file back into R; however, it did not affect results when testing with pd in Python
+write.table(
+ Coding,
+ file.path(args[2], "Expand_Result_final", "hgmd_c.tsv"),
+ sep = "\t",
+ col.names = T,
+ row.names = T,
+ quote = F
+)
+
+nonCoding <- nonCoding[, c(
+ "RANKSCORE",
+ "HGMD_Exp",
+ "RefLength",
+ "delLength",
+ "ExpPos_start",
+ "ExpPos_stop",
+ "CHROM"
+)]
+colnames(nonCoding) <- c(
+ "nc_RANKSCORE",
+ 'nc_HGMD_Exp',
+ 'hgmd_refLength',
+ 'hgmd_delLength',
+ "new_start",
+ 'new_stop',
+ "new_chr"
+)
+
+nonCoding$new_chr[nonCoding$new_chr == "X"] <- "23"
+nonCoding$new_chr[nonCoding$new_chr == "Y"] <- "24"
+
+## One potential issue: this output keeps col.names = T (and also row.names = T) when writing, which could cause issues when reading this tsv file back into R; however, it did not affect results when testing with pd in Python
+write.table(
+ nonCoding,
+ file.path(args[2], "Expand_Result_final", "hgmd_nc.tsv"),
+ sep = "\t",
+ col.names = T,
+ row.names = T,
+ quote = F
+)
+toc()
\ No newline at end of file
diff --git a/utils/hgmd_update/merge_expand/setup.sh b/utils/hgmd_update/merge_expand/setup.sh
new file mode 100644
index 0000000..467442c
--- /dev/null
+++ b/utils/hgmd_update/merge_expand/setup.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+#----------------------------- Setup -----------------------------
+
+HGMD_hg19_DIR=HGMD_Pro_2024.3_hg19.vcf.gz
+HGMD_hg38_DIR=HGMD_Pro_2024.3_hg38.vcf.gz
+
+OUT_hg19_DIR=output/hg19
+OUT_hg38_DIR=output/hg38
+
+REF_hg19_DIR=references/hg19
+REF_hg38_DIR=references/hg38
+REF_aa=references/AA_abbreviations.txt
+
+RSCRIPT_PATH=expandvar.R
+
+# download reference genomes
+wget -P $REF_hg19_DIR https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz
+wget -P $REF_hg38_DIR https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
+
+# decompress reference genomes
+gunzip $REF_hg19_DIR/*.gz
+
+# download AA abbreviations
+# just Amino Acids 3 letters and 1 letter names, there is a copy at skyriver: /home/zhijiany/workdir/references/AA_abbreviations.txt
+
+# setup conda environment
+conda create -n hgmd r-base python=3.11 samtools -y
+conda activate hgmd
+
+# index reference genomes
+conda run -n hgmd samtools faidx $REF_hg19_DIR/hg19.fa
+conda run -n hgmd samtools faidx $REF_hg38_DIR/hg38.fa
+
+# install transvar
+conda run -n hgmd pip install transvar
+
+# setup transvar annotations and reference genomes
+#hg19
+transvar config --download_anno --refversion hg19
+transvar config -k reference -v $REF_hg19_DIR/hg19.fa --refversion hg19
+
+#hg38
+transvar config --download_anno --refversion hg38
+transvar config -k reference -v $REF_hg38_DIR/hg38.fa --refversion hg38
+
+
+
+#----------------------------- Run the R script -----------------------------
+
+#Run the Rscript for hg19
+conda run -n hgmd Rscript $RSCRIPT_PATH $HGMD_hg19_DIR $OUT_hg19_DIR $REF_aa hg19
+
+#Run the R script for hg38
+conda run -n hgmd Rscript $RSCRIPT_PATH $HGMD_hg38_DIR $OUT_hg38_DIR $REF_aa hg38
+