Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ Choose one:
1. `/path/to/ViWrap/yamls/ViWrap.yml` is the path to the ViWrap.yml file, which is located in the `yamls` folder of the ViWrap directory.

2. `/path/to/ViWrap_conda_environments` is the directory in which all conda environments for ViWrap will be stored.
3. For the first step, you can alternatively set up the conda environment with a single command: `conda create -c bioconda -c conda-forge -p /path/to/ViWrap_conda_environments/ViWrap python=3.8 biopython=1.80 mamba=1.3.0 numpy=1.24.2 pandas=1.5.3 pyfastx=0.8.4 matplotlib=3.6.3 seaborn=0.12.2 diamond=2.0.15 hmmer=3.3.2 pyparsing=2.4.7`
3. For the first step, you can alternatively set up the conda environment with a single command: `conda create -c bioconda -c conda-forge -p /path/to/ViWrap_conda_environments/ViWrap python=3.8 biopython=1.80 mamba=1.5.1 numpy=1.24.2 pandas=1.5.3 pyfastx=0.8.4 matplotlib=3.6.3 seaborn=0.12.2 diamond=2.0.15 hmmer=3.3.2 pyparsing=2.4.7`

2. Install in normal conda folder

Expand Down
218 changes: 128 additions & 90 deletions scripts/master_downloader.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def main(args):
sys.exit(f"Could not find conda env dirs within {args['conda_env_dir']}")

if os.path.exists(args['db_dir']):
sys.exit(f"The db dir of {args['db_dir']} has also ready been set up")
#sys.exit(f"The db dir of {args['db_dir']} has also ready been set up")
logger.info(f"The db dir of {args['db_dir']} has also ready been set up")
else:
os.mkdir(args['db_dir'])

Expand All @@ -66,123 +67,160 @@ def main(args):

# Step 2 Make VIBRANT db
vibrant_db_dir_absolute_path = os.path.abspath(args['VIBRANT_db'])
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT')} bash {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT/bin/download-db.sh')} {vibrant_db_dir_absolute_path}")

if os.path.exists(f"{os.path.join(args['db_dir'], 'VIBRANT_db')}"):
logger.info(f"{time_current} | Found VIBRANT_db. Skip download.")
else:
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT')} bash {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT/bin/download-db.sh')} {vibrant_db_dir_absolute_path}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | VIBRANT db has been set up")

# Step 3 Make geNomad db
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-geNomad')} genomad download-database {args['db_dir']}")
if os.path.exists(f"{os.path.join(args['db_dir'], 'genomad_db')}"):
logger.info(f"{time_current} | Found genomad_db. Skip download.")
else:
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-geNomad')} genomad download-database {args['db_dir']}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | geNomad db has been set up")

# Step 4 Make Tax classification db
os.mkdir(args['Tax_classification_db'])
if os.path.exists(f"{os.path.join(args['db_dir'], args['Tax_classification_db'])}"):
logger.info(f"{time_current} | Found {args['Tax_classification_db']}. Tax_classification_db db seems already set up.")
else:
os.mkdir(args['Tax_classification_db'])

###############################################
# Part I Download NCBI RefSeq viral protein db#
###############################################
###############################################
# Part I Download NCBI RefSeq viral protein db#
###############################################

## Step 4.1 Download NCBI RefSeq viral protein and protein gpff
scripts.downloadDB.dl_refseq_viral_protein(args['Tax_classification_db'])
scripts.downloadDB.dl_refseq_viral_protein_gpff(args['Tax_classification_db'])
## Step 4.1 Download NCBI RefSeq viral protein and protein gpff
scripts.downloadDB.dl_refseq_viral_protein(args['Tax_classification_db'])
scripts.downloadDB.dl_refseq_viral_protein_gpff(args['Tax_classification_db'])

## Step 4.2 Parse to get protein to NCBI taxonomy info
scripts.downloadDB.parse_gpff(args['Tax_classification_db'])
## Step 4.2 Parse to get protein to NCBI taxonomy info
scripts.downloadDB.parse_gpff(args['Tax_classification_db'])

## Step 4.3 Grep NCBI RefSeq viral proteins with taxonomy info
scripts.downloadDB.grep_NCBI_RefSeq_viral_proteins_w_tax(args['Tax_classification_db'])
## Step 4.3 Grep NCBI RefSeq viral proteins with taxonomy info
scripts.downloadDB.grep_NCBI_RefSeq_viral_proteins_w_tax(args['Tax_classification_db'])

## Step 4.4 Reformat NCBI tax to ICTV 8-rank tax
ictv_tax_info = os.path.join(args['root_dir'], 'database/ICTV_Master_Species_List.txt')
pro2ictv_8_rank_tax = os.path.join(args['Tax_classification_db'], 'pro2ictv_8_rank_tax.txt')
scripts.downloadDB.reformat_NCBI_tax_to_ICTV_8_rank_tax(args['Tax_classification_db'], ictv_tax_info, pro2ictv_8_rank_tax)
## Step 4.4 Reformat NCBI tax to ICTV 8-rank tax
ictv_tax_info = os.path.join(args['root_dir'], 'database/ICTV_Master_Species_List.txt')
pro2ictv_8_rank_tax = os.path.join(args['Tax_classification_db'], 'pro2ictv_8_rank_tax.txt')
scripts.downloadDB.reformat_NCBI_tax_to_ICTV_8_rank_tax(args['Tax_classification_db'], ictv_tax_info, pro2ictv_8_rank_tax)

## Step 4.5 Make diamond blastp db
scripts.downloadDB.make_diamond_db(args['Tax_classification_db'])
## Step 4.5 Make diamond blastp db
scripts.downloadDB.make_diamond_db(args['Tax_classification_db'])

## Step 4.6 Remove useless files
scripts.downloadDB.remove(args['Tax_classification_db'])

##########################
# Part II Download VOG db#
##########################
## Step 4.7 Parse to get VOG marker list
vog_marker_table = os.path.join(args['root_dir'], 'database/VOG_marker_table.txt')
os.system(f"cp {os.path.join(args['root_dir'], 'database/VOG_marker_table.txt')} {os.path.join(args['Tax_classification_db'], 'VOG_marker_table.txt')}")
vog_marker_list = scripts.downloadDB.get_vog_marker_table(vog_marker_table)

## Step 4.8 Download the latest VOG db and pick VOG markers
scripts.downloadDB.get_marker_vog_hmm(vog_marker_list, args['Tax_classification_db'])

#############################
# Part III Download IMGVR db#
#############################
## Step 4.9 cp and degzip IMGVR db
os.system(f"cat {os.path.join(args['root_dir'], 'database/IMGVR_high-quality_phage_vOTU_representatives.tar.gz*')} > {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}")
os.system(f"tar xzf {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')} --directory {args['Tax_classification_db']}")
os.system(f"mv {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives/*')} {args['Tax_classification_db']}")
os.system(f"rm {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}")
os.system(f"rm -r {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives')}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | Tax classification db has been set up")
## Step 4.6 Remove useless files
scripts.downloadDB.remove(args['Tax_classification_db'])
##########################
# Part II Download VOG db#
##########################
## Step 4.7 Parse to get VOG marker list
vog_marker_table = os.path.join(args['root_dir'], 'database/VOG_marker_table.txt')
os.system(f"cp {os.path.join(args['root_dir'], 'database/VOG_marker_table.txt')} {os.path.join(args['Tax_classification_db'], 'VOG_marker_table.txt')}")
vog_marker_list = scripts.downloadDB.get_vog_marker_table(vog_marker_table)

## Step 4.8 Download the latest VOG db and pick VOG markers
scripts.downloadDB.get_marker_vog_hmm(vog_marker_list, args['Tax_classification_db'])
#############################
# Part III Download IMGVR db#
#############################
## Step 4.9 cp and degzip IMGVR db
os.system(f"cat {os.path.join(args['root_dir'], 'database/IMGVR_high-quality_phage_vOTU_representatives.tar.gz*')} > {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}")
os.system(f"tar xzf {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')} --directory {args['Tax_classification_db']}")
os.system(f"mv {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives/*')} {args['Tax_classification_db']}")
os.system(f"rm {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}")
os.system(f"rm -r {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives')}")
time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | Tax classification db has been set up")


# Step 5 Make CheckV db
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} checkv download_database {args['db_dir']} >/dev/null 2>&1")
os.system(f"mv {os.path.join(args['db_dir'], 'checkv-db-v*')} {args['CheckV_db']}")
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} export CHECKVDB={args['CheckV_db']}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | CheckV db has been set up")
if os.path.exists(f"{os.path.join(args['db_dir'], args['CheckV_db'])}"):
logger.info(f"{time_current} | Found CheckV_db. CheckV db seems already set up.")
else:
if os.path.exists(f"{os.path.join(args['db_dir'], 'checkv-db-v*')}"):
logger.info(f"{time_current} | Found checkv-db-v*. Skip download.")
else:
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} checkv download_database {args['db_dir']} >/dev/null 2>&1")

os.system(f"mv {os.path.join(args['db_dir'], 'checkv-db-v*')} args['CheckV_db']")
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} export CHECKVDB={args['CheckV_db']}")
time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | CheckV db has been set up")


# Step 6 Make iPHoP db
os.system(f"wget -c https://portal.nersc.gov/cfs/m342/iphop/db/iPHoP.latest_rw.tar.gz --no-check-certificate -O {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}")
os.mkdir(args['iPHoP_db'])
os.system(f"tar xzf {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')} --directory {args['iPHoP_db']}")
os.system(f"mv {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/*')} {args['iPHoP_db']}")
os.system(f"rm {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}")
os.system(f"rmdir {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/')}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | iPHoP db has been set up")

if os.path.exists(f"{os.path.join(args['db_dir'], args['iPHoP_db'])}"):
logger.info(f"{time_current} | Found CheckV_db. CheckV db seems already set up.")
else:
if os.path.exists(f"{os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}"):
logger.info(f"{time_current} | Found iPHoP.latest_rw.tar.gz. Skip download.")
else:
os.system(f"wget -c https://portal.nersc.gov/cfs/m342/iphop/db/iPHoP.latest_rw.tar.gz --no-check-certificate -O {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}")

os.mkdir(args['iPHoP_db'])
os.system(f"tar xzf {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')} --directory {args['iPHoP_db']}")
os.system(f"mv {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/*')} {args['iPHoP_db']}")
os.system(f"rm {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}")
os.system(f"rmdir {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/')}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | iPHoP db has been set up")


# Step 7 Make GTDB-Tk db release 214
os.system(f"wget -c ftp://download.nmdc.cn/tools/meta/gtdb/gtdbtk_r214_data.tar.gz -O {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --no-check-certificate")
os.mkdir(args['GTDB_db'])
os.system(f"tar xzf {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --directory {args['GTDB_db']}")
os.system(f"mv {os.path.join(args['GTDB_db'], 'release214')} {os.path.join(args['GTDB_db'], 'GTDB_db')}")
os.system(f"rm {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')}")
os.system(f"conda env config vars set GTDBTK_DATA_PATH={os.path.join(args['GTDB_db'], 'GTDB_db')} -p {os.path.join(args['conda_env_dir'], 'ViWrap-GTDBTk')}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | GTDB-Tk db has been set up")
if os.path.exists(f"{os.path.join(args['db_dir'], args['GTDB_db'])}"):
logger.info(f"{time_current} | Found {args['GTDB_db']}. GTDB_db db seems already set up.")
else:
if os.path.exists(f"{os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')}"):
logger.info(f"{time_current} | Found gtdbtk_r214_data.tar.gz. Skip download.")
else:
os.system(f"wget -c ftp://download.nmdc.cn/tools/meta/gtdb/gtdbtk_r214_data.tar.gz -O {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --no-check-certificate")
os.mkdir(args['GTDB_db'])
os.system(f"tar xzf {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --directory {args['GTDB_db']}")
os.system(f"mv {os.path.join(args['GTDB_db'], 'release214')} {os.path.join(args['GTDB_db'], 'GTDB_db')}")
os.system(f"rm {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')}")
os.system(f"conda env config vars set GTDBTK_DATA_PATH={os.path.join(args['GTDB_db'], 'GTDB_db')} -p {os.path.join(args['conda_env_dir'], 'ViWrap-GTDBTk')}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | GTDB-Tk db has been set up")


# Step 8 Download VirSorter2 db
os.system(f"wget -c https://osf.io/v46sc/download -O {os.path.join(args['db_dir'], 'db.tgz')} --no-check-certificate")
os.system(f"tar -xzf {os.path.join(args['db_dir'], 'db.tgz')} -C {args['db_dir']}")
os.system(f"mv {os.path.join(args['db_dir'], 'db')} {args['VirSorter2_db']}")
os.system(f"rm {os.path.join(args['db_dir'], 'db.tgz')}")
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-vs2')} virsorter config --init-source --db-dir={args['VirSorter2_db']} >/dev/null 2>&1")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | VirSorter2 db has been set up")
if os.path.exists(f"{os.path.join(args['db_dir'], args['VirSorter2_db'])}"):
logger.info(f"{time_current} | Found {args['VirSorter2_db']}. VirSorter2_db db seems already set up.")
else:
if os.path.exists(f"{os.path.join(args['db_dir'], 'db.tgz')}"):
logger.info(f"{time_current} | Found db.tgz. Skip download.")
else:
os.system(f"wget -c https://osf.io/v46sc/download -O {os.path.join(args['db_dir'], 'db.tgz')} --no-check-certificate")
os.system(f"tar -xzf {os.path.join(args['db_dir'], 'db.tgz')} -C {args['db_dir']}")
os.system(f"mv {os.path.join(args['db_dir'], 'db')} {args['VirSorter2_db']}")
os.system(f"rm {os.path.join(args['db_dir'], 'db.tgz')}")
os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-vs2')} virsorter config --init-source --db-dir={args['VirSorter2_db']} >/dev/null 2>&1")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | VirSorter2 db has been set up")

# Step 9 Download DVF db
os.system(f"git clone https://github.com/jessieren/DeepVirFinder.git {os.path.join(args['db_dir'], 'DVF_db_tmp')}")
os.system(f"mv {os.path.join(args['db_dir'], 'DVF_db_tmp/models')} {args['DVF_db']}")
os.system(f"rm -rf {os.path.join(args['db_dir'], 'DVF_db_tmp')}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | DVF db has been set up")
if os.path.exists(f"{os.path.join(args['db_dir'], args['DVF_db'])}"):
logger.info(f"{time_current} | Found {args['DVF_db']}. DVF_db db seems already set up.")
else:
os.system(f"git clone https://github.com/jessieren/DeepVirFinder.git {os.path.join(args['db_dir'], 'DVF_db_tmp')}")
os.system(f"mv {os.path.join(args['db_dir'], 'DVF_db_tmp/models')} {args['DVF_db']}")
os.system(f"rm -rf {os.path.join(args['db_dir'], 'DVF_db_tmp')}")

time_current = f"[{str(datetime.now().replace(microsecond=0))}]"
logger.info(f"{time_current} | DVF db has been set up")


end_time = datetime.now().replace(microsecond=0)
duration = end_time - start_time
logger.info(f"The total running time is {duration} (in \"hr:min:sec\" format)")

end_time = datetime.now().replace(microsecond=0)
duration = end_time - start_time
logger.info(f"The total running time is {duration} (in \"hr:min:sec\" format)")