diff --git a/README.md b/README.md index 4643bf8..bfd6fad 100644 --- a/README.md +++ b/README.md @@ -319,7 +319,7 @@ Choose one: 1. `/path/to/ViWrap/yamls/ViWrap.yml` indicates the ViWrap.yml file address. This file was placed within the yamls folder of ViWrap directory. 2. `/path/to/ViWrap_conda_environments` indicates the directory that you will need to use to store all conda environments for ViWrap. - 3. For the first step, one can also use one-line conda environment setting up string as: `conda create -c bioconda -c conda-forge -p /path/to/ViWrap_conda_environments/ViWrap python=3.8 biopython=1.80 mamba=1.3.0 numpy=1.24.2 pandas=1.5.3 pyfastx=0.8.4 matplotlib=3.6.3 seaborn=0.12.2 diamond=2.0.15 hmmer=3.3.2 pyparsing=2.4.7` + 3. For the first step, you can instead create the environment with a single conda command: `conda create -c bioconda -c conda-forge -p /path/to/ViWrap_conda_environments/ViWrap python=3.8 biopython=1.80 mamba=1.5.1 numpy=1.24.2 pandas=1.5.3 pyfastx=0.8.4 matplotlib=3.6.3 seaborn=0.12.2 diamond=2.0.15 hmmer=3.3.2 pyparsing=2.4.7` 2. 
Install in normal conda folder diff --git a/scripts/master_downloader.py b/scripts/master_downloader.py old mode 100644 new mode 100755 index e7739e9..9b9b367 --- a/scripts/master_downloader.py +++ b/scripts/master_downloader.py @@ -55,7 +55,8 @@ def main(args): sys.exit(f"Could not find conda env dirs within {args['conda_env_dir']}") if os.path.exists(args['db_dir']): - sys.exit(f"The db dir of {args['db_dir']} has also ready been set up") + #sys.exit(f"The db dir of {args['db_dir']} has also ready been set up") + logger.info(f"The db dir of {args['db_dir']} has already been set up") else: os.mkdir(args['db_dir']) @@ -66,123 +67,160 @@ def main(args): # Step 2 Make VIBRANT db vibrant_db_dir_absolute_path = os.path.abspath(args['VIBRANT_db']) - os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT')} bash {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT/bin/download-db.sh')} {vibrant_db_dir_absolute_path}") - + if os.path.exists(f"{os.path.join(args['db_dir'], 'VIBRANT_db')}"): + logger.info(f"{time_current} | Found VIBRANT_db. Skip download.") + else: + os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT')} bash {os.path.join(args['conda_env_dir'], 'ViWrap-VIBRANT/bin/download-db.sh')} {vibrant_db_dir_absolute_path}") + time_current = f"[{str(datetime.now().replace(microsecond=0))}]" logger.info(f"{time_current} | VIBRANT db has been set up") # Step 3 Make geNomad db - os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-geNomad')} genomad download-database {args['db_dir']}") + if os.path.exists(f"{os.path.join(args['db_dir'], 'genomad_db')}"): + logger.info(f"{time_current} | Found genomad_db. 
Skip download.") + else: + os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-geNomad')} genomad download-database {args['db_dir']}") time_current = f"[{str(datetime.now().replace(microsecond=0))}]" logger.info(f"{time_current} | geNomad db has been set up") # Step 4 Make Tax classification db - os.mkdir(args['Tax_classification_db']) + if os.path.exists(f"{os.path.join(args['db_dir'], args['Tax_classification_db'])}"): + logger.info(f"{time_current} | Found {args['Tax_classification_db']}. Tax_classification_db db seems already set up.") + else: + os.mkdir(args['Tax_classification_db']) - ############################################### - # Part I Download NCBI RefSeq viral protein db# - ############################################### + ############################################### + # Part I Download NCBI RefSeq viral protein db# + ############################################### - ## Step 4.1 Download NCBI RefSeq viral protein and protein gpff - scripts.downloadDB.dl_refseq_viral_protein(args['Tax_classification_db']) - scripts.downloadDB.dl_refseq_viral_protein_gpff(args['Tax_classification_db']) + ## Step 4.1 Download NCBI RefSeq viral protein and protein gpff + scripts.downloadDB.dl_refseq_viral_protein(args['Tax_classification_db']) + scripts.downloadDB.dl_refseq_viral_protein_gpff(args['Tax_classification_db']) - ## Step 4.2 Parse to get protein to NCBI taxonomy info - scripts.downloadDB.parse_gpff(args['Tax_classification_db']) + ## Step 4.2 Parse to get protein to NCBI taxonomy info + scripts.downloadDB.parse_gpff(args['Tax_classification_db']) - ## Step 4.3 Grep NCBI RefSeq viral proteins with taxonomy info - scripts.downloadDB.grep_NCBI_RefSeq_viral_proteins_w_tax(args['Tax_classification_db']) + ## Step 4.3 Grep NCBI RefSeq viral proteins with taxonomy info + scripts.downloadDB.grep_NCBI_RefSeq_viral_proteins_w_tax(args['Tax_classification_db']) - ## Step 4.4 Reformat NCBI tax to ICTV 8-rank tax - ictv_tax_info = 
os.path.join(args['root_dir'], 'database/ICTV_Master_Species_List.txt') - pro2ictv_8_rank_tax = os.path.join(args['Tax_classification_db'], 'pro2ictv_8_rank_tax.txt') - scripts.downloadDB.reformat_NCBI_tax_to_ICTV_8_rank_tax(args['Tax_classification_db'], ictv_tax_info, pro2ictv_8_rank_tax) + ## Step 4.4 Reformat NCBI tax to ICTV 8-rank tax + ictv_tax_info = os.path.join(args['root_dir'], 'database/ICTV_Master_Species_List.txt') + pro2ictv_8_rank_tax = os.path.join(args['Tax_classification_db'], 'pro2ictv_8_rank_tax.txt') + scripts.downloadDB.reformat_NCBI_tax_to_ICTV_8_rank_tax(args['Tax_classification_db'], ictv_tax_info, pro2ictv_8_rank_tax) - ## Step 4.5 Make diamond blastp db - scripts.downloadDB.make_diamond_db(args['Tax_classification_db']) + ## Step 4.5 Make diamond blastp db + scripts.downloadDB.make_diamond_db(args['Tax_classification_db']) - ## Step 4.6 Remove useless files - scripts.downloadDB.remove(args['Tax_classification_db']) - - ########################## - # Part II Download VOG db# - ########################## - ## Step 4.7 Parse to get VOG marker list - vog_marker_table = os.path.join(args['root_dir'], 'database/VOG_marker_table.txt') - os.system(f"cp {os.path.join(args['root_dir'], 'database/VOG_marker_table.txt')} {os.path.join(args['Tax_classification_db'], 'VOG_marker_table.txt')}") - vog_marker_list = scripts.downloadDB.get_vog_marker_table(vog_marker_table) - - ## Step 4.8 Download the latest VOG db and pick VOG markers - scripts.downloadDB.get_marker_vog_hmm(vog_marker_list, args['Tax_classification_db']) - - ############################# - # Part III Download IMGVR db# - ############################# - ## Step 4.9 cp and degzip IMGVR db - os.system(f"cat {os.path.join(args['root_dir'], 'database/IMGVR_high-quality_phage_vOTU_representatives.tar.gz*')} > {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}") - os.system(f"tar xzf {os.path.join(args['Tax_classification_db'], 
'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')} --directory {args['Tax_classification_db']}") - os.system(f"mv {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives/*')} {args['Tax_classification_db']}") - os.system(f"rm {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}") - os.system(f"rm -r {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives')}") - - time_current = f"[{str(datetime.now().replace(microsecond=0))}]" - logger.info(f"{time_current} | Tax classification db has been set up") + ## Step 4.6 Remove useless files + scripts.downloadDB.remove(args['Tax_classification_db']) + + ########################## + # Part II Download VOG db# + ########################## + ## Step 4.7 Parse to get VOG marker list + vog_marker_table = os.path.join(args['root_dir'], 'database/VOG_marker_table.txt') + os.system(f"cp {os.path.join(args['root_dir'], 'database/VOG_marker_table.txt')} {os.path.join(args['Tax_classification_db'], 'VOG_marker_table.txt')}") + vog_marker_list = scripts.downloadDB.get_vog_marker_table(vog_marker_table) + + ## Step 4.8 Download the latest VOG db and pick VOG markers + scripts.downloadDB.get_marker_vog_hmm(vog_marker_list, args['Tax_classification_db']) + + ############################# + # Part III Download IMGVR db# + ############################# + ## Step 4.9 cp and degzip IMGVR db + os.system(f"cat {os.path.join(args['root_dir'], 'database/IMGVR_high-quality_phage_vOTU_representatives.tar.gz*')} > {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}") + os.system(f"tar xzf {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')} --directory {args['Tax_classification_db']}") + os.system(f"mv {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives/*')} 
{args['Tax_classification_db']}") + os.system(f"rm {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives.tar.gz')}") + os.system(f"rm -r {os.path.join(args['Tax_classification_db'], 'IMGVR_high-quality_phage_vOTU_representatives')}") + + time_current = f"[{str(datetime.now().replace(microsecond=0))}]" + logger.info(f"{time_current} | Tax classification db has been set up") # Step 5 Make CheckV db - os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} checkv download_database {args['db_dir']} >/dev/null 2>&1") - os.system(f"mv {os.path.join(args['db_dir'], 'checkv-db-v*')} {args['CheckV_db']}") - os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} export CHECKVDB={args['CheckV_db']}") - - time_current = f"[{str(datetime.now().replace(microsecond=0))}]" - logger.info(f"{time_current} | CheckV db has been set up") + if os.path.exists(f"{os.path.join(args['db_dir'], args['CheckV_db'])}"): + logger.info(f"{time_current} | Found CheckV_db. CheckV db seems already set up.") + else: + if os.path.exists(f"{os.path.join(args['db_dir'], 'checkv-db-v*')}"): + logger.info(f"{time_current} | Found checkv-db-v*. 
Skip download.") + else: + os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} checkv download_database {args['db_dir']} >/dev/null 2>&1") + + os.system(f"mv {os.path.join(args['db_dir'], 'checkv-db-v*')} {args['CheckV_db']}") + os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-CheckV')} export CHECKVDB={args['CheckV_db']}") + time_current = f"[{str(datetime.now().replace(microsecond=0))}]" + logger.info(f"{time_current} | CheckV db has been set up") # Step 6 Make iPHoP db - os.system(f"wget -c https://portal.nersc.gov/cfs/m342/iphop/db/iPHoP.latest_rw.tar.gz --no-check-certificate -O {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}") - os.mkdir(args['iPHoP_db']) - os.system(f"tar xzf {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')} --directory {args['iPHoP_db']}") - os.system(f"mv {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/*')} {args['iPHoP_db']}") - os.system(f"rm {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}") - os.system(f"rmdir {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/')}") - - time_current = f"[{str(datetime.now().replace(microsecond=0))}]" - logger.info(f"{time_current} | iPHoP db has been set up") - + if os.path.exists(f"{os.path.join(args['db_dir'], args['iPHoP_db'])}"): + logger.info(f"{time_current} | Found iPHoP_db. iPHoP db seems already set up.") + else: + if os.path.exists(f"{os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}"): + logger.info(f"{time_current} | Found iPHoP.latest_rw.tar.gz. 
Skip download.") + else: + os.system(f"wget -c https://portal.nersc.gov/cfs/m342/iphop/db/iPHoP.latest_rw.tar.gz --no-check-certificate -O {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}") + + os.mkdir(args['iPHoP_db']) + os.system(f"tar xzf {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')} --directory {args['iPHoP_db']}") + os.system(f"mv {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/*')} {args['iPHoP_db']}") + os.system(f"rm {os.path.join(args['db_dir'], 'iPHoP.latest_rw.tar.gz')}") + os.system(f"rmdir {os.path.join(args['db_dir'], 'iPHoP_db/*_pub_rw/')}") + + time_current = f"[{str(datetime.now().replace(microsecond=0))}]" + logger.info(f"{time_current} | iPHoP db has been set up") + # Step 7 Make GTDB-Tk db release 214 - os.system(f"wget -c ftp://download.nmdc.cn/tools/meta/gtdb/gtdbtk_r214_data.tar.gz -O {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --no-check-certificate") - os.mkdir(args['GTDB_db']) - os.system(f"tar xzf {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --directory {args['GTDB_db']}") - os.system(f"mv {os.path.join(args['GTDB_db'], 'release214')} {os.path.join(args['GTDB_db'], 'GTDB_db')}") - os.system(f"rm {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')}") - os.system(f"conda env config vars set GTDBTK_DATA_PATH={os.path.join(args['GTDB_db'], 'GTDB_db')} -p {os.path.join(args['conda_env_dir'], 'ViWrap-GTDBTk')}") - - time_current = f"[{str(datetime.now().replace(microsecond=0))}]" - logger.info(f"{time_current} | GTDB-Tk db has been set up") + if os.path.exists(f"{os.path.join(args['db_dir'], args['GTDB_db'])}"): + logger.info(f"{time_current} | Found {args['GTDB_db']}. GTDB_db db seems already set up.") + else: + if os.path.exists(f"{os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')}"): + logger.info(f"{time_current} | Found gtdbtk_r214_data.tar.gz. 
Skip download.") + else: + os.system(f"wget -c ftp://download.nmdc.cn/tools/meta/gtdb/gtdbtk_r214_data.tar.gz -O {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --no-check-certificate") + os.mkdir(args['GTDB_db']) + os.system(f"tar xzf {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')} --directory {args['GTDB_db']}") + os.system(f"mv {os.path.join(args['GTDB_db'], 'release214')} {os.path.join(args['GTDB_db'], 'GTDB_db')}") + os.system(f"rm {os.path.join(args['db_dir'], 'gtdbtk_r214_data.tar.gz')}") + os.system(f"conda env config vars set GTDBTK_DATA_PATH={os.path.join(args['GTDB_db'], 'GTDB_db')} -p {os.path.join(args['conda_env_dir'], 'ViWrap-GTDBTk')}") + + time_current = f"[{str(datetime.now().replace(microsecond=0))}]" + logger.info(f"{time_current} | GTDB-Tk db has been set up") # Step 8 Download VirSorter2 db - os.system(f"wget -c https://osf.io/v46sc/download -O {os.path.join(args['db_dir'], 'db.tgz')} --no-check-certificate") - os.system(f"tar -xzf {os.path.join(args['db_dir'], 'db.tgz')} -C {args['db_dir']}") - os.system(f"mv {os.path.join(args['db_dir'], 'db')} {args['VirSorter2_db']}") - os.system(f"rm {os.path.join(args['db_dir'], 'db.tgz')}") - os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-vs2')} virsorter config --init-source --db-dir={args['VirSorter2_db']} >/dev/null 2>&1") - - time_current = f"[{str(datetime.now().replace(microsecond=0))}]" - logger.info(f"{time_current} | VirSorter2 db has been set up") + if os.path.exists(f"{os.path.join(args['db_dir'], args['VirSorter2_db'])}"): + logger.info(f"{time_current} | Found {args['VirSorter2_db']}. VirSorter2_db db seems already set up.") + else: + if os.path.exists(f"{os.path.join(args['db_dir'], 'db.tgz')}"): + logger.info(f"{time_current} | Found db.tgz. 
Skip download.") + else: + os.system(f"wget -c https://osf.io/v46sc/download -O {os.path.join(args['db_dir'], 'db.tgz')} --no-check-certificate") + os.system(f"tar -xzf {os.path.join(args['db_dir'], 'db.tgz')} -C {args['db_dir']}") + os.system(f"mv {os.path.join(args['db_dir'], 'db')} {args['VirSorter2_db']}") + os.system(f"rm {os.path.join(args['db_dir'], 'db.tgz')}") + os.system(f"conda run -p {os.path.join(args['conda_env_dir'], 'ViWrap-vs2')} virsorter config --init-source --db-dir={args['VirSorter2_db']} >/dev/null 2>&1") + + time_current = f"[{str(datetime.now().replace(microsecond=0))}]" + logger.info(f"{time_current} | VirSorter2 db has been set up") # Step 9 Download DVF db - os.system(f"git clone https://github.com/jessieren/DeepVirFinder.git {os.path.join(args['db_dir'], 'DVF_db_tmp')}") - os.system(f"mv {os.path.join(args['db_dir'], 'DVF_db_tmp/models')} {args['DVF_db']}") - os.system(f"rm -rf {os.path.join(args['db_dir'], 'DVF_db_tmp')}") - - time_current = f"[{str(datetime.now().replace(microsecond=0))}]" - logger.info(f"{time_current} | DVF db has been set up") + if os.path.exists(f"{os.path.join(args['db_dir'], args['DVF_db'])}"): + logger.info(f"{time_current} | Found {args['DVF_db']}. 
DVF_db db seems already set up.") + else: + os.system(f"git clone https://github.com/jessieren/DeepVirFinder.git {os.path.join(args['db_dir'], 'DVF_db_tmp')}") + os.system(f"mv {os.path.join(args['db_dir'], 'DVF_db_tmp/models')} {args['DVF_db']}") + os.system(f"rm -rf {os.path.join(args['db_dir'], 'DVF_db_tmp')}") + time_current = f"[{str(datetime.now().replace(microsecond=0))}]" + logger.info(f"{time_current} | DVF db has been set up") + - end_time = datetime.now().replace(microsecond=0) - duration = end_time - start_time - logger.info(f"The total running time is {duration} (in \"hr:min:sec\" format)") - \ No newline at end of file + end_time = datetime.now().replace(microsecond=0) + duration = end_time - start_time + logger.info(f"The total running time is {duration} (in \"hr:min:sec\" format)") +