ViPhySlim/preprocessing.py at main · ErillLab/ViPhySlim · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
from Bio import Entrez, SeqIO
import yaml
import shutil
from memory_profiler import profile
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbimakeblastdbCommandline
import time

'''
    Load configuration from YAML and create
    working directories
'''
yaml_file = "configuration.yml"
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

with open(yaml_file, "r") as file:
    document = yaml.safe_load(file)
    email = document["entrez_parameters"]["user_email"]
    genomes_path = os.path.join(BASE_DIR, document["input_parameters"]["genomes_path"])
    working_path = os.path.join(BASE_DIR, document["operational_parameters"]["working_path"])
    output_file_prefix = document["output_parameters"]["output_file_name"]
    erase_working_folder = document["operational_parameters"]["erase_working_folder"]
    accessions = document["input_parameters"]["accession_numbers"]
    execution_type = document["input_parameters"]["execution_type"]

    distance_formula = document['algorithm_parameters']['distance_formula']
    bootstrapping = document['algorithm_parameters']['bootstrap_replicates']

    word_size = document["BLAST_parameters"]["word_size"]
    positives = document["BLAST_parameters"]["positives"]
    e_value = document["BLAST_parameters"]["e_value"]

    generate_tree = document["tree_parameters"]["generate_tree"]
    root_tree = document["tree_parameters"]["root_tree"]
    accession_name = document["tree_parameters"]["accession_name"]
    generate_image = document["tree_parameters"]["generate_image"]

Entrez.email = email

translated_path = os.path.join(working_path, "Translated")
db_path = os.path.join(working_path, "DB")
os.makedirs(translated_path, exist_ok=True)
os.makedirs(db_path, exist_ok=True)

if not accession_name:
    accession_names = []

'''
    Translate the given sequence record into
    six reading frames
'''
def translate_complementary(record):
    forward1 = record.seq.translate()
    forward2 = record.seq[1:].translate()
    forward3 = record.seq[2:].translate()

    reverse = record.seq.reverse_complement()
    reverse1 = reverse.translate()
    reverse2 = reverse[1:].translate()
    reverse3 = reverse[2:].translate()

    sequence = str(forward1) + str(forward2) + str(forward3) + str(reverse1) + str(reverse2) + str(reverse3)

    return SeqRecord(Seq(sequence), id=record.id, description=f'{record.id} 6-frames translation')


'''
    Reads the list of accession numbers and if the
    corresponding genome file does not exist, it
    downloads them from the NCBI database
'''
def check_and_download_genomes():
    for accession in accessions:
        fasta_path = os.path.join(genomes_path, f"{accession}.fasta")
        genbank_path = os.path.join(genomes_path, f"{accession}.gb")

        if os.path.exists(fasta_path) or os.path.exists(genbank_path):
            continue

        with Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text") as handle:
            record = SeqIO.read(handle, "genbank")

        with open(fasta_path, "w") as fasta_file:
            SeqIO.write(record, fasta_file, "fasta")

        time.sleep(1)


'''
    Loads each genome file and translates it
    using the "translate_six_frames" function.
    The translated sequence is saved in the
    "Translated" folder inside the working directory.
    Finally, it creates an individual BLAST database
    for each genome and stores it in the "DB" folder
    in the working directory
'''
def process_and_translate_genomes():
    for filename in os.listdir(genomes_path):
        if filename.endswith(".fasta") or filename.endswith(".gb"):
            filepath = os.path.join(genomes_path, filename)
            accession_number = os.path.splitext(filename)[0]

            if not accession_name:
                if filename.endswith(".fasta"):
                    with open(filepath, "r") as file:
                        header = file.readline().strip()
                        virus_name = header.split(None, 1)[1].split(',')[0]
                elif filename.endswith(".gb"):
                    with open(filepath, "r") as file:
                        for line in file:
                            if line.startswith("DEFINITION"):
                                virus_name = line.split("  ", 1)[1].split(',')[0]
                                break
                accession_names.append(virus_name)

            record = SeqIO.read(filepath, "fasta" if filename.endswith(".fasta") else "genbank")
            translated_record = translate_complementary(record)

            translated_file_path = os.path.join(translated_path, f"{accession_number}.fasta")
            with open(translated_file_path, "w") as result_file:
                SeqIO.write(translated_record, result_file, "fasta")

            individual_db_path = os.path.join(db_path, accession_number)
            create_db_cmd = NcbimakeblastdbCommandline(dbtype="prot", input_file=translated_file_path, out=individual_db_path)
            create_db_cmd()
            time.sleep(0.5)


'''
    Runs the preprocessing stage:
        - Erases both folders in WorkingDirectory path
        - Checks and downloads missing genome files
        - Translates genome sequences into six frames and saves them
        - Creates individual BLAST databases for each translated genome
'''
def preprocessing():
    if erase_working_folder:
        shutil.rmtree(db_path)
        shutil.rmtree(translated_path)

    if not os.path.exists(db_path):
        os.mkdir(db_path)
    if not os.path.exists(translated_path):
        os.mkdir(translated_path)

    check_and_download_genomes()
    process_and_translate_genomes()