oncogenator/cgi_annotator.py at main · oncodash/oncogenator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
import zipfile
from enum import Enum

import pandas as pd
from utils import *
import httpx
import io
import urllib3

# Credentials for the CGI REST API. The request functions in this module join
# them as "<login> <token>" to build the Authorization header. Both are empty
# here and have to be filled in before any request can authenticate.
CGI_LOGIN = ""
CGI_TOKEN = ""

class cna_alt_to_cgi(Enum):
    """Copy-number alteration names mapped to the codes written to CGI CNA files."""

    AMPLIFICATION = "AMP"
    DELETION = "DEL"

    def __str__(self):
        # Render a member as its bare CGI code rather than "Class.MEMBER".
        return f"{self.value}"

class cgi2oncokb_level(Enum):
    """CGI evidence codes (member names) mapped to OncoKB level strings (values)."""

    A = "LEVEL_1"
    B = "LEVEL_2"
    C = "LEVEL_3A"
    D = "LEVEL_3B"
    E = "LEVEL_4"
    R1 = "LEVEL_R1"
    R2 = "LEVEL_R2"

    def __str__(self):
        return str(self.value)


def map_cgi_evidence(biomarker):
    """
        Map CGI evidence to OncoKB levels.

        Parameters:
        biomarker (Series): A Series (or any mapping) with 'Evidence' and
            'Response' entries.

        Returns:
        str: Mapped OncoKB level, or None when the evidence is missing or
            unknown, or when the response is neither "Responsive" nor
            "Resistant".
    """
    evidence = biomarker['Evidence']
    response = biomarker['Response']
    if pd.isna(evidence):
        return None

    try:
        level = cgi2oncokb_level[evidence].value
    except KeyError:
        # Evidence code without an OncoKB counterpart: report "no level"
        # instead of aborting the caller's whole loop over biomarkers.
        return None

    if response == "Responsive":
        return level
    if response == "Resistant":
        # Compare the level *string*; an Enum member never equals a plain str,
        # so comparing the member itself would never match.
        if level in ("LEVEL_1", "LEVEL_2"):
            return cgi2oncokb_level.R1.value
        if level in ("LEVEL_3A", "LEVEL_3B", "LEVEL_4"):
            return cgi2oncokb_level.R2.value
    return None

def handle_treatments_cgi(row, alt_type, alteration):
    """
        Build one treatment record from a CGI biomarker row.

        Parameters:
        row (Series): Biomarker row; the 'Drugs', 'Source', 'Biomarker',
            'Tumor type', 'Evidence' and 'Response' entries are read.
        alt_type (str): Alteration type, stored as 'alteration_type'.
        alteration (str): Alteration description, stored as 'alteration'.

        Returns:
        Series: The treatment record. 'description' is always an empty string
            and 'cgi_level' has the form "<evidence>(<response>)".
    """
    treatment = row['Drugs']
    citations = row['Source']
    indications = row['Biomarker']
    tumor_type = row['Tumor type']
    oncokb_level = map_cgi_evidence(row)

    # Raw CGI evidence code and response, kept next to the mapped level.
    cgi_level = handle_string_field(row['Evidence']) + "(" + handle_string_field(row['Response']) + ")"

    record = {
        'alteration_type': alt_type,
        'alteration': alteration,
        'approvedIndications': indications,
        'description': "",
        'treatment': treatment,
        'level_of_evidence': oncokb_level,
        'cgi_level': cgi_level,
        'citations': citations,
        'tumorType': tumor_type,
    }
    return pd.Series(record)

def generate_cgi_cna_file_from_list(genelist):
    """
        Write a CGI CNA query file that marks every given gene as amplified.

        The file is written to "./tmp/cnas.ext" (the "./tmp" directory must
        already exist) with a "gene<TAB>cna" header followed by one
        "<gene><TAB>AMP" line per entry. Each line is also printed.

        NOTE: this module defines a function with the same name again further
        down; the later definition is the one bound at import time.

        Parameters:
        genelist (iterable of str): Gene symbols to write.

        Returns:
        None
    """
    header = "gene\tcna\n"
    # The with-block closes the file; no explicit close() is needed.
    with open("./tmp/cnas.ext", "w") as file2:
        file2.write(header)
        for gene in genelist:
            row = gene + '\tAMP\n'
            print(row)
            file2.write(row)

def launch_cgi_job_with_mulitple_variant_types(mutations_file=None, cnas_file=None, transloc_file=None, cancer_type="HGSOC", reference="GRCh38"):
    """
        Launch a CGI (Cancer Genome Interpreter) job through the CGI API.

        Every input file that is given is attached to a single
        multipart/form-data POST request, so mutations, CNAs and
        translocations can be submitted together.

        Args:
        mutations_file (str): The path to the mutation file.
        cnas_file (str): The path to the cnas file.
        transloc_file (str): The path to the translocation file.
        cancer_type (str): The type of cancer.
        reference (str): The reference genome.

        Returns:
        jobid (str): The decoded response body if the request returns HTTP 200,
            otherwise 0.

        Raises:
        ValueError: If none of the three input files is given.
        OSError: If a given input file cannot be read.
        """

    request_url = "https://www.cancergenomeinterpreter.org/api/v1"

    print("Request CGI")
    headers = {
        'Authorization': CGI_LOGIN+' '+CGI_TOKEN
    }

    payload = {
        'cancer_type': cancer_type,
        'title': 'Title',
        'reference': reference,
    }

    # (form field, uploaded file name, local path). The 'translocations' field
    # name follows the CGI REST API documentation.
    uploads = (
        ('mutations', 'snvs.ext', mutations_file),
        ('cnas', 'cnas.ext', cnas_file),
        ('translocations', 'translocs.ext', transloc_file),
    )
    attached = False
    for field, upload_name, path in uploads:
        if not path:
            continue
        # Read inside a with-block so the file handle is always closed.
        with open(path, 'rb') as handle:
            payload[field] = (upload_name, handle.read(), 'application/octet-stream')
        attached = True

    if not attached:
        raise ValueError("At least one of mutations_file, cnas_file or transloc_file is required")

    # Make the POST request using multipart/form-data. The response is left on
    # urllib3's default preloading so the body is read and the connection is
    # released back to the pool automatically.
    http = urllib3.PoolManager()
    response = http.request(
        'POST',
        request_url,
        fields=payload,
        headers=headers,
        multipart_boundary="----WebKitFormBoundary7MA4YWxkTrZu0gW",
    )

    if response.status == 200:
        jobid = response.data.decode("utf-8")
        print(jobid)
        return jobid

    print("[ERROR] Unable to request. Response: ", response.data.decode("utf-8", errors="replace"))
    return 0


def _needs_cgi_annotation(annotations):
    """Return a boolean mask of rows whose 'oncogenic' value is "Unknown" or missing."""
    oncogenic = annotations['oncogenic']
    return (oncogenic == "Unknown") | oncogenic.isna()


def _first_matching_row(table, column, value):
    """Return the first row of table where column equals value, or None if there is none."""
    if table is None:
        return None
    hits = table.loc[table[column] == value]
    if hits.empty:
        return None
    return hits.iloc[0]


def query_cgi_job(jobid, output, snv_annotations: pd.DataFrame = None, cna_annotations: pd.DataFrame = None):
    """
    Download the results of a CGI job and merge them into the annotations.

    The result archive is requested from the CGI API. Every member is extracted
    into the current working directory and parsed as a tab-separated table.
    Rows of "biomarkers.tsv" with Match == 'YES' become treatment records, and
    annotation rows whose 'oncogenic' value is "Unknown" or missing are filled
    in from "cna_analysis.tsv" (CNAs) or "alterations.tsv" (SNVs).

    Parameters:
    jobid (str): The job ID for the CGI job to query.
    output (str): Path of the tab-separated annotation file to write.
    snv_annotations (DataFrame): DataFrame containing SNV annotations (updated in place).
    cna_annotations (DataFrame): DataFrame containing CNA annotations (updated in place).

    Returns:
    int: 1 if the API answered with HTTP 200, otherwise 0.
    """
    request_url = "https://www.cancergenomeinterpreter.org/api/v1/"
    print("Request CGI job by id")

    headers = {
        'Authorization': CGI_LOGIN + ' ' + CGI_TOKEN
    }
    payload = {'action': 'download'}
    response = httpx.get(request_url + jobid, params=payload, headers=headers, timeout=None)

    if response.status_code != 200:
        print("No CGI results available for job id: "+str(jobid))
        return 0

    # Result tables of interest, keyed by their archive member name.
    tables = {"alterations.tsv": None, "cna_analysis.tsv": None, "biomarkers.tsv": None}
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        for fn in z.namelist():
            z.extract(fn)
            df = pd.read_csv(fn, sep="\t")
            print(fn)
            print(df)
            if fn in tables:
                tables[fn] = df

    cgi_snvdf = tables["alterations.tsv"]
    cgi_cnadf = tables["cna_analysis.tsv"]
    treatmentsdf = tables["biomarkers.tsv"]
    treatments = []

    # An archive without biomarkers.tsv simply yields no treatments.
    if treatmentsdf is not None:
        bioms = treatmentsdf.loc[treatmentsdf['Match'] == 'YES']
    else:
        bioms = pd.DataFrame()

    for _, biom in bioms.iterrows():
        # Sample IDs have the form "CNA:<gene>:<alteration>" or
        # "SNV:<gene>:<chr>:<pos>:<ref>:<alt>".
        sample_id = handle_string_field(biom["Sample ID"])
        idsplit = sample_id.split(":")
        print(sample_id)

        if idsplit[0] == "CNA":
            alteration = idsplit[1]+":"+idsplit[2]
            treatment = handle_treatments_cgi(biom, 'CNA', alteration)
            print(treatment)
            treatments.append(treatment)

            if not isinstance(cna_annotations, pd.DataFrame):
                continue
            mask = (
                _needs_cgi_annotation(cna_annotations)
                & (cna_annotations['hugoSymbol'] == idsplit[1])
                & (cna_annotations['alteration'] == idsplit[2])
            )
            print(int(mask.sum()))

            cgi_cna = _first_matching_row(cgi_cnadf, 'sample', sample_id)
            if cgi_cna is None or not mask.any():
                continue
            cna_annotations.loc[mask, 'oncogenic'] = handle_string_field(cgi_cna["driver"])
            cna_annotations.loc[mask, 'gene_role'] = handle_string_field(cgi_cna["gene_role"])
            cna_annotations.loc[mask, 'tumorTypeSummary'] = handle_string_field(cgi_cna["driver_statement"])

        elif idsplit[0] == "SNV":
            hugoSymbol = idsplit[1]
            chromosome = str(idsplit[2])
            position = int(idsplit[3])
            reference_allele = str(idsplit[4])
            sample_allele = str(idsplit[5])
            alteration = hugoSymbol + ":" + chromosome + ":" + str(
                position) + ":" + reference_allele + ":" + sample_allele

            treatment = handle_treatments_cgi(biom, 'SNV', alteration)
            print(treatment)
            treatments.append(treatment)

            if not isinstance(snv_annotations, pd.DataFrame):
                continue
            # Each comparison is parenthesised: `&` binds tighter than `==`.
            mask = _needs_cgi_annotation(snv_annotations) & (snv_annotations['alteration'] == alteration)
            print("SNV updatedf:"+str(int(mask.sum())))

            cgi_snv = _first_matching_row(cgi_snvdf, 'CGI-Sample ID', sample_id)
            if cgi_snv is None or not mask.any():
                continue
            # The CGI-* fields live in the CGI result row, not in the annotation row.
            snv_annotations.loc[mask, 'consequence'] = handle_string_field(cgi_snv["CGI-Consequence"])
            snv_annotations.loc[mask, 'oncogenic'] = handle_string_field(cgi_snv["CGI-Oncogenic Summary"])
            snv_annotations.loc[mask, 'gene_role'] = handle_string_field(cgi_snv["CGI-Oncogenic Prediction"])
            # NOTE(review): it is unclear whether alterations.tsv carries a
            # 'driver_statement' column, so it is only copied when present.
            if "driver_statement" in cgi_snv.index:
                snv_annotations.loc[mask, 'tumorTypeSummary'] = handle_string_field(cgi_snv["driver_statement"])

    # NOTE(review): both frames are written to the same `output` path, so when
    # both are supplied the CNA table overwrites the SNV table.
    wrote_annotations = False
    if isinstance(snv_annotations, pd.DataFrame):
        snv_annotations.to_csv(output, index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'consequence', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
        wrote_annotations = True

    if isinstance(cna_annotations, pd.DataFrame):
        cna_annotations.to_csv(output, index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
        wrote_annotations = True

    if wrote_annotations:
        # Append the collected treatments once, even when both frames are given.
        trdf = pd.DataFrame(treatments)
        trdf.to_csv("treatments.csv", mode="a", index=False, sep="\t")

    return 1

def generate_cgi_cna_file_from_list(genelist):
    """
        Write "./tmp/cnas.ext" listing every gene in genelist as amplified.

        The file starts with a "gene<TAB>cna" header, followed by one
        "<gene><TAB>AMP" line per gene. Each line is also printed.

        Parameters:
        genelist (iterable of str): Gene symbols to write.
    """
    with open("./tmp/cnas.ext", "w") as handle:
        handle.write("gene\tcna\n")
        for symbol in genelist:
            line = symbol + '\tAMP\n'
            print(line)
            handle.write(line)

def generate_temp_cgi_query_files(snv_annotations: pd.DataFrame = None, cna_annotations: pd.DataFrame = None, translocs: pd.DataFrame = None, append_to_annotations: bool = True):
    """
        Generate temporary CGI query files from annotations.

        SNVs are written to "./tmp/snvs.ext" and CNAs to "./tmp/cnas.ext"; the
        "./tmp" directory must already exist. Each line ends with a sample ID
        of the form "SNV:<gene>:<chr>:<pos>:<ref>:<alt>" or
        "CNA:<gene>:<alteration>".

        Parameters:
        snv_annotations (DataFrame): DataFrame containing SNV annotations.
        cna_annotations (DataFrame): DataFrame containing CNA annotations.
        translocs (DataFrame): DataFrame containing translocation data
            (currently not used).
        append_to_annotations (bool): If True, SNV coordinates are parsed from
            the 'alteration' column ("<gene>:<chr>:<pos>:<ref>:<alt>").
            Otherwise they are read from the 'hugoSymbol', 'chromosome',
            'position', 'reference_allele' and 'sample_allele' columns.

        Returns:
        int: Always 1. Any exception is printed and re-raised.
    """
    try:
        if isinstance(snv_annotations, pd.DataFrame):
            with open("./tmp/snvs.ext", "w") as file1:
                file1.write("chr\tpos\tref\talt\tsample\n")

                if append_to_annotations:
                    uniques = snv_annotations[['alteration']].drop_duplicates()
                    for _, snv in uniques.iterrows():
                        sample_id = "SNV:"+snv['alteration']
                        alt_split = snv['alteration'].split(':')
                        row = alt_split[1]+'\t'+alt_split[2]+'\t'+alt_split[3]+'\t'+alt_split[4]+'\t'+sample_id+'\n'
                        file1.write(row)
                else:
                    uniques = snv_annotations[['hugoSymbol', 'chromosome', 'position', 'reference_allele', 'sample_allele', 'tumorType', 'referenceGenome']].drop_duplicates()
                    for _, snv in uniques.iterrows():
                        # str() every field: a purely numeric chromosome column
                        # is parsed as integers and cannot be concatenated as-is.
                        coords = [str(snv[col]) for col in ('chromosome', 'position', 'reference_allele', 'sample_allele')]
                        sample_id = ':'.join(["SNV", str(snv['hugoSymbol'])] + coords)
                        file1.write('\t'.join(coords + [sample_id]) + '\n')

        if isinstance(cna_annotations, pd.DataFrame):
            with open("./tmp/cnas.ext", "w") as file2:
                file2.write("gene\tcna\tsample\n")

                uniques = cna_annotations[['hugoSymbol', 'alteration', 'referenceGenome', 'tumorType']].drop_duplicates()
                print(type(uniques))
                for _, cna in uniques.iterrows():
                    print(cna)
                    gene = str(cna['hugoSymbol'])
                    sample_id = "CNA:" + gene + ':' + str(cna['alteration'])
                    # Alterations that are not members of cna_alt_to_cgi raise KeyError here.
                    row = gene+'\t'+cna_alt_to_cgi[cna['alteration']].value+'\t'+sample_id+'\n'
                    file2.write(row)

    except Exception as e:
        print(f"Unexpected {e=}, {type(e)=}")
        raise
    return 1