compara-deep-learning/process_data.py at master · EMBLEBIGSOC/compara-deep-learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import numpy as np
import progressbar
from save_data import write_dict_json


def create_map_list(l):  # this function maps the indexes to values
    t = {}
    for i in range(len(l)):
        t[l[i]] = i
    return t


def create_chromosome_maps(a, n):
    cmap = []
    cimap = []
    for df in progressbar.progressbar(a):
        chmap = {}
        chindmap = {}
        for index, row in df.iterrows():
            g = row.gene_id
            try:
                _ = chmap[g]
            except BaseException:
                chmap[g] = str(row.Chr)
                if str(row.Chr) in chindmap:
                    chindmap[str(row.Chr)].append(index)
                else:
                    chindmap[str(row.Chr)] = []
                    chindmap[str(row.Chr)].append(index)
        cmap.append(chmap)
        cimap.append(chindmap)
    return cmap, cimap


def list_dict_genomes(a, n):
    lst = []
    ldt = []
    for x in a:
        ldgt = {}
        uc = list(x["gene_id"])
        for i, r in x.iterrows():
            ldgt[r.gene_id] = i
        lst.append(uc)
        ldt.append(ldgt)
    return lst, ldt


def get_nearest_neighbors(g, gs, n, a, d, ld, ldg, cmap, cimap):
    # print("Finding Neighbor Genes")
    ne = []  # list to store the backward genes
    nr = []  # list to store the forward genes
    # get the address of the corresponding species to which the gene belongs
    # whose neighbor has to be found
    gi = d[gs.capitalize()]
    sldf = a[gi]  # select the dataframe
    scmap = cmap[gi]  # select the correct chromosome map
    scimap = cimap[gi]  # select the correct index maps
    try:
        _ = ld[gi]  # see if the corresponding gene map exists
    except BaseException:
        # print("Length of Dataframes:{} \t Length of Loaded Genes:{} \t Length of Loaded Genomes Dictionaries:{}".format(len(a),len(ld),len(ldg)))
        return ne, nr
    sldg = ldg[gi]  # select the corresponding map
    if g not in sldg:  # if the gene is not present in the dataframe return empty lists
        # print(g,"\t",gs)
        return ne, nr
    i = sldg[g]  # find the index of the gene
    chromosome_id = scmap[g]  # get the chromosome no from the database.
    scimap = scimap[chromosome_id]  # select the correct chromosomes indexes
    sldf = sldf.loc[scimap]  # select only the same chromosome genes.
    # get the -n neighbors
    start = int(sldf.loc[i]['start'])  # get the start location of the gene
    flag = 0
    for j in range(n):
        if flag == 1:
            ne.append("NULL_GENE")
            continue
        itemp = 0
        # select the column
        end = list(sldf.end)
        end = np.array(end)
        assert(len(end) == len(sldf))
        end = end - start  # subtract start from it so as to get relative position
        end_s = np.argsort(end)  # sort them by the order of distance
        if end[end_s[0]] >= 0:  # if all the genes end ahead of the one in consideration
            flag = 1  # increment the pointer
            ne.append("NULL_GENE")  # append the NULL_GENE value
            continue
        for k in end_s:  # iterate through the sorted array
            # find the first value that is negative and the next one is
            # positive to get the nearest gene
            if end[k] < 0 and end[k + 1] >= 0:
                itemp = k
                break
        itemp = scimap[itemp]
        ne.append(sldf.loc[itemp].gene_id)
        # make "start" the start location of the current gene
        start = int(sldf.loc[itemp].start)
        # print(start)
    # get the +n neighbors
    flag = 0
    end = int(sldf.loc[i].end)
    for j in range(n):
        if flag == 1:
            nr.append("NULL_GENE")
            continue
        itemp = 0
        start = list(sldf.start)
        start = np.array(start)
        start = start - end
        start_s = np.argsort(start)
        if start[start_s[-1]] < 0:
            flag = 1
            nr.append("NULL_GENE")
            continue
        for k in start_s:
            if start[k] > 0:
                itemp = k
                break
        itemp = scimap[itemp]
        nr.append(sldf.loc[itemp].gene_id)
        end = int(sldf.loc[itemp].end)
    return ne, nr


def create_data_homology_ls(a_h, d_h, n, a, d, ld, ldg, cmap, cimap, to_write):
    # dictionary which stores +/- n genes of the given gene by id. Each key is
    # a gene id which corresponds to the one in center.
    lsy = {}
    lsytemp = {}
    name = "neighbor_genes"
    for df in a_h:
        for _, row in progressbar.progressbar(df.iterrows()):
            x = row["gene_stable_id"]
            y = row["homology_gene_stable_id"]
            xs = row["species"]
            ys = row["homology_species"]
            try:
                _ = lsy[x]
            except BaseException:
                try:
                    # see if the species exist in genomic maps
                    _ = d[xs.capitalize()]
                    xl, xr = get_nearest_neighbors(
                        x, xs, n, a, d, ld, ldg, cmap, cimap)
                    if len(
                            xl) != 0:  # check if neighboring genes were successfully found
                        lsy[x] = dict(b=xl, f=xr)
                        lsytemp[x] = dict(b=xl, f=xr)
                except BaseException:
                    continue
            try:
                _ = lsy[y]
            except BaseException:
                try:
                    _ = d[ys.capitalize()]
                    yl, yr = get_nearest_neighbors(
                        y, ys, n, a, d, ld, ldg, cmap, cimap)
                    if len(yl) != 0:
                        lsy[y] = dict(b=yl, f=yr)
                        lsytemp[y] = dict(b=yl, f=yr)
                except BaseException:
                    continue
    if to_write == 1:
        write_dict_json(name, "processed", lsy)
    return lsy