-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfiltering_masterTable.py
More file actions
executable file
·87 lines (74 loc) · 4.19 KB
/
filtering_masterTable.py
File metadata and controls
executable file
·87 lines (74 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#/usr/bin/env python
import pandas as pd
import numpy as np
import glob
df = pd.read_csv('/gpfs/gibbs/pi/gerstein/yj329/epiPgene/freeze_mastertable_RNAseq+histone+ATAC+DNase.txt', sep = '\t')
# extract upstream regions for assessing functional properties
upstream = (
df.loc[:, ["chr", "start", "end", "gene_id", "score", "strand", "transcript_id", "geneType"]]
.drop_duplicates()
.assign(
new_start=lambda x: np.where(x["strand"] == "-", x["end"] - 1000, x["start"]),
new_end=lambda x: np.where(x["strand"] == "-", x["end"], x["start"] + 1000)
)
.drop(columns=["start", "end"])
.rename(columns={"new_start": "start", "new_end": "end"})
.loc[:, ["chr", "start", "end", "gene_id", "score", "strand", "transcript_id", "geneType"]])
upstream.to_csv('/gpfs/gibbs/pi/gerstein/yj329/epiPgene/resources/GRCh38_tss_upstream.bed', sep = '\t', index=False)
# DNA methylation data
methylation_files = glob.glob('/gpfs/gibbs/pi/gerstein/yj329/epiPgene/intersection/DNAme/*.txt')
methylation_data = []
for file in methylation_files:
tissue = file.split('/')[-1].replace('.txt', '')
meth_df = pd.read_csv(file, sep='\t')
meth_df['transcript_id'] = meth_df['transcript_id'].astype(str).str.split('.').str[0]
meth_df['tissue'] = tissue
methylation_data.append(meth_df)
methylation_df = pd.concat(methylation_data, ignore_index=True)
tmp = df.merge(methylation_df, on=['transcript_id', 'tissue'], how='left')
tmp.to_csv('/gpfs/gibbs/pi/gerstein/yj329/epiPgene/freeze_mastertable_RNAseq+histone+ATAC+DNase+DNAme.txt', sep = '\t', index=False)
# transcripts with overlapping promoter regions
blacklist = pd.read_csv('/gpfs/gibbs/pi/gerstein/yj329/epiPgene/gencode_v29/blacklist.txt', header=None)
lt = [item.split('.')[0] for item in blacklist[0] if "PAR_Y" not in item]
# remove transcripts and rename gene type
df2 = df[~df['transcript_id'].isin(lt)]
df2['geneType'] = df2['geneType'].apply(lambda x:
'unprocessed_pseudogene' if 'unprocessed_pseudogene' in x else
'processed_pseudogene' if 'processed_pseudogene' in x else
'unitary_pseudogene' if 'unitary_pseudogene' in x else
x)
lncRNA_types = [
'3prime_overlapping_ncrna', 'antisense', 'bidirectional_promoter_lncrna',
'macro_lncRNA', 'non_coding', 'processed_transcript',
'sense_intronic', 'sense_overlapping', 'lincRNA'
]
df2['geneType'] = df2['geneType'].replace(lncRNA_types, 'lncRNA')
# annotation-based filtering
df_unique = df2[['transcript_id', 'geneType']].drop_duplicates()
transcript_counts = df_unique.groupby('geneType')['transcript_id'].nunique()
print(transcript_counts)
# expression-based filtering
df2_filtered = df2.loc[df2.groupby(['gene_id', 'tissue'])['avgTPM'].idxmax()].reset_index(drop=True)
df2_filtered = df2_filtered[df2_filtered['avgTPM'] >= 1]
# expression-based filtering -- num of non-redundant transcripts
df_unique = df2_filtered[['transcript_id','geneType']].drop_duplicates()
transcript_counts = df_unique.groupby('geneType')['transcript_id'].nunique()
print(transcript_counts)
# expression-based filtering -- num of non-redundant genes
df_unique = df2_filtered[['gene_id','geneType']].drop_duplicates()
gene_counts = df_unique.groupby('geneType')['gene_id'].nunique()
print(gene_counts)
# (tissue, transcript)-- pair of transcripts and tissues
df_unique = df2_filtered[['tissue', 'transcript_id', 'geneType']].drop_duplicates()
df_unique['tissue_transcript'] = list(zip(df_unique['tissue'], df_unique['transcript_id']))
transcript_counts = df_unique.groupby('geneType')['tissue_transcript'].nunique()
print(transcript_counts)
# RAMPAGE-based filter
RAMPAGE = pd.read_csv('/gpfs/gibbs/pi/gerstein/yj329/epiPgene/intersection/RAMPAGE/GRCh38_tss_1kb_rPeaks.txt', sep='\t', header=None)
RAMPAGE_filtered = RAMPAGE[~RAMPAGE[6].str.contains("PAR_Y", na=False)]
RAMPAGE_filtered[6] = RAMPAGE_filtered[6].str.split('.').str[0]
RAMPAGE_filtered = RAMPAGE_filtered[RAMPAGE_filtered[8] != '.']
tx = set(RAMPAGE_filtered[6])
df2_filtered['RAMPAGE'] = df2_filtered['transcript_id'].isin(tx)
df2_filtered = df2_filtered.sort_values(by=["chr", "start"]).reset_index(drop=True)
df2_filtered.to_csv('/gpfs/gibbs/pi/gerstein/yj329/epiPgene/freeze_mastertable_RNAseq+histone+ATAC+DNase_filtered.txt', sep = '\t', index=False)