-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathread_data.py
More file actions
36 lines (25 loc) · 1.31 KB
/
read_data.py
File metadata and controls
36 lines (25 loc) · 1.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# adjust clinical_gene.csv path.
import pandas as pd
# Add this code to change the columns names to probes_names
with open('clinical_gene.csv', 'r') as file:
wrong_col_names = file.readline().split('|')
gene_names = file.readline().split('|')
third_line = file.readline().split('|')
probes_names = file.readline().split('|')
# Create a dictionary with old columns as key and the probes names as the values.
col_dictionary = dict(zip(wrong_col_names[13:], probes_names[13:]))
# Create a dictionary Key = probe ; value = gene
probe_gene_dic = dict(zip(probes_names[13:], gene_names[13:]))
del third_line, probes_names, file, gene_names
# Some columns are float because pandas integer cant represent NA values
clinical_gene_df = pd.read_csv('clinical_gene.csv',
sep = '|',
skiprows = 4,
header = None,
names = wrong_col_names)
# Change names from V1 to probe names using a dictionary
clinical_gene_df.rename(columns= col_dictionary, inplace=True)
# Drop columns not required for the analysis
# 1 = cel_file ; 2 = gender ; 5 = vital_status ; 6 = snc_teste_involve ; 7 = latino_black
# 12 = B
clinical_gene_df.drop(clinical_gene_df.columns[[1,2,5,6,7,12]], axis=1, inplace=True)