-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathage-correlation.py
More file actions
111 lines (81 loc) · 3.26 KB
/
age-correlation.py
File metadata and controls
111 lines (81 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import numpy as np
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
import re
import csv
import strainpycon
fn = "data/FrequenciesField9Aug2019.csv"
thres = 1.8e-7
gamma = 0.01
nrange = range(1,6+1)
sp = strainpycon.StrainRecon()
lookup = { "1996" : {}, "2007" : {}, "2001" : {}, "2012" : {}}
with open('data/ages-96-01-07.csv', 'r') as f:
csvreader = csv.reader(f, delimiter=",")
next(csvreader, None) #Header
for row in csvreader:
sample = row[0]
age = row[4]
year = row[1]
#if year != "2007":
# continue
if age == '' or age == '0':
age = str(int(float(row[3]) * 12.0))
lookup[year][sample] = age
#print (sample, age, year, row)
#samplecode,year,date,ageyrs,agemonths,matchid,INDIVIDID,LOCATIONID,HHID,Original_code,PMM3,Processed
#22,1996,11/19/1996,,5.092402464,38,,,,17/087B,10426.66667,
#sample = row[0].replace('"','')
#yr = yearmap[sample]
#yr = re.search (r"Sample_([0-9]*)_", sample)
#yr = re.search (r"_([0-9]*)", sample[-5:])
with open('data/ages-12.csv', 'r') as f:
csvreader = csv.reader(f, delimiter=",")
next(csvreader, None) #Header
for row in csvreader:
sample = row[1]
age = row[11]
year = "2012"
# Obs,SampleName,SampleCode,Smear,MaxAlleles,minalleles,Maxalleles2,Results,comp,DateTaken,Sex,AgeinMonth,AgeinYear,FeverLast2Weeks,Feverin24H,agecat3,HemocueReading,aspres1,aspeci1,asdens1,gmpres1,gmdens1,ITN,studyarea2,antimalarialtaken,latitude,longitude,altitude,NewSamplecode,Processed,year,samplecode1,Used,calcdensityifwbc,HIgh,randID,SELECTED,CalcDensityIfRBC
#115,327,K5F7P,P,1,1,1,1,12-284,6/22/2012,Male,9,,2,,6mo- <5yo,8.6,Positive,PF,378,Negative,,No,Asembo,0,-0.190935,34.37265833,1145.2,,,,,,6048,,0.08479847,1,79380
if age == '' or age == '0' or int(age) == 0:
age = str(int(float(row[12]) * 12.0))
#print (sample, age, year, row)
lookup[year][sample] = age
### Process
#print (lookup)
coll = { "all" : []}
#j = 0
with open(fn, 'r') as f:
csvreader = csv.reader(f, delimiter=",")
next(csvreader, None) #Header
for row in csvreader:
sample = row[0]
#print (sample)
unp = re.search (r"Sample_([0-9]*)_([^_]*)_([0-9]*)", sample)
#print (sample, unp.groups())
#sample_ID,_,yr = unp.groups()
yr,sample_ID,_ = unp.groups()
if not (yr in lookup and sample_ID in lookup[yr]):
unp = re.search (r"Sample_([0-9]*)_(.*)_([0-9]*)$", sample)
sample_ID,_,yr = unp.groups()
if yr in lookup and sample_ID in lookup[yr]:
bases = np.array ([ float(x) if x != "NA" else np.nan for x in row[1:]])
mf = sp.misfits (bases, nrange=nrange, gamma=gamma)
moi = np.sum (mf > thres)+1
#print ("FOUND\t{}\t{}\t{}\t{}".format(yr, sample_ID, lookup[yr][sample_ID], moi)) # row[1:]))
if yr not in coll:
coll[yr] = []
coll[yr].append ( (float(lookup[yr][sample_ID]), float(moi)) )
coll["all"].append ( (float(lookup[yr][sample_ID]), float(moi)) )
else:
#print ("Not found: {}".format(sample))
pass
pval = []
for yr in sorted(coll.keys()) + ["all"]:
a = np.array(coll[yr]).T
#print (a)
print ("{}:\tr={}\tp={}".format (yr, np.corrcoef (a[0], a[1])[0,1], pearsonr (a[0], a[1])[1]))
#print ("R:\t{}".format (pearsonr (a[0], a[1])))
pval.append (pearsonr (a[0], a[1])[1])
print ("FDR:\t{}".format (multipletests(pval, alpha=0.05, method='fdr_bh')))