-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpearson.py
More file actions
190 lines (165 loc) · 6.08 KB
/
pearson.py
File metadata and controls
190 lines (165 loc) · 6.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 16 15:31:04 2016
@author: jessime
"""
import numpy as np
from pickle import dump
try:
from scipy.stats import t
except ImportError:
pass
from kmer_counts import BasicCounter
from tqdm import tqdm, tqdm_notebook
def pearson(counts1, counts2, row_standardize=True, outfile=None):
"""Calculates a column standardized Pearson correlation matrix"""
if row_standardize:
counts1 = (counts1.T - np.mean(counts1, axis=1)).T
counts1 = (counts1.T / np.std(counts1, axis=1)).T
counts2 = (counts2.T - np.mean(counts2, axis=1)).T
counts2 = (counts2.T / np.std(counts2, axis=1)).T
#Take the inner product and save
dist = np.inner(counts1, counts2)/counts1.shape[1]
if outfile:
np.save(outfile, dist)
return dist
def pvalue(dist, N):
"""Calculate the p-values (two tailed) of a Pearson correlation matrix
Parameters
----------
dist : ndarray
Pearson correlation matrix
N : int
Sample size. (The length of each array used to find the R value)
Returns
-------
pvals : ndarray
p-value for each R value
Reference
---------
http://stats.stackexchange.com/questions/120199/calculate-p-value-for-the-correlation-coefficient
http://stackoverflow.com/questions/17559897/python-p-value-from-t-statistic
"""
pvals = dist / np.sqrt((1-dist**2) / (N -2))
pvals = t.sf(np.abs(pvals), N - 2)*2
return pvals
class StreamDist(object):
"""Pearson's R from fasta files too large for memory."""
def __init__(self, infasta=None, query=None, outfile=None,
mean=None, std_dev=None, k=6, n=0,
norm=True, binary=True, nb=False):
self.infasta = infasta
self.query = query
self.outfile = outfile
self.mean = mean
self.std_dev = std_dev
self.k = k
self.norm = norm
self.binary = binary
self.nb = nb
self.counter = BasicCounter(k=self.k, silent=True)
self.query_counts = None
self.dist = None
self.names = None
self.n = n
self.i = 0
#Annoying, but necessary for init of self.dist
if self.mean is not None or self.std_dev is not None:
assert self.n > 0, 'Please provide number of sequences in fasta.'
def get_names(self):
"""Stream over the file again and pull out header lines."""
names = []
with open(self.infasta) as infile:
bar = self._progress()
for line in bar(infile):
if line[0] == '>':
names.append(line.strip())
self.names = names
def _progress(self):
if self.nb:
return tqdm_notebook
else:
return tqdm
def _single_count(self, line):
seq = line.strip().upper()
row = np.zeros(4**self.k, dtype=np.float32)
self.counter.seqs = seq
row = self.counter.occurrences(row, seq)
return row
def _stream_seqs(self, func):
"""Perform a function on the counts of each fasta sequence"""
with open(self.infasta) as infasta:
bar = self._progress()
if self.n == 0:
total = None
else:
total = self.n * 2
for line in bar(infasta, total=total):
if line[0] != '>':
counts = self._single_count(line)
func(counts)
def make_query_counts(self):
with open(self.query) as query:
seq = query.readlines()[1].strip().upper()
self.query_counts = self._single_count(seq)
if self.norm:
self.query_counts = self.col_norm(self.query_counts)
self.query_counts = self.row_norm(self.query_counts)
def online_moments(self, counts):
"""Update the mean and std. vectors"""
self.n += 1
delta = counts - self.mean
self.mean = self.mean + delta/self.n
self.std_dev = self.std_dev + delta*(counts - self.mean)
def calc_norm_vectors(self):
self.mean = np.zeros(4**self.k, dtype=np.float32)
self.std_dev = np.zeros(4**self.k, dtype=np.float32)
self._stream_seqs(self.online_moments)
self.std_dev = self.std_dev /self.n
self.std_dev = np.sqrt(self.std_dev)
def norm_vectors(self):
"""Either load or count norm vectors if they are not passed as arrays."""
if self.mean is None and self.std_dev is None:
self.calc_norm_vectors()
elif isinstance(self.mean, str) and isinstance(self.std_dev, str):
self.mean = np.load(self.mean).astype(dtype=np.float32)
self.std_dev = np.load(self.std_dev).astype(dtype=np.float32)
else:
pass
def col_norm(self, counts):
"""Column normalize"""
counts = counts - self.mean
counts = counts / self.std_dev
return counts
def row_norm(self, counts):
"""Row normalize"""
counts = (counts.T - np.mean(counts)).T
counts = (counts.T / np.std(counts)).T
return counts
def calc_dist(self, counts):
if self.norm:
counts = self.col_norm(counts)
counts = self.row_norm(counts)
#self.dist[self.i] = np.inner(self.query_counts, counts)/len(counts)
self.dist[self.i] = np.sum(self.query_counts * counts)/len(counts)
self.i += 1
def save(self):
"""Saves the counts appropriately based on current settings"""
if self.outfile is not None:
if not self.binary:
np.savetxt(self.outfile, self.dist, delimiter=',')
else:
try:
np.save(self.outfile, self.dist)
except AttributeError:
outfile = open(self.outfile, 'w+b')
dump(self.dist.tolist(), outfile)
outfile.close()
def make_dist(self):
"""Main"""
if self.norm:
self.norm_vectors()
self.make_query_counts()
self.dist = np.zeros(self.n, dtype=np.float32)
self._stream_seqs(self.calc_dist)
self.save()