-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstochastic.py
More file actions
123 lines (94 loc) · 4.13 KB
/
stochastic.py
File metadata and controls
123 lines (94 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python
# -*- coding: utf-8 -*-
import scipy.misc as scm
import scipy.stats as scs
def prob_of_complete_sequence_reconstruction(genome_length, read_length, number_of_reads, k_value, error_percentage):
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
e = float(error_percentage)
prob_not_covered = ((g-l+k)/g)**n
prob_covered = 1-(((g-l+k)/g)**n)
prob_error = (1-(1-e)**(2*(k-1)))
return 1 - (prob_not_covered + (prob_covered * prob_error) * (g-l+k))
def prob_kmer_is_correct(k_value, error_percentage):
k = float(k_value)
e = float(error_percentage)
return (1-e)**k
def expected_kmer_coverage_idd(genome_length, read_length, number_of_reads, k_value):
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
return ((l-k)*n*k)/g
def expected_number_of_correct_kmers_at_position(genome_length, read_length, number_of_reads, k_value, error_percentage):
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
e = float(error_percentage)
prob_correct_kmer = prob_kmer_is_correct(k_value, error_percentage)
return (l*n/g) * prob_correct_kmer
def prob_kmer_has_specific_number_of_errors(k_value, error_percentage, number_of_errors):
k = float(k_value)
e = float(error_percentage)
n = float(number_of_errors)
p = (1-e)**(k-n) * e**n * scm.comb(k,n)
return p
def prob_multiple_identical_wrong_kmers(genome_length, read_length, number_of_reads, k_value, error_percentage):
### is not correct
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
e = float(error_percentage)
p_single_error = prob_kmer_has_specific_number_of_errors(k_value, error_percentage, 1)
p_not_identical = 1-(((1-e)**(k-1))*e*(1.0/3.0))
return p_single_error * (1 - p_not_identical**((l*n/g)-1))
def expected_number_of_kmers_with_exact_same_errors(genome_length, read_length, number_of_reads, k_value, error_percentage):
### is not correct
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
e = float(error_percentage)
prob = prob_multiple_identical_wrong_kmers(genome_length, read_length, number_of_reads, k_value, error_percentage)
return prob*(l*n/g)
def average_number_of_kmers_with_exactly_identical_errors(genome_length, read_length, number_of_reads, k_value, error_percentage):
# assumes that kmers with errors are chosen i.i.d from {A,C,G,T}^k
# actual number is higher, since kmers with errors have small hamilton-distance to correct kmers
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
e = float(error_percentage)
expected_number_of_kmers_with_error = (g-k)*n*l*(1-prob_kmer_is_correct(k_value, error_percentage))
return expected_number_of_kmers_with_error/(4.00**k)
def expected_number_of_kmers(genome_length, read_length, number_of_reads, k_value, error_percentage):
# this method ignores that two identical kmers can have the same error and that a kmer with error may be identical to a kmer without errors
### is not correct
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
e = float(error_percentage)
expected_number_of_kmers_with_error = (g-k)*n*l*(1-prob_kmer_is_correct(k_value, error_percentage))
expected_number_of_kmers_without_error = max((g-k), (g-k)*n*l*prob_kmer_is_correct(k_value, error_percentage))
return expected_number_of_kmers_with_error + expected_number_of_kmers_without_error
def expected_number_of_kmers_with_exactly_identical_errors(genome_length, read_length, number_of_reads, k_value, error_percentage):
g = float(genome_length)
l = float(read_length)
n = float(number_of_reads)
k = float(k_value)
e = float(error_percentage)
number_of_possible_error_kmers = 4.0**k
prob_kmer_with_error = (1-prob_kmer_is_correct(k_value, error_percentage))
prob_specific_kmer = prob_kmer_with_error * (1.0/number_of_possible_error_kmers)
total_maximum_number_of_kmers = (l-k)*n
binomprob = scs.binum(total_maximum_number_of_kmers, prob_specific_kmer)
i = 0
p = 0
while binomprob.pmf(i) > p:
i += 1
return i