-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsj-coverage.cpp
More file actions
executable file
·180 lines (161 loc) · 4.96 KB
/
sj-coverage.cpp
File metadata and controls
executable file
·180 lines (161 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/*
============================================================================
Name : read-coverage.cpp
Author : Francesca Nadalin
Version :
Description : Extract the reads spanning splicing junctions
============================================================================
*/
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <stdio.h>
#include <map>
#include <sys/time.h>
#include <sys/resource.h>
#include "Sam.h"
#include "Bed.h"
#define RESERVE_BLOCK 100
int main(int argc, char * argv[]) {
clock_t time1 = clock();
if (argc < 5)
{
std::cout << "Usage: <junctions.bed> <bamfile> <outfile_spanning> <outfile_nonspanning>" << std::endl;
std::cout << "<junctions.bed> BED file with junctions, and with junction ID in the last column" << std::endl;
std::cout << "Output file: TSV file with spanning read ID and list of junction IDs, comma-separated" << std::endl;
std::cout << "Output file: TSV file with non-spanning read ID and list of junction IDs, comma-separated" << std::endl;
return 1;
}
std::string bed_file(argv[1]);
std::string bam_file(argv[2]);
std::string out_sj_file(argv[3]);
std::string out_nsj_file(argv[4]);
std::ifstream bed;
bed.open(bed_file.c_str());
if (not bed.is_open())
{
std::cerr << "Error while opening file " << bed_file << " for reading" << std::endl;
return 2;
}
std::vector<Bed> beds; // vector of BED entries
beds.reserve(RESERVE_BLOCK);
for (std::string line; std::getline(bed, line); )
{
Bed B(line);
if (beds.capacity() == beds.size())
{
beds.reserve(beds.capacity() + RESERVE_BLOCK);
}
beds.push_back(B);
}
bed.close();
Sam S(bam_file);
if (not S.is_open())
{
std::cerr << "Error while opening file " << bam_file << " for reading" << std::endl;
}
std::ofstream out_sj;
out_sj.open(out_sj_file.c_str());
if (not out_sj.is_open())
{
std::cerr << "Error while opening file " << out_sj_file << " for reading" << std::endl;
return 2;
}
std::ofstream out_nsj;
out_nsj.open(out_nsj_file.c_str());
if (not out_nsj.is_open())
{
std::cerr << "Error while opening file " << out_nsj_file << " for reading" << std::endl;
return 2;
}
int n = 0;
while (S.read())
{
if (S.is_mapped())
{
// std::cout << "reading sam..." << std::endl;
// center of the read
// std::cout << "start: " << S.start() << std::endl;
// std::cout << "end: " << S.end() << std::endl;
std::vector<junction> jj = S.junctions();
size_t block_start = S.start();
std::string j_list_nspan;
// std::cout << "junctions:" << std::endl;
if (jj.size() > 0)
{
std::string j_list_span;
for (auto j : jj) {
// std::cout << j.first << " " << j.second << std::endl;
size_t block_end = j.first;
for (auto b : beds)
{
if (S.ref().compare(b.ref()) == 0)
{
if (j.first > b.start() - JUNCTION_COORD_TOL and
j.first < b.start() + JUNCTION_COORD_TOL and
j.second > b.end() - JUNCTION_COORD_TOL and
j.second < b.end() + JUNCTION_COORD_TOL)
{
// the read overlaps the junction
// here it is spliced: it encompasses the two spliced exons
if (j_list_span.size() > 0) j_list_span.append(",");
j_list_span.append(b.name());
}
else
{
if ((block_start < b.start() - JUNCTION_COORD_TOL and
block_end > b.start() + JUNCTION_COORD_TOL + MIN_INTRON_SPAN) or
(block_start < b.end() - JUNCTION_COORD_TOL - MIN_INTRON_SPAN and
block_end > b.start() + JUNCTION_COORD_TOL))
{
// the read overlaps the junction
// here it is not spliced: it spans one exon and the next intron
if (j_list_nspan.size() > 0) j_list_nspan.append(",");
j_list_nspan.append(b.name());
}
}
}
}
block_start = j.second;
}
if (j_list_span.size() > 0) out_sj << j_list_span << " " << S.id() << std::endl;
}
size_t block_end = S.end();
for (auto b : beds)
{
if (S.ref().compare(b.ref()) == 0)
{
if ((block_start < b.start() - JUNCTION_COORD_TOL and
block_end > b.start() + JUNCTION_COORD_TOL + MIN_INTRON_SPAN) or
(block_start < b.end() - JUNCTION_COORD_TOL - MIN_INTRON_SPAN and
block_end > b.start() + JUNCTION_COORD_TOL))
{
// the read overlaps the junction
// here it is not spliced: it spans one exon and the next intron
if (j_list_nspan.size() > 0) j_list_nspan.append(",");
j_list_nspan.append(b.name());
}
}
}
if (j_list_nspan.size() > 0) out_nsj << j_list_nspan << " " << S.id() << std::endl;
}
++n;
if (n%100000 == 0)
{
std::cout << ".";
std::cout.flush();
if (n%10000000 == 0)
{
std::cout << std::endl;
}
}
}
return 0;
double time2 = clock();
std::cout << std::endl;
std::cout << "Wall time: " << (double) (time2 - time1) / CLOCKS_PER_SEC << " s" << std::endl;
struct rusage usage;
getrusage(RUSAGE_SELF, &usage);
std::cout << "RSS: " << usage.ru_maxrss << std::endl;
return 0;
}