-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPairwiseAlignment.cpp
More file actions
219 lines (185 loc) · 7.17 KB
/
PairwiseAlignment.cpp
File metadata and controls
219 lines (185 loc) · 7.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/* Implementation of the Needleman-Wunsch algoithm for pairwise sequence alignment in C++.
The function takes a FASTA file as it's only argument. The FASTA file should only contain two sequences, otherwise
the alignment between the last two sequences of the file will be returned. */
/* USAGE: Compile and run on Unix
$ g++ -o PairwiseAlign PairwiseAlignment.cpp
$ ./PairwiseAlign [path to FASTA file]*/
#include <iostream>
#include <string>
#include <fstream>
int main ( int argc, char **argv )
{
/* Get the two sequences to align from a FASTA file.
I borrowed heavily from this code http://rosettacode.org/wiki/FASTA_format#C.2B.2B for this portion.
In fact, the only alteration I made was to send the two sequences to the seq1 and seq2 strings instead
of printing them to the console as in the original code.*/
if( argc <= 1 ){
std::cerr << "Usage: "<<argv[0]<<" [FASTA file]" << std::endl;
return -1;
}
std::ifstream input(argv[1]);
if(!input.good()){
std::cerr << "Error opening '"<<argv[1]<<"'. Bailing out." << std::endl;
return -1;
}
std::string line, name, content, seq1, seq2;
while( std::getline( input, line ).good() ){
if( line.empty() || line[0] == '>' ){ // Identifier marker
if( !name.empty() ){ // Read the first sequence into seq1
seq1 = content;
name.clear();
}
if( !line.empty() ){
name = line.substr(1);
}
content.clear();
} else if( !name.empty() ){
if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
name.clear();
content.clear();
} else {
content += line;
}
}
}
if( !name.empty() ){ // Read the second sequence into seq2
seq2 = content;
}
std::cout << "-Sequences read from FASTA file-" << '\n' << "Seq1: " << seq1 << std::endl << "Seq2: " << seq2 << std::endl << '\n';
// Begin implementation of the Needleman-Wunsch algorithm to align the two sequences from the FASTA file
// Penalties
int match = 0;
int fiveprimegap = 8;
int threeprimegap = 7;
int gap = 11;
int mismatch = 4;
// An array to hold the scores of dimensions m * n where m is the number of bases in seq1, and n is the number of bases in seq2
int ScoreArray[seq1.length() + 1][seq2.length() + 1];
// An array to store the direction which was taken to arrive at a particular cell
char DirectionArray[seq1.length() + 1][seq2.length() + 1];
/* Initialize the score array so that the first row and column correspond to the blank scores. The first row is initialized with
values corresponding to the 5' gap penalty*/
for (unsigned i=0; i<seq1.length()+1; ++i)
{
ScoreArray[i][0] = i*gap;
DirectionArray[i][0] = 'U';
}
for (unsigned i = 0; i<seq2.length()+1; ++i)
{
ScoreArray[0][i] = i*fiveprimegap;
DirectionArray[0][i] = 'L';
}
// Fill in the rest of the score array
for (unsigned row=1; row<seq1.length() + 1; ++row)
{
for (unsigned column=1; column<seq2.length() + 1; ++column)
{
int DiagScore;
int LeftScore;
int UpScore;
int MinScore;
// The score of moving diagonally to a position (match or mismatch)
DiagScore = seq1.at(row - 1)==seq2.at(column - 1) ? ScoreArray[row - 1][column - 1] + match : ScoreArray[row - 1][column - 1] + mismatch;
// The score of moving from the cell to the left
LeftScore = ScoreArray[row][column - 1] + gap;
// The score of moving down from the top cell
UpScore = ScoreArray[row - 1][column] + gap;
// Select the minimum score and input it into the score array
MinScore = std::min(DiagScore, std::min(LeftScore, UpScore));
ScoreArray[row][column] = MinScore;
// Fill in the appropriate direction in the direction array
if(DiagScore == MinScore){
DirectionArray[row][column] = 'D';
} else if(LeftScore == MinScore)
{
DirectionArray[row][column] = 'L';
}else{
DirectionArray[row][column] = 'U';
}
}
}
// Print the score array
/* for(unsigned i=0; i<seq1.length()+1; ++i)
{
for(unsigned j=0; j<seq2.length()+1; ++j)
{
std::cout << ScoreArray[i][j] << ' ';
}
std::cout << "\n";
} */
/* I believe this is a pretty inelegant way to go about doing this next part, but anyway.... what I want to do now is calculate the
scores for the last row of the array as if they had incurred a three-prime gap penalty. If any cell in the last row has a
"corrected" score which takes this penalty into account that is still less than the score of the bottom right corner, I will
use that cell as the starting point for the traceback instead.*/
int CorrectedScores[seq2.length() + 1];
for(unsigned i=0; i<seq2.length() + 1; ++i)
{
CorrectedScores[i] = ScoreArray[seq1.length()][i] + ((seq2.length() - i)*threeprimegap);
}
// The "uncorrected" optimal score
int OptimalScore_start = ScoreArray[seq1.length()][seq2.length()];
// Find the smallest element of the corrected scores. This is the column where the traceback will begin
int SmallestIndex;
int SmallestVal = OptimalScore_start;
for(unsigned i=0; i<seq2.length(); ++i)
{
if(CorrectedScores[i] < SmallestVal)
{
SmallestIndex = i;
SmallestVal = CorrectedScores[i];
}
}
// Print the direction array
/* DirectionArray[0][0] = 'O';
for(unsigned i=0; i<seq1.length()+1; ++i)
{
for(unsigned j=0; j<seq2.length()+1; ++j)
{
std::cout << DirectionArray[i][j] << ' ';
}
std::cout << "\n";
}
std::cout << '\n'; */
// Trace back using the direction matrix to produce the alignment
int CurrentRow = seq1.length();
int CurrentCol = SmallestIndex;
std::string Align1 = "";
std::string Align2 = "";
// Number of 3' gaps
int GapNum = seq2.length() - SmallestIndex;
// Check if there is a 3' gap and add it if there is
if(SmallestIndex < seq2.length())
{
for(unsigned i=0; i < GapNum; ++i)
{
Align1.insert(0, 1, '-');
Align2.insert(0, 1, seq2.at(seq2.length() - i - 1));
}
}
while(CurrentRow >= 1 | CurrentCol >= 1)
{
if(DirectionArray[CurrentRow][CurrentCol] == 'D') // Moving diagonally
{
Align1.insert(0, 1, seq1.at(CurrentRow - 1));
Align2.insert(0, 1, seq2.at(CurrentCol - 1));
--CurrentRow;
--CurrentCol;
}
else if(DirectionArray[CurrentRow][CurrentCol] == 'U') // Moving up
{
Align1.insert(0, 1, seq1.at(CurrentRow - 1));
Align2.insert(0, 1, '-');
--CurrentRow;
}
else // Moving left
{
Align1.insert(0, 1, '-');
Align2.insert(0, 1, seq2.at(CurrentCol - 1));
--CurrentCol;
}
}
// Print the optimal alignment and its score
std::cout << "-Optimal Alignment-"<< '\n' << Align1 << '\n' << Align2 << '\n';
std::cout << "Alignment score: " << SmallestVal << '\n';
return 0;
}