Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
rm -rf build
mkdir build
cd build
cmake -Wno-dev ..
cmake --build . --config Release

# Additionally, either copy the nsearch file from build/nsearch to /usr/local/bin or add the directory to path
# sudo cp nsearch/nsearch /usr/local/bin/
# something like echo $PATH:$PWD/build/nsearch added to bashrc or something
19 changes: 19 additions & 0 deletions libnsearch/include/nsearch/Alnout/Writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "../Alphabet/DNA.h"
#include "../Alphabet/Protein.h"
#include "../Alphabet/English.h"

#include <fstream>

Expand Down Expand Up @@ -331,4 +332,22 @@ inline std::string Writer< Protein >::Unit() {
return "aa";
}

// English specializations
template <>
inline char Writer< English >::MatchSymbol( const char A, const char B ) {
auto score = ScorePolicy< English >::Score( A, B );
if( score >= 10 )
return '|';
if( score >= 5 )
return ':';
if( score > 0 )
return '.';
return ' ';
}

template <>
inline std::string Writer< English >::Unit() {
return "Letter";
}

} // namespace Alnout
99 changes: 99 additions & 0 deletions libnsearch/include/nsearch/Alphabet/English.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#pragma once

#include "../Alphabet.h"

struct English {
typedef char CharType;
};

// Scoring: First Try. Looking for exact match.
// Collapse letters into 5 bits
template <>
struct BitMapPolicy< English > {
static const size_t NumBits = 5;

inline static int8_t BitMap( const char letter ) {
static const uint8_t BitMapping[] = {
0b000000, // 'A'
0b000001, // 'B'
0b000010, // 'C'
0b000011, // 'D'
0b000100, // 'E'
0b000101, // 'F'
0b000110, // 'G'
0b000111, // 'H'
0b001000, // 'I'
0b001001, // 'J'
0b001010, // 'K'
0b001011, // 'L'
0b001100, // 'M'
0b001101, // 'N'
0b001110, // 'O'
0b001111, // 'P'
0b010000, // 'Q'
0b010001, // 'R'
0b010010, // 'S'
0b010011, // 'T'
0b010101, // 'U'
0b010110, // 'V'
0b010111, // 'W'
0b011000, // 'X'
0b011001, // 'Y'
0b011010, // 'Z'
0b011011, // ' ' space
};

auto val = BitMapping[ letter - 'A' ];
// if( ( val & 0b010000 ) > 0 )
// return -1; // ambiguity

return ( val & 0b01111 );
}
};

// BLOSUM62
template <>
struct ScorePolicy< English > {
inline static int8_t Score( const char letterA, const char letterB ) {
static const int ScoreMatrixSize = 27; // 'A'...'Z' and ' '
static const int8_t ScoreMatrix[ ScoreMatrixSize ][ ScoreMatrixSize ] = {
/* A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z, */
{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // A
{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // B
{ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // C
{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // D
{ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // E
{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // F
{ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // G
{ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // H
{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // I
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // J
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // K
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // L
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // M
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // N
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // O
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // P
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Q
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // R
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, // S
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, // T
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // U
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // V
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // W
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, // X
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}, // Y
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, // Z
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, //
};

return ScoreMatrix[ letterA - 'A' ][ letterB - 'A' ];
}
};

template <>
struct MatchPolicy< English > {
inline static bool Match( const char letterA, const char letterB ) {
return ScorePolicy< English >::Score( letterA, letterB ) >= 4;
}
};
14 changes: 14 additions & 0 deletions libnsearch/include/nsearch/Database/GlobalSearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,20 @@ void GlobalSearch< A >::SearchForHits( const Sequence< A >& query,
for( auto& sp : sps ) {
size_t queryPos, candidatePos;

// check if we already have a HSP which this SP is part of
bool isContained = false;
for( auto it = hsps.cbegin(); it != hsps.cend(); ++it ) {
const HSP& hsp = *it;
if (sp.IsFullyContainedWithin(hsp)) {
isContained = true;
break;
}
}

// do not extend this, since it's part of an HSP already
if (isContained)
continue;

size_t a1 = sp.a1, a2 = sp.a2, b1 = sp.b1, b2 = sp.b2;

Cigar leftCigar;
Expand Down
5 changes: 5 additions & 0 deletions libnsearch/include/nsearch/Database/HSP.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ class HSP {
return ( a1 <= other.a2 && other.a1 <= a2 ) // overlap in A direction
|| ( b1 <= other.b2 && other.b1 <= b2 ); // overlap in B direction
}
bool IsFullyContainedWithin( const HSP& other) const {
return (other.a1 <= a1 && other.a2 >= a2) &&
(other.b1 <= b1 && other.b2 >= b2);
}


size_t DistanceTo( const HSP& other ) const {
size_t dx = ( a1 > other.a2 ? a1 - other.a2 : other.a1 - a2 );
Expand Down
5 changes: 4 additions & 1 deletion nsearch/src/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <nsearch/Sequence.h>
#include <nsearch/Alphabet/DNA.h>
#include <nsearch/Alphabet/Protein.h>
#include <nsearch/Alphabet/English.h>

#include "Common.h"
#include "Filter.h"
Expand All @@ -22,7 +23,7 @@ static const char USAGE[] = R"(

Usage:
nsearch search --query=<queryfile> --db=<databasefile>
--out=<outputfile> --min-identity=<minidentity> [--max-hits=<maxaccepts>] [--max-rejects=<maxrejects>] [--protein] [--strand=<strand>]
--out=<outputfile> --min-identity=<minidentity> [--max-hits=<maxaccepts>] [--max-rejects=<maxrejects>] [--protein] [--english] [--strand=<strand>]
nsearch merge --forward=<forwardfile> --reverse=<reversefile> --out=<outputfile>
nsearch filter --in=<inputfile> --out=<outputfile> [--max-expected-errors=<maxee>]

Expand Down Expand Up @@ -100,6 +101,8 @@ int main( int argc, const char** argv ) {

if( args[ "--protein" ].asBool() ) {
DoSearch< Protein >( query, db, out, ParseSearchParams< Protein >( args ) );
} else if ( args[ "--english" ].asBool() ) {
DoSearch< English >( query, db, out, ParseSearchParams< English >( args ) );
} else {
DoSearch< DNA >( query, db, out, ParseSearchParams< DNA >( args ) );
}
Expand Down
9 changes: 9 additions & 0 deletions nsearch/src/Search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <nsearch/Sequence.h>
#include <nsearch/Alphabet/DNA.h>
#include <nsearch/Alphabet/Protein.h>
#include <nsearch/Alphabet/English.h>

#include <memory>

Expand Down Expand Up @@ -105,6 +106,11 @@ struct WordSize< Protein > {
static const int VALUE = 5;
};

template <>
struct WordSize< English > {
static const int VALUE = 5;
};

template < typename A >
bool DoSearch( const std::string& queryPath, const std::string& databasePath,
const std::string& outputPath,
Expand Down Expand Up @@ -202,3 +208,6 @@ template bool DoSearch< DNA >( const std::string&, const std::string&,
template bool DoSearch< Protein >( const std::string&, const std::string&,
const std::string&,
const SearchParams< Protein >& );
template bool DoSearch< English >( const std::string&, const std::string&,
const std::string&,
const SearchParams< English >& );