diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..7a8d988 --- /dev/null +++ b/build.sh @@ -0,0 +1,9 @@ + rm -rf build + mkdir build + cd build + cmake -Wno-dev .. + cmake --build . --config Release + + # Additionally, either copy the nsearch file from build/nsearch to /usr/local/bin or add the directory to path + # sudo cp nsearch/nsearch /usr/local/bin/ + # something like echo $PATH:$PWD/build/nsearch added to bashrc or something diff --git a/libnsearch/include/nsearch/Alnout/Writer.h b/libnsearch/include/nsearch/Alnout/Writer.h index 3c4740b..d024e79 100644 --- a/libnsearch/include/nsearch/Alnout/Writer.h +++ b/libnsearch/include/nsearch/Alnout/Writer.h @@ -4,6 +4,7 @@ #include "../Alphabet/DNA.h" #include "../Alphabet/Protein.h" +#include "../Alphabet/English.h" #include @@ -331,4 +332,22 @@ inline std::string Writer< Protein >::Unit() { return "aa"; } +// English specializations +template <> +inline char Writer< English >::MatchSymbol( const char A, const char B ) { + auto score = ScorePolicy< English >::Score( A, B ); + if( score >= 10 ) + return '|'; + if( score >= 5 ) + return ':'; + if( score > 0 ) + return '.'; + return ' '; +} + +template <> +inline std::string Writer< English >::Unit() { + return "Letter"; +} + } // namespace Alnout diff --git a/libnsearch/include/nsearch/Alphabet/English.h b/libnsearch/include/nsearch/Alphabet/English.h new file mode 100644 index 0000000..a16a456 --- /dev/null +++ b/libnsearch/include/nsearch/Alphabet/English.h @@ -0,0 +1,99 @@ +#pragma once + +#include "../Alphabet.h" + +struct English { + typedef char CharType; +}; + +// Scoring: First Try. Looking for exact match. +// Collapse letters into 5 bits +template <> +struct BitMapPolicy< English > { + static const size_t NumBits = 5; + + inline static int8_t BitMap( const char letter ) { + static const uint8_t BitMapping[] = { + 0b000000, // 'A' + 0b000001, // 'B' + 0b000010, // 'C' + 0b000011, // 'D' + 0b000100, // 'E' + 0b000101, // 'F' + 0b000110, // 'G' + 0b000111, // 'H' + 0b001000, // 'I' + 0b001001, // 'J' + 0b001010, // 'K' + 0b001011, // 'L' + 0b001100, // 'M' + 0b001101, // 'N' + 0b001110, // 'O' + 0b001111, // 'P' + 0b010000, // 'Q' + 0b010001, // 'R' + 0b010010, // 'S' + 0b010011, // 'T' + 0b010101, // 'U' + 0b010110, // 'V' + 0b010111, // 'W' + 0b011000, // 'X' + 0b011001, // 'Y' + 0b011010, // 'Z' + 0b011011, // ' ' space + }; + + auto val = BitMapping[ letter - 'A' ]; + // if( ( val & 0b010000 ) > 0 ) + // return -1; // ambiguity + + return ( val & 0b01111 ); + } +}; + +// BLOSUM62 +template <> +struct ScorePolicy< English > { + inline static int8_t Score( const char letterA, const char letterB ) { + static const int ScoreMatrixSize = 27; // 'A'...'Z' and ' ' + static const int8_t ScoreMatrix[ ScoreMatrixSize ][ ScoreMatrixSize ] = { + /* A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z, */ + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // A + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // B + { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // C + { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // D + { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // E + { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // F + { 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // G + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // H + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // I + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // J + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // K + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // L + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // M + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // N + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // O + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // P + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Q + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // R + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, // S + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, // T + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // U + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // V + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // W + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, // X + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}, // Y + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, // Z + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, // + }; + + return ScoreMatrix[ letterA - 'A' ][ letterB - 'A' ]; + } +}; + +template <> +struct MatchPolicy< English > { + inline static bool Match( const char letterA, const char letterB ) { + return ScorePolicy< English >::Score( letterA, letterB ) >= 4; + } +}; diff --git a/libnsearch/include/nsearch/Database/GlobalSearch.h b/libnsearch/include/nsearch/Database/GlobalSearch.h index 7e7be29..ad8ff4b 100644 --- a/libnsearch/include/nsearch/Database/GlobalSearch.h +++ b/libnsearch/include/nsearch/Database/GlobalSearch.h @@ -140,6 +140,20 @@ void GlobalSearch< A >::SearchForHits( const Sequence< A >& query, for( auto& sp : sps ) { size_t queryPos, candidatePos; + // check if we already have a HSP which this SP is part of + bool isContained = false; + for( auto it = hsps.cbegin(); it != hsps.cend(); ++it ) { + const HSP& hsp = *it; + if (sp.IsFullyContainedWithin(hsp)) { + isContained = true; + break; + } + } + + // do not extend this, since it's part of an HSP already + if (isContained) + continue; + size_t a1 = sp.a1, a2 = sp.a2, b1 = sp.b1, b2 = sp.b2; Cigar leftCigar; diff --git a/libnsearch/include/nsearch/Database/HSP.h b/libnsearch/include/nsearch/Database/HSP.h index 9774586..ac5ad83 100644 --- a/libnsearch/include/nsearch/Database/HSP.h +++ b/libnsearch/include/nsearch/Database/HSP.h @@ -32,6 +32,11 @@ class HSP { return ( a1 <= other.a2 && other.a1 <= a2 ) // overlap in A direction || ( b1 <= other.b2 && other.b1 <= b2 ); // overlap in B direction } + bool IsFullyContainedWithin( const HSP& other) const { + return (other.a1 <= a1 && other.a2 >= a2) && + (other.b1 <= b1 && other.b2 >= b2); + } + size_t DistanceTo( const HSP& other ) const { size_t dx = ( a1 > other.a2 ? a1 - other.a2 : other.a1 - a2 ); diff --git a/nsearch/src/Main.cpp b/nsearch/src/Main.cpp index a5f8546..fb67390 100644 --- a/nsearch/src/Main.cpp +++ b/nsearch/src/Main.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "Common.h" #include "Filter.h" @@ -22,7 +23,7 @@ static const char USAGE[] = R"( Usage: nsearch search --query= --db= - --out= --min-identity= [--max-hits=] [--max-rejects=] [--protein] [--strand=] + --out= --min-identity= [--max-hits=] [--max-rejects=] [--protein] [--english] [--strand=] nsearch merge --forward= --reverse= --out= nsearch filter --in= --out= [--max-expected-errors=] @@ -100,6 +101,8 @@ int main( int argc, const char** argv ) { if( args[ "--protein" ].asBool() ) { DoSearch< Protein >( query, db, out, ParseSearchParams< Protein >( args ) ); + } else if ( args[ "--english" ].asBool() ) { + DoSearch< English >( query, db, out, ParseSearchParams< English >( args ) ); } else { DoSearch< DNA >( query, db, out, ParseSearchParams< DNA >( args ) ); } diff --git a/nsearch/src/Search.cpp b/nsearch/src/Search.cpp index 7dd10d3..67568d3 100644 --- a/nsearch/src/Search.cpp +++ b/nsearch/src/Search.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -105,6 +106,11 @@ struct WordSize< Protein > { static const int VALUE = 5; }; +template <> +struct WordSize< English > { + static const int VALUE = 5; +}; + template < typename A > bool DoSearch( const std::string& queryPath, const std::string& databasePath, const std::string& outputPath, @@ -202,3 +208,6 @@ template bool DoSearch< DNA >( const std::string&, const std::string&, template bool DoSearch< Protein >( const std::string&, const std::string&, const std::string&, const SearchParams< Protein >& ); +template bool DoSearch< English >( const std::string&, const std::string&, + const std::string&, + const SearchParams< English >& );