From 1573f1be478a3846bc336c209bb8baec41a060f1 Mon Sep 17 00:00:00 2001 From: Aditya Jeevannavar Date: Mon, 18 Oct 2021 11:47:32 +0300 Subject: [PATCH 1/6] Added script to build library --- build.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 build.sh diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..d0b94f0 --- /dev/null +++ b/build.sh @@ -0,0 +1,9 @@ + rm -rf build + mkdir build + cd build + cmake -Wno-dev -D CMAKE_BUILD_TYPE=Release .. + make + + # Additionally, either copy the nsearch file from build/nsearch to /usr/local/bin or add the directory to path + # sudo cp nsearch/nsearch /usr/local/bin/ + # something like echo $PATH:$PWD/build/nsearch added to bashrc or something From 8baae8ff78d96cecab55c2b42d8bb464f207decd Mon Sep 17 00:00:00 2001 From: Aditya Jeevannavar Date: Mon, 18 Oct 2021 11:49:20 +0300 Subject: [PATCH 2/6] Added English alphabet for search --- libnsearch/include/nsearch/Alphabet/English.h | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 libnsearch/include/nsearch/Alphabet/English.h diff --git a/libnsearch/include/nsearch/Alphabet/English.h b/libnsearch/include/nsearch/Alphabet/English.h new file mode 100644 index 0000000..e71de15 --- /dev/null +++ b/libnsearch/include/nsearch/Alphabet/English.h @@ -0,0 +1,99 @@ +#pragma once + +#include "../Alphabet.h" + +struct English { + typedef char CharType; +}; + +// Scoring: First Try. Looking for exact match. +// Collapse letters into 5 bits +template <> +struct BitMapPolicy< Protein > { + static const size_t NumBits = 4; + + inline static int8_t BitMap( const char letters ) { + static const uint8_t BitMapping[] = { + 0b00000, // 'A' + 0b10000, // 'B' + 0b00011, // 'C' + 0b00100, // 'D' + 0b00100, // 'E' + 0b01111, // 'F' + 0b00101, // 'G' + 0b00110, // 'H' + 0b00111, // 'I' + 0b10000, // 'J' + 0b01001, // 'K' + 0b01000, // 'L' + 0b01010, // 'M' + 0b00010, // 'N' + 0b10000, // 'O' + 0b01011, // 'P' + 0b00100, // 'Q' + 0b00001, // 'R' + 0b01100, // 'S' + 0b01101, // 'T' + 0b10000, // 'U' + 0b00111, // 'V' + 0b01110, // 'W' + 0b10010, // 'X' + 0b01111, // 'Y' + 0b10001, // 'Z' + 0b10011, // ' ' space + }; + + auto val = BitMapping[ letters - 'A' ]; + if( ( val & 0b10000 ) > 0 ) + return -1; // ambiguity + + return ( val & 0b1111 ); + } +}; + +// BLOSUM62 +template <> +struct ScorePolicy< English > { + inline static int8_t Score( const char lettersA, const char lettersB ) { + static const int ScoreMatrixSize = 27; // 'A'...'Z' and ' ' + static const int8_t ScoreMatrix[ ScoreMatrixSize ][ ScoreMatrixSize ] = { + /* A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z, */ + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // A + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // B + { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // C + { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // D + { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // E + { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // F + { 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // G + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // H + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // I + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // J + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // K + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // L + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // M + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // N + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // O + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // P + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Q + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // R + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, // S + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, // T + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // U + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // V + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // W + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, // X + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}, // Y + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, // Z + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, // + }; + + return ScoreMatrix[ lettersA - 'A' ][ lettersB - 'A' ]; + } +}; + +template <> +struct MatchPolicy< English > { + inline static bool Match( const char lettersA, const char lettersB ) { + return ScorePolicy< English >::Score( lettersA, lettersB ) >= 10; + } +}; From 8549f25a41896639c00822b86862934eda36ae2f Mon Sep 17 00:00:00 2001 From: Aditya Jeevannavar Date: Mon, 18 Oct 2021 13:51:35 +0300 Subject: [PATCH 3/6] Added english alphabet arguments and calls --- nsearch/src/Main.cpp | 4 +++- nsearch/src/Search.cpp | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/nsearch/src/Main.cpp b/nsearch/src/Main.cpp index a5f8546..fd072ec 100644 --- a/nsearch/src/Main.cpp +++ b/nsearch/src/Main.cpp @@ -22,7 +22,7 @@ static const char USAGE[] = R"( Usage: nsearch search --query= --db= - --out= --min-identity= [--max-hits=] [--max-rejects=] [--protein] [--strand=] + --out= --min-identity= [--max-hits=] [--max-rejects=] [--protein] [--english] [--strand=] nsearch merge --forward= --reverse= --out= nsearch filter --in= --out= [--max-expected-errors=] @@ -100,6 +100,8 @@ int main( int argc, const char** argv ) { if( args[ "--protein" ].asBool() ) { DoSearch< Protein >( query, db, out, ParseSearchParams< Protein >( args ) ); + } else if (args[ "--english" ].asBol() ) { + DoSearch< English >( query, db, out, ParseSearchParams< Englsih >( args ) ); } else { DoSearch< DNA >( query, db, out, ParseSearchParams< DNA >( args ) ); } diff --git a/nsearch/src/Search.cpp b/nsearch/src/Search.cpp index 7dd10d3..4c7487b 100644 --- a/nsearch/src/Search.cpp +++ b/nsearch/src/Search.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -105,6 +106,10 @@ struct WordSize< Protein > { static const int VALUE = 5; }; +struct WordSize< English > { + static const int VALUE = 5; +}; + template < typename A > bool DoSearch( const std::string& queryPath, const std::string& databasePath, const std::string& outputPath, @@ -202,3 +207,6 @@ template bool DoSearch< DNA >( const std::string&, const std::string&, template bool DoSearch< Protein >( const std::string&, const std::string&, const std::string&, const SearchParams< Protein >& ); +template bool DoSearch< English >( const std::string&, const std::string&, + const std::string&, + const SearchParams< English >& ); From 477a0e944f985a3fd6a8b63976b619bca20ff4f4 Mon Sep 17 00:00:00 2001 From: Aditya Jeevannavar Date: Tue, 19 Oct 2021 12:10:04 +0300 Subject: [PATCH 4/6] Corrected errors --- libnsearch/include/nsearch/Alphabet/English.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libnsearch/include/nsearch/Alphabet/English.h b/libnsearch/include/nsearch/Alphabet/English.h index e71de15..cb306f1 100644 --- a/libnsearch/include/nsearch/Alphabet/English.h +++ b/libnsearch/include/nsearch/Alphabet/English.h @@ -9,7 +9,7 @@ struct English { // Scoring: First Try. Looking for exact match. // Collapse letters into 5 bits template <> -struct BitMapPolicy< Protein > { +struct BitMapPolicy< English > { static const size_t NumBits = 4; inline static int8_t BitMap( const char letters ) { From 799791d20012c810409a1cd876b5976897cdeab8 Mon Sep 17 00:00:00 2001 From: Aditya Jeevannavar Date: Tue, 19 Oct 2021 12:11:28 +0300 Subject: [PATCH 5/6] Added all requisite support for English alphabet --- libnsearch/include/nsearch/Alnout/Writer.h | 19 +++++++++++++++++++ nsearch/src/Main.cpp | 5 +++-- nsearch/src/Search.cpp | 1 + 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/libnsearch/include/nsearch/Alnout/Writer.h b/libnsearch/include/nsearch/Alnout/Writer.h index 3c4740b..d024e79 100644 --- a/libnsearch/include/nsearch/Alnout/Writer.h +++ b/libnsearch/include/nsearch/Alnout/Writer.h @@ -4,6 +4,7 @@ #include "../Alphabet/DNA.h" #include "../Alphabet/Protein.h" +#include "../Alphabet/English.h" #include @@ -331,4 +332,22 @@ inline std::string Writer< Protein >::Unit() { return "aa"; } +// English specializations +template <> +inline char Writer< English >::MatchSymbol( const char A, const char B ) { + auto score = ScorePolicy< English >::Score( A, B ); + if( score >= 10 ) + return '|'; + if( score >= 5 ) + return ':'; + if( score > 0 ) + return '.'; + return ' '; +} + +template <> +inline std::string Writer< English >::Unit() { + return "Letter"; +} + } // namespace Alnout diff --git a/nsearch/src/Main.cpp b/nsearch/src/Main.cpp index fd072ec..64344f8 100644 --- a/nsearch/src/Main.cpp +++ b/nsearch/src/Main.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "Common.h" #include "Filter.h" @@ -100,8 +101,8 @@ int main( int argc, const char** argv ) { if( args[ "--protein" ].asBool() ) { DoSearch< Protein >( query, db, out, ParseSearchParams< Protein >( args ) ); - } else if (args[ "--english" ].asBol() ) { - DoSearch< English >( query, db, out, ParseSearchParams< Englsih >( args ) ); + } else if (args[ "--english" ].asBool() ) { + DoSearch< English >( query, db, out, ParseSearchParams< English >( args ) ); } else { DoSearch< DNA >( query, db, out, ParseSearchParams< DNA >( args ) ); } diff --git a/nsearch/src/Search.cpp b/nsearch/src/Search.cpp index 4c7487b..67568d3 100644 --- a/nsearch/src/Search.cpp +++ b/nsearch/src/Search.cpp @@ -106,6 +106,7 @@ struct WordSize< Protein > { static const int VALUE = 5; }; +template <> struct WordSize< English > { static const int VALUE = 5; }; From f34f1dbdc2eab8d33c0b8adeba93d15a4dbe504c Mon Sep 17 00:00:00 2001 From: jeevannavar Date: Thu, 1 Dec 2022 15:49:54 +0200 Subject: [PATCH 6/6] modifications --- build.sh | 4 +- libnsearch/include/nsearch/Alphabet/English.h | 74 +++++++++---------- .../include/nsearch/Database/GlobalSearch.h | 14 ++++ libnsearch/include/nsearch/Database/HSP.h | 5 ++ nsearch/src/Main.cpp | 2 +- 5 files changed, 59 insertions(+), 40 deletions(-) diff --git a/build.sh b/build.sh index d0b94f0..7a8d988 100644 --- a/build.sh +++ b/build.sh @@ -1,8 +1,8 @@ rm -rf build mkdir build cd build - cmake -Wno-dev -D CMAKE_BUILD_TYPE=Release .. - make + cmake -Wno-dev .. + cmake --build . --config Release # Additionally, either copy the nsearch file from build/nsearch to /usr/local/bin or add the directory to path # sudo cp nsearch/nsearch /usr/local/bin/ diff --git a/libnsearch/include/nsearch/Alphabet/English.h b/libnsearch/include/nsearch/Alphabet/English.h index cb306f1..a16a456 100644 --- a/libnsearch/include/nsearch/Alphabet/English.h +++ b/libnsearch/include/nsearch/Alphabet/English.h @@ -10,51 +10,51 @@ struct English { // Collapse letters into 5 bits template <> struct BitMapPolicy< English > { - static const size_t NumBits = 4; + static const size_t NumBits = 5; - inline static int8_t BitMap( const char letters ) { + inline static int8_t BitMap( const char letter ) { static const uint8_t BitMapping[] = { - 0b00000, // 'A' - 0b10000, // 'B' - 0b00011, // 'C' - 0b00100, // 'D' - 0b00100, // 'E' - 0b01111, // 'F' - 0b00101, // 'G' - 0b00110, // 'H' - 0b00111, // 'I' - 0b10000, // 'J' - 0b01001, // 'K' - 0b01000, // 'L' - 0b01010, // 'M' - 0b00010, // 'N' - 0b10000, // 'O' - 0b01011, // 'P' - 0b00100, // 'Q' - 0b00001, // 'R' - 0b01100, // 'S' - 0b01101, // 'T' - 0b10000, // 'U' - 0b00111, // 'V' - 0b01110, // 'W' - 0b10010, // 'X' - 0b01111, // 'Y' - 0b10001, // 'Z' - 0b10011, // ' ' space + 0b000000, // 'A' + 0b000001, // 'B' + 0b000010, // 'C' + 0b000011, // 'D' + 0b000100, // 'E' + 0b000101, // 'F' + 0b000110, // 'G' + 0b000111, // 'H' + 0b001000, // 'I' + 0b001001, // 'J' + 0b001010, // 'K' + 0b001011, // 'L' + 0b001100, // 'M' + 0b001101, // 'N' + 0b001110, // 'O' + 0b001111, // 'P' + 0b010000, // 'Q' + 0b010001, // 'R' + 0b010010, // 'S' + 0b010011, // 'T' + 0b010101, // 'U' + 0b010110, // 'V' + 0b010111, // 'W' + 0b011000, // 'X' + 0b011001, // 'Y' + 0b011010, // 'Z' + 0b011011, // ' ' space }; - auto val = BitMapping[ letters - 'A' ]; - if( ( val & 0b10000 ) > 0 ) - return -1; // ambiguity + auto val = BitMapping[ letter - 'A' ]; + // if( ( val & 0b010000 ) > 0 ) + // return -1; // ambiguity - return ( val & 0b1111 ); + return ( val & 0b01111 ); } }; // BLOSUM62 template <> struct ScorePolicy< English > { - inline static int8_t Score( const char lettersA, const char lettersB ) { + inline static int8_t Score( const char letterA, const char letterB ) { static const int ScoreMatrixSize = 27; // 'A'...'Z' and ' ' static const int8_t ScoreMatrix[ ScoreMatrixSize ][ ScoreMatrixSize ] = { /* A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z, */ @@ -87,13 +87,13 @@ struct ScorePolicy< English > { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, // }; - return ScoreMatrix[ lettersA - 'A' ][ lettersB - 'A' ]; + return ScoreMatrix[ letterA - 'A' ][ letterB - 'A' ]; } }; template <> struct MatchPolicy< English > { - inline static bool Match( const char lettersA, const char lettersB ) { - return ScorePolicy< English >::Score( lettersA, lettersB ) >= 10; + inline static bool Match( const char letterA, const char letterB ) { + return ScorePolicy< English >::Score( letterA, letterB ) >= 4; } }; diff --git a/libnsearch/include/nsearch/Database/GlobalSearch.h b/libnsearch/include/nsearch/Database/GlobalSearch.h index 7e7be29..ad8ff4b 100644 --- a/libnsearch/include/nsearch/Database/GlobalSearch.h +++ b/libnsearch/include/nsearch/Database/GlobalSearch.h @@ -140,6 +140,20 @@ void GlobalSearch< A >::SearchForHits( const Sequence< A >& query, for( auto& sp : sps ) { size_t queryPos, candidatePos; + // check if we already have a HSP which this SP is part of + bool isContained = false; + for( auto it = hsps.cbegin(); it != hsps.cend(); ++it ) { + const HSP& hsp = *it; + if (sp.IsFullyContainedWithin(hsp)) { + isContained = true; + break; + } + } + + // do not extend this, since it's part of an HSP already + if (isContained) + continue; + size_t a1 = sp.a1, a2 = sp.a2, b1 = sp.b1, b2 = sp.b2; Cigar leftCigar; diff --git a/libnsearch/include/nsearch/Database/HSP.h b/libnsearch/include/nsearch/Database/HSP.h index 9774586..ac5ad83 100644 --- a/libnsearch/include/nsearch/Database/HSP.h +++ b/libnsearch/include/nsearch/Database/HSP.h @@ -32,6 +32,11 @@ class HSP { return ( a1 <= other.a2 && other.a1 <= a2 ) // overlap in A direction || ( b1 <= other.b2 && other.b1 <= b2 ); // overlap in B direction } + bool IsFullyContainedWithin( const HSP& other) const { + return (other.a1 <= a1 && other.a2 >= a2) && + (other.b1 <= b1 && other.b2 >= b2); + } + size_t DistanceTo( const HSP& other ) const { size_t dx = ( a1 > other.a2 ? a1 - other.a2 : other.a1 - a2 ); diff --git a/nsearch/src/Main.cpp b/nsearch/src/Main.cpp index 64344f8..fb67390 100644 --- a/nsearch/src/Main.cpp +++ b/nsearch/src/Main.cpp @@ -101,7 +101,7 @@ int main( int argc, const char** argv ) { if( args[ "--protein" ].asBool() ) { DoSearch< Protein >( query, db, out, ParseSearchParams< Protein >( args ) ); - } else if (args[ "--english" ].asBool() ) { + } else if ( args[ "--english" ].asBool() ) { DoSearch< English >( query, db, out, ParseSearchParams< English >( args ) ); } else { DoSearch< DNA >( query, db, out, ParseSearchParams< DNA >( args ) );