Skip to content
This repository was archived by the owner on Jul 28, 2022. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion bioseqdb_pg/bioseqdb--0.0.0.sql
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ CREATE TYPE bwa_result AS (
is_secondary BOOLEAN,
is_reverse BOOLEAN,
cigar TEXT,
score INTEGER
score INTEGER,
mismatch_count INTEGER,
gap_open INTEGER,
ident_n INTEGER,
alignment_len INTEGER
);

CREATE FUNCTION nuclseq_search_bwa(NUCLSEQ, CSTRING, CSTRING, CSTRING)
Expand Down
50 changes: 48 additions & 2 deletions bioseqdb_pg/bwa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,13 +265,55 @@ std::vector<BwaMatch> BwaIndex::align_sequence(std::string_view query) const {
// TODO: How do rb/re fields look in reverse matches?
int64_t ref_offset = index->bns->anns[align->rid].offset;
mem_aln_t details = mem_reg2aln(options, index->bns, index->pac, query.length(), query.data(), align);
int alignment_len = 0;
int gap_open = 0;
int identical = 0;
int mismatch_count = 0;
int ref_idx = 0, read_idx = 0;
std::string ref_subseq = extract_reference_subseq(index, align->rb, align->re);
std::basic_string_view<char> query_subseq = query.substr(align->qb, align->qe - align->qb);
for (size_t i = 0; i < details.n_cigar; ++i) {
int cnt = bam_cigar_oplen(details.cigar[i]);
char opcode = bam_cigar_opchr(details.cigar[i]);
alignment_len += cnt;
switch (opcode) {
case 'X': // Chyba nie jest generowane przez bwa.
ref_idx += cnt;
read_idx += cnt;
mismatch_count += cnt;
break;
case '=': // Chyba nie jest generowane przez bwa.
ref_idx += cnt;
read_idx += cnt;
identical += cnt;
break;
case 'M':
for (int j = 0; j < cnt; ++j, ++read_idx, ++ref_idx) {
if (ref_subseq[ref_idx] == query_subseq[read_idx])
++identical;
else
++mismatch_count;
}
break;
// TODO should deletions or insertions count as mismatches?
// In SeqLib/src/ssw_cpp.cpp they are, but in mmseqs2 (src/util/convertalignments.cpp) aren't.
case 'D':
gap_open += 1;
ref_idx += cnt;
break;
case 'I':
gap_open += 1;
read_idx += cnt;
break;
}
}
matches.push_back({
.ref_id = std::string_view(index->bns->anns[align->rid].name),
.ref_subseq = extract_reference_subseq(index, align->rb, align->re),
.ref_subseq = ref_subseq,
.ref_match_begin = align->rb - ref_offset,
.ref_match_end = align->re - ref_offset,
.ref_match_len = align->re - align->rb,
.query_subseq = query.substr(align->qb, align->qe - align->qb),
.query_subseq = query_subseq,
.query_match_begin = align->qb,
.query_match_end = align->qe,
.query_match_len = align->qe - align->qb,
Expand All @@ -281,6 +323,10 @@ std::vector<BwaMatch> BwaIndex::align_sequence(std::string_view query) const {
// TODO: Revert the CIGAR string and/or subsequences when the match is reversed?
.cigar = cigar_compressed_to_string(details.cigar, details.n_cigar),
.score = details.score,
.mismatch_count = mismatch_count,
.gap_open = gap_open,
.ident_n = identical,
.alignment_len = alignment_len,
});
free(details.cigar);
}
Expand Down
4 changes: 4 additions & 0 deletions bioseqdb_pg/bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ struct BwaMatch {
bool is_reverse;
std::string cigar;
int score;
int mismatch_count;
int gap_open;
int ident_n;
int alignment_len;
};

class BwaIndex {
Expand Down