Skip to content
Open
319 changes: 226 additions & 93 deletions BESST/GenerateOutput.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
'''
Created on Sep 29, 2011

@author: ksahlin


Updates on march, 2017
Updates on august, 2018
@contributor: sletort

This file is part of BESST.

BESST is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

BESST is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with BESST. If not, see <http://www.gnu.org/licenses/>.
'''
Expand Down Expand Up @@ -91,118 +94,250 @@ def WriteToF(F, Contigs, list_of_contigs):
info_list.append((cont_obj.name, cont_obj.direction, cont_obj.position, cont_obj.length, cont_obj.sequence)) #,cont_obj.links
if cont_obj.position < 0:
print('Write to F: Position is negative!', cont_obj.position, cont_obj.name, cont_obj.direction)
#del Contigs[cont_obj.name]
#del Contigs[cont_obj.name]
F.append(info_list)
return(F)

# ==================================
def compute_kmer_overlap( end1,end2 ):
# should be a class method of Scaffold ?
i = len(end1)
while i > 0:
if end1[-i:] == end2[:i]:
return i
i -= 1
return i

# thoses classes are based on AGP concept
# Scaffold is 'Object' in AGP concept
class Component(object):
"""'Component' is for what compose the 'Object', 'Contig' and 'Gap'."""
i = 0 # an instance counter, a way to have id for component.

def __init__( self, id, len, seq ):
super( Component, self ).__init__()
Component.i += 1

self._len = len
self._id = id if( id != None ) else Component.i
self._seq = seq

# position of the first/last contig base that belong to the scaffold
# contig coordinate
self._scaff_start = 1
self._scaff_end = len

def __del__( self ):
Component.i -= 1

@property
def seq( self ):
return self._seq

@property
def len( self ):
return self._len

@property
def id( self ):
return self._id

@property
def scaff_start( self ):
return self._scaff_start
@scaff_start.setter
def scaff_start( self, value ):
self._scaff_start = value
#~ self._scaff_end -= value

@property
def scaff_end( self ):
return self._scaff_end

def get_scaff_ctg_coords( self ):
return [ self.scaff_start, self.scaff_end ]

def __str__( self ):
s = "\t".join([ str(x) for x in [self._id, self._len] ])
s += "\t".join([ str(x) for x in [ '\ncoords' ]+self.get_scaff_ctg_coords() ])
s += "\n" + self._seq
return s

class Contig( Component ):
'''a Contig is a Component with a direction(=strand).

This class provide a builder and sequence is returned regarding direction.
'''
def __init__( self, id, len, direction, seq ):
super( Contig, self ).__init__( id, len, seq )
self.__direction = direction

@classmethod
def buildFromTuple( cls, t_ctg ):
return cls( t_ctg[0], t_ctg[3], t_ctg[1], t_ctg[4] )

@property
def seq( self ):
if self.direction:
return self._seq
else:
return RevComp( self._seq,rev_nuc )

@property
def direction( self ):
return self.__direction

class Gap( Component ):
'''a Gap is a component whose sequence is a repetition of N or n.'''

def __init__( self, char, len ):
super( Gap, self ).__init__( None, len, char*len )

# ==================================

class Scaffold(object):
"""docstring for Scaffold"""
def __init__(self, name, param, info_tuple):
super(Scaffold, self).__init__()
self.name = name
super( Scaffold, self ).__init__()
self.name = name
self.param = param

self.seqs = [x[4] for x in info_tuple]
self.gaps = list(map(lambda x,y : y[2] - (x[2] + x[3]), info_tuple[:-1],info_tuple[1:]))
self.directions = [x[1] for x in info_tuple]
self.positions = [(x[2],x[2] + x[3] - 1) for x in info_tuple]
self.contigs = [x[0] for x in info_tuple]

def check_kmer_overlap(self,end1,end2):
i = len(end1)
while i > 0:
if end1[-i:] == end2[:i]:
return i
i -= 1
return i

def get_sequence(self, string, direction):
if direction:
return string
else:
return RevComp(string,rev_nuc)
self.l_components = []
self.__scaff_len = 0
self.__buildComponents( info_tuple )

@property
def scaff_len( self ):
return self.__scaff_len

def make_fasta_string(self,fasta_file):
fasta = []
#fasta.append('>{0}\n'.format(self.name))
fasta.append('>'+str(self.name)+'\n')
# first contig
def __incorporateComponent( self, o_compo, compo_pos=0 ):
ctg_offset = 0

fasta.append( self.get_sequence(self.seqs[0], self.directions[0]))
if( 0 == len( self.l_components ) ):
# the first contig align all along the scaffold
s_start = compo_pos + 1 # if the first contig start on base 1 of the scaffold, use 1 make it easier to understand
# compo_pos + 1 to turn to 1-based.
else:
s_start = self.scaff_len + 1 # the base following the last one

for i in range(len(self.seqs)-1):
gap = self.gaps[i]
if gap <= 2*self.param.std_dev_ins_size:
overlap = self.check_kmer_overlap( self.get_sequence(self.seqs[i], self.directions[i])[-self.param.max_contig_overlap:], self.get_sequence(self.seqs[i+1], self.directions[i+1])[:self.param.max_contig_overlap])
if overlap >= 20:
fasta.append('n' + self.get_sequence(self.seqs[i+1], self.directions[i+1])[overlap:])
print('merging {0} bp here'.format(overlap), file=self.param.information_file)
else:
#print gap
if gap <= 1:
fasta.append('n' + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
else:
fasta.append('N'*int(gap) + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
else:
if gap <= 1:
fasta.append('n' + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
else:
fasta.append('N'*int(gap) + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
s_end = s_start + o_compo.len - o_compo.scaff_start
self.__scaff_len = s_end
self.l_components.append([ o_compo, s_start, s_end ])

def __buildComponents( self, l_info_tuples ):
o_ctg = Contig.buildFromTuple( l_info_tuples[0] )

self.__incorporateComponent( o_ctg, l_info_tuples[0][2] )

print(''.join([ x for x in fasta]), file=fasta_file)
max_gap_size = 2*self.param.std_dev_ins_size # not good var name, max_small_gap_size ?
max_overlap = self.param.max_contig_overlap

def make_AGP_string(self, AGP_file):
for i in range( 1, len( l_info_tuples ) ):
o_next = Contig.buildFromTuple( l_info_tuples[i] )
next_pos = l_info_tuples[i][2] +1 # 1-based

gap_estimate = next_pos - ( l_info_tuples[i-1][2]+1 + o_ctg.len )
if( max_gap_size < gap_estimate ):
#~ print "***Big Gap"
o_gap = Gap( 'N', gap_estimate )
else:
# compute the real overlap
prev_seq = o_ctg.seq[-max_overlap:]
next_seq = o_next.seq[:max_overlap]
overlap = compute_kmer_overlap( prev_seq, next_seq )

#~ print "Overlap = " + str( overlap )
if( 20 <= overlap ): # big overlap
#~ print "***Big Overlap"
print( 'merging {0} bp here'.format(overlap), file=self.param.information_file )
o_gap = Gap( 'n', 1 )
o_next.scaff_start = overlap + 1 # +1 to turn 1-based
elif( gap_estimate <= 1 ): # supposed overlap, but not found
#~ print "***Supposed overlap"
o_gap = Gap( 'n', 1 )
else:
o_gap = Gap( 'N', gap_estimate )

self.__incorporateComponent( o_gap )
self.__incorporateComponent( o_next )
o_ctg = o_next

def make_fasta_string( self,fasta_file ):
'''write a fasta sequence from the object.

Builds a header and concatenates Component sequences.
When no overlapping, offset will be 0.
'''
l_fasta = [ '>'+str(self.name)+'\n' ]
for o_compo in [ l_[0] for l_ in self.l_components ]:
offset = o_compo.scaff_start - 1 # scaff_start is 1-based
l_fasta.append( o_compo.seq[offset:] )

print( ''.join( l_fasta ), file=fasta_file )

def make_AGP_string( self, AGP_file ):
'''write AGP rows from the object.

AGP row starts with scaff_id, scaff_start, scaff_end.
Follows the component number, type.
Then depending on type we have
for type=contig id, scaff_start, scaff_end, orientation
for type=gap len, gap_type, linkage, links_evidence
'''
component_count = 0
for i in range( len(self.seqs) ):
sign = '+' if self.directions[i] else '-'
if i > 0 and self.gaps[i-1] > 0:
component_count += 1
obj_start = self.positions[i-1][1] + 2
obj_end = self.positions[i][0]
compo_len = self.gaps[i-1]
l_elts = [ self.name, obj_start, obj_end, component_count ]
l_elts +=[ 'N', compo_len, 'scaffold', 'yes', 'paired-ends' ]
print('\t'.join([ str(x) for x in l_elts ]), file=AGP_file)

for l_compo in self.l_components:
component_count += 1
obj_start = self.positions[i][0] + 1
obj_end = self.positions[i][1] + 1
compo_len = self.positions[i][1] - self.positions[i][0] + 1
l_elts = [ self.name, obj_start, obj_end, component_count ]
l_elts +=[ 'W', self.contigs[i],'1', compo_len, sign ]
print('\t'.join([ str(x) for x in l_elts ]), file=AGP_file)
o_compo = l_compo[0]
l_elts = [ self.name, l_compo[1], l_compo[2], component_count ]

if( isinstance( o_compo, Gap ) ):
l_component = [ 'N', o_compo.len, 'scaffold', 'yes', 'paired-ends' ]
else:
strand = '+' if o_compo.direction else '-'
l_component = [ 'W', o_compo.id ]
l_component += o_compo.get_scaff_ctg_coords()
l_component.append( strand )

print( '\t'.join([ str(x) for x in l_elts + l_component ]), file=AGP_file )

def make_GFF_string( self, gff_file ):
'''write GFF rows from the object.

GFF row starts with scaff_id, source, component type, scaff_start, scaff_end.
Follows score, strand, phase and ends with attributes.
Note: we only provide ID and Name as attributes.
'''
source = 'besst_assembly'
for i in range( len(self.seqs) ):
sign = '+' if self.directions[i] else '-'
if i > 0 and self.gaps[i-1] > 0:
obj_start = self.positions[i-1][1] + 2
obj_end = self.positions[i][0]
l_attrs = []
l_elts = [ self.name, source, 'gap', obj_start, obj_end ]
l_elts += [ '.', '.', '.', ';'.join( l_attrs ) ]
print('\t'.join([ str(x) for x in l_elts ]), file=gff_file)

obj_start = self.positions[i][0] + 1
obj_end = self.positions[i][1] + 1
name = "_".join( self.contigs[i].split('_',2)[:2] ) # only kept NODE_XXXX from ID
l_attrs = [ 'ID='+self.contigs[i], 'Name='+name ]
l_elts = [ self.name, source, 'contig', obj_start, obj_end ]
l_elts += [ '.', sign, '.', ';'.join( l_attrs ) ]
print('\t'.join([ str(x) for x in l_elts ]), file=gff_file)

def PrintOutput(F, Information, output_dest, param, pass_nr):

for l_compo in self.l_components:
o_compo = l_compo[0]

( score, strand, phase ) = ( '.', '.', '.' )
if( isinstance( o_compo, Gap ) ):
type_ = 'gap'
l_attrs = []
else:
type_ = 'contig'
strand = '+' if o_compo.direction else '-'
name = "_".join( o_compo.id.split('_',2)[:2] ) # only kept NODE_XXXX from ID
l_attrs = [ 'ID='+ o_compo.id, 'Name='+name ]

l_elts = [ self.name, source, type_, l_compo[1], l_compo[2] ]
l_elts += [ score, strand, phase, ';'.join( l_attrs ) ]

print( '\t'.join([ str(x) for x in l_elts ]), file=gff_file )


def PrintOutput(F, param, pass_nr):
try:
os.mkdir(param.output_directory + '/pass' + str(pass_nr))
except OSError:
#directory is already created
pass
#contigs_before=len(C_dict)
contigs_after = len(F)
print('(super)Contigs after scaffolding: ' + str(contigs_after) + '\n', file=Information)
print('(super)Contigs after scaffolding: ' + str(contigs_after) + '\n', file=param.information_file)
gff_file = open(param.output_directory + '/pass' + str(pass_nr) + '/info-pass' + str(pass_nr) + '.gff', 'w')
AGP_file = open(param.output_directory + '/pass' + str(pass_nr) + '/info-pass' + str(pass_nr) + '.agp', 'w')
print('##gff-version 3', file=gff_file)
Expand All @@ -219,8 +354,8 @@ def PrintOutput(F, Information, output_dest, param, pass_nr):


s = Scaffold('scaffold_' + str(header_index) + "_uid_" + unique_id , param, scaf)
s.make_fasta_string(fasta_file)
s.make_AGP_string(AGP_file)
s.make_fasta_string( fasta_file )
s.make_AGP_string( AGP_file )
s.make_GFF_string( gff_file )

return()
Expand All @@ -230,5 +365,3 @@ def RevComp(string, rev_nuc):
#rev_nuc={'A':'T','C':'G','G':'C','T':'A','N':'N','X':'X'}
rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
return(rev_comp)


2 changes: 1 addition & 1 deletion runBESST
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def main(options):
F = GO.WriteToF(F, Contigs, list_of_contigs)


GO.PrintOutput(F, Information, param.output_directory, param, i + 1)
GO.PrintOutput(F, param, i + 1)

if not options.separate_repeats:
destination = open("{0}/pass{1}/Scaffolds_pass{1}.fa".format(param.output_directory, str(i+1) ), 'wb')
Expand Down