Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 129 additions & 28 deletions gene_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
YOUR HEADER COMMENT HERE

@author: YOUR NAME HERE
@author: Harris Davidson

"""

def get_complement(nucleotide):
    """ Returns the complementary nucleotide.

        nucleotide: a nucleotide (A, C, G, or T) represented as a string
        returns: the complementary nucleotide as a string, or None when the
                 input is not one of the four upper-case letters above
    >>> get_complement('A')
    'T'
    >>> get_complement('C')
    'G'
    >>> get_complement('N') is None
    True
    """
    # A lookup table keeps the base pairing in one place; .get() preserves the
    # "unknown input yields None" behavior of the if/elif chain it replaces.
    complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return complements.get(nucleotide)


# print(get_complement('A'))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remember to delete your intermediate print statements when submitting your code.

# print(get_complement('T'))
# print(get_complement('C'))
#print(get_complement('G'))

def get_reverse_complement(dna):
    """ Computes the reverse complementary sequence of DNA for the specified
        DNA sequence.

        dna: a DNA sequence represented as a string
        returns: the reverse complementary DNA sequence represented as a string
    >>> get_reverse_complement("ATGCCCGCTTT")
    'AAAGCGGGCAT'
    >>> get_reverse_complement("CCGCGTTCA")
    'TGAACGCGG'
    """
    # Walking the strand backwards while complementing each base gives the
    # same result as complementing first and reversing afterwards, and a
    # single join avoids growing a string one character at a time.
    return "".join(get_complement(nucleotide) for nucleotide in reversed(dna))

#print(get_reverse_complement("ATGCCCGCTTT"))

def rest_of_ORF(dna):
    """ Takes a DNA sequence that is assumed to begin with a start
        codon and returns the sequence up to but not including the
        first in frame stop codon.  If there is no in frame stop codon,
        returns the whole string.

        dna: a DNA sequence
        returns: the open reading frame represented as a string
    >>> rest_of_ORF("ATGTGAA")
    'ATG'
    >>> rest_of_ORF("ATGAGATAGG")
    'ATGAGA'
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    # Step through the sequence one codon (three bases) at a time so that
    # only stop codons in the reading frame of the first base are considered.
    for start in range(0, len(dna), 3):
        if dna[start:start + 3] in stop_codons:
            return dna[:start]
    return dna

#print(rest_of_ORF("ATGAGATAGG"))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, delete commented-out portions of code when submitting.

# pos = [] #indices of stop codons
# b = 0
# while b>-1:
# b = dna.find("TAG",b+1)
# pos = pos + [b]
# b=0
# while b>-1:
# b = dna.find("TAA",b+1)
# pos = pos + [b]
# b=0
# while b>-1:
# b = dna.find("TGA",b+1)
# pos = pos + [b]
#
# stops = []
#
# for x in pos:
# if 0==x%3:
# stops.append(x)
# stops.sort
# if len(stops) == 0:
# return dna
# else:
# return dna[0:stops[-1]]

#print(rest_of_ORF('ATGAGATAGG'))

def find_all_ORFs_oneframe(dna):
    """ Finds all non-nested open reading frames in the given DNA
        sequence and returns them as a list.  Only the frame that starts
        at the first base of dna is examined: codons are read at offsets
        0, 3, 6, ...  Once an ORF is found, scanning resumes after it, so
        start codons inside an ORF do not produce additional entries.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
    ['ATGCATGAATGTAGA', 'ATGTGCCC']
    >>> find_all_ORFs_oneframe("CCCATG")
    ['ATG']
    """
    all_ORFs = []
    i = 0
    # "<=" (not "<") so a start codon occupying the final three bases is
    # still inspected; the strict comparison skipped that last codon.
    while i <= len(dna) - 3:
        if dna[i:i + 3] == 'ATG':
            orf = rest_of_ORF(dna[i:])
            all_ORFs.append(orf)
            # Jump past the ORF just recorded.  This lands on its stop codon
            # (skipped on the next pass) or at the end of the sequence.
            i += len(orf)
        else:
            i += 3
    return all_ORFs
#print(find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC"))

def find_all_ORFs(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence in
        all 3 possible frames and returns them as a list.  The frames are
        searched in order of offset 0, 1 and 2, and the results of each frame
        are appended in that order.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs("ATGCATGAATGTAG")
    ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
    """
    orfs = []
    # Dropping the first 0, 1 or 2 bases shifts the reading frame.
    for offset in range(3):
        orfs.extend(find_all_ORFs_oneframe(dna[offset:]))
    return orfs
#print(find_all_ORFs('ATGCATGAATGTAG'))

def find_all_ORFs_both_strands(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence on
        both strands.  ORFs from the given strand come first, followed by the
        ORFs found on its reverse complement.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
    ['ATGCGAATG', 'ATGCTACATTCGCAT']
    """
    forward_orfs = find_all_ORFs(dna)
    reverse_orfs = find_all_ORFs(get_reverse_complement(dna))
    return forward_orfs + reverse_orfs
#print(find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA"))

def longest_ORF(dna):
    """ Finds the longest ORF on both strands of the specified DNA and returns
        it as a string.  When several ORFs share the maximum length the first
        one found is returned; when the sequence contains no ORF at all an
        empty string is returned.

        dna: a DNA sequence
        returns: the longest ORF as a string ('' if there is none)
    >>> longest_ORF("ATGCGAATGTAGCATCAAA")
    'ATGCTACATTCGCAT'
    >>> longest_ORF("CCCCCC")
    ''
    """
    all_ORFs = find_all_ORFs_both_strands(dna)
    # max() raises ValueError on an empty list, so handle the "no start codon
    # anywhere" case explicitly instead of crashing.
    if not all_ORFs:
        return ''
    return max(all_ORFs, key=len)
#print(all_ORFs[longest])


def longest_ORF_noncoding(dna, num_trials):
    """ Computes the maximum length of the longest ORF over num_trials shuffles
        of the specified DNA sequence.

        dna: a DNA sequence
        num_trials: the number of random shuffles
        returns: the maximum length longest ORF (0 when num_trials is 0)
    """
    max_length = 0
    # range(num_trials) performs exactly num_trials shuffles; starting the
    # range at 1 would run one trial fewer than requested.
    for _ in range(num_trials):
        shuffled = shuffle_string(dna)
        # Search each shuffled sequence once and reuse the length.
        max_length = max(max_length, len(longest_ORF(shuffled)))
    return max_length

#print(longest_ORF_noncoding('ATGCGAATGTAGCATCAAA',3))

def coding_strand_to_AA(dna):
    """ Computes the protein encoded by a sequence of DNA.  The sequence is
        read from its first base in consecutive codons of three bases, and
        each codon is looked up in the module-level aa_table.  Trailing bases
        that do not form a complete codon are ignored.

        dna: a DNA sequence represented as a string
        returns: a string containing the sequence of amino acids encoded by
                 the input DNA fragment
    >>> coding_strand_to_AA("ATGCGA")
    'MR'
    >>> coding_strand_to_AA("ATGCCCGCTTT")
    'MPA'
    """
    # Stopping at len(dna) - 2 guarantees every start index has a full codon.
    codon_starts = range(0, len(dna) - 2, 3)
    return ''.join(aa_table[dna[start:start + 3]] for start in codon_starts)

#print(coding_strand_to_AA("ATGCCCGCTTT"))


def gene_finder(dna, num_trials=1500):
    """ Returns the amino acid sequences that are likely coded by the specified
        dna.  An ORF is kept only when it is strictly longer than the longest
        ORF found in num_trials random shuffles of dna.

        dna: a DNA sequence
        num_trials: number of random shuffles used to compute the length
                    threshold (defaults to 1500)
        returns: a list of all amino acid sequences coded by the sequence dna.
    """
    # Compute the threshold first, then search the real sequence.
    threshold = longest_ORF_noncoding(dna, num_trials)
    # Iterate over the ORFs directly so that every candidate, including the
    # last one in the list, is compared against the threshold.
    return [coding_strand_to_AA(orf)
            for orf in find_all_ORFs_both_strands(dna)
            if len(orf) > threshold]

#print(gene_finder("ATGCCCGCTTT"))

# Script section: the statements below run whenever this module is executed
# or imported, because they are not wrapped in an
# `if __name__ == "__main__":` guard.
from load import load_seq  # project-local module; not visible in this file
dna = load_seq("./data/X73525.fa")  # presumably returns the sequence as a string -- TODO confirm

# Run the full pipeline on the loaded sequence and print the resulting list.
print(gene_finder(dna))
# NOTE(review): the doctest runner below is currently disabled; re-enable it
# to execute the examples embedded in the docstrings of this module.
#
# if __name__ == "__main__":
#     import doctest
#     doctest.testmod(verbose=True)