From ce96de4039670ea377537285077a058ff4a2ddf5 Mon Sep 17 00:00:00 2001 From: hrrs Date: Sun, 24 Sep 2017 17:27:30 -0400 Subject: [PATCH] Turning in my MP1 --- gene_finder.py | 157 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 28 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 3b1e7dd..9445d2c 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -2,7 +2,7 @@ """ YOUR HEADER COMMENT HERE -@author: YOUR NAME HERE +@author: Harris Davidson """ @@ -30,10 +30,17 @@ def get_complement(nucleotide): >>> get_complement('C') 'G' """ - # TODO: implement this - pass + if nucleotide == 'A': return 'T' + elif nucleotide == 'T': return 'A' + elif nucleotide == 'C': return 'G' + elif nucleotide == 'G': return 'C' +# print(get_complement('A')) +# print(get_complement('T')) +# print(get_complement('C')) +#print(get_complement('G')) + def get_reverse_complement(dna): """ Computes the reverse complementary sequence of DNA for the specfied DNA sequence @@ -45,9 +52,12 @@ def get_reverse_complement(dna): >>> get_reverse_complement("CCGCGTTCA") 'TGAACGCGG' """ - # TODO: implement this - pass + compliment = "" + for f in dna: + compliment = compliment + get_complement(f) + return compliment[::-1] +#print(get_reverse_complement("ATGCCCGCTTT")) def rest_of_ORF(dna): """ Takes a DNA sequence that is assumed to begin with a start @@ -62,9 +72,41 @@ def rest_of_ORF(dna): >>> rest_of_ORF("ATGAGATAGG") 'ATGAGA' """ - # TODO: implement this - pass - + i = 0 + while i < len(dna): + codon = dna[i:i+3] + if (codon == 'TAG') or (codon == 'TAA') or (codon == 'TGA'): + return dna[0:i] + i = i+3 + return dna + +#print(rest_of_ORF("ATGAGATAGG")) + # pos = [] #indeices of stop codons + # b = 0 + # while b>-1: + # b = dna.find("TAG",b+1) + # pos = pos + [b] + # b=0 + # while b>-1: + # b = dna.find("TAA",b+1) + # pos = pos + [b] + # b=0 + # while b>-1: + # b = dna.find("TGA",b+1) + # pos = pos + [b] + # + # stops = [] + # + # for x in pos: + # if 0==x%3: + # stops.append(x) + # stops.sort + # if len(stops) == 0: + # return dna + # else: + # return dna[0:stops[-1]] + +#print(rest_of_ORF('ATGAGATAGG')) def find_all_ORFs_oneframe(dna): """ Finds all non-nested open reading frames in the given DNA @@ -79,9 +121,23 @@ def find_all_ORFs_oneframe(dna): >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") ['ATGCATGAATGTAGA', 'ATGTGCCC'] """ - # TODO: implement this - pass - + all_ORFs = [] + #print(all_ORFs) + + #for i in range(0,len(dna),3): + i = 0 + while i < len(dna)-3: + if dna[i:i+3] == 'ATG': + #print(rest_of_ORF(dna[i:])) + all_ORFs.append(rest_of_ORF(dna[i:])) + #print(all_ORFs[-1]) + #print(len(all_ORFs[-1])) + i = i + len(all_ORFs[-1]) + #print(i) + else: + i = i + 3 + return all_ORFs +#print(find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")) def find_all_ORFs(dna): """ Finds all non-nested open reading frames in the given DNA sequence in @@ -96,9 +152,8 @@ def find_all_ORFs(dna): >>> find_all_ORFs("ATGCATGAATGTAG") ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] """ - # TODO: implement this - pass - + return find_all_ORFs_oneframe(dna)+find_all_ORFs_oneframe(dna[1:])+find_all_ORFs_oneframe(dna[2:]) +#print(find_all_ORFs('ATGCATGAATGTAG')) def find_all_ORFs_both_strands(dna): """ Finds all non-nested open reading frames in the given DNA sequence on both @@ -109,9 +164,8 @@ def find_all_ORFs_both_strands(dna): >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") ['ATGCGAATG', 'ATGCTACATTCGCAT'] """ - # TODO: implement this - pass - + return find_all_ORFs(dna) + find_all_ORFs(get_reverse_complement(dna)) +#print(find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")) def longest_ORF(dna): """ Finds the longest ORF on both strands of the specified DNA and returns it @@ -119,8 +173,13 @@ def longest_ORF(dna): >>> longest_ORF("ATGCGAATGTAGCATCAAA") 'ATGCTACATTCGCAT' """ - # TODO: implement this - pass + all_ORFs = find_all_ORFs_both_strands(dna) + length = [] + for f in all_ORFs: + length.append(len(f)) + longest = length.index(max(length)) + return all_ORFs[longest] + #print(all_ORFs[longest]) def longest_ORF_noncoding(dna, num_trials): @@ -130,9 +189,15 @@ def longest_ORF_noncoding(dna, num_trials): dna: a DNA sequence num_trials: the number of random shuffles returns: the maximum length longest ORF """ - # TODO: implement this - pass + max_length = 0 + length = [] + for i in range(1,num_trials): + sdna = shuffle_string(dna) + if(max_length < len(longest_ORF(sdna))): + max_length = len(longest_ORF(sdna)) + return max_length +#print(longest_ORF_noncoding('ATGCGAATGTAGCATCAAA',3)) def coding_strand_to_AA(dna): """ Computes the Protein encoded by a sequence of DNA. This function @@ -148,8 +213,17 @@ def coding_strand_to_AA(dna): >>> coding_strand_to_AA("ATGCCCGCTTT") 'MPA' """ - # TODO: implement this - pass + acid_sequence = '' + for i in range(0,len(dna)-2,3): + #print(type(i)) + #codon = dna[i:i+3] + + #print(codon) + acid_sequence = acid_sequence + aa_table[dna[i:i+3]] + #print(acid_sequence) + return acid_sequence + +#print(coding_strand_to_AA("ATGCCCGCTTT")) def gene_finder(dna): @@ -158,9 +232,36 @@ def gene_finder(dna): dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ - # TODO: implement this - pass -if __name__ == "__main__": - import doctest - doctest.testmod() + minlength = longest_ORF_noncoding(dna,1500) + #minlength = 6000 + print(minlength) + all_ORFs = find_all_ORFs_both_strands(dna) + #print(all_ORFs) + all_coding_ORFs=[] + for i in range(0,len(all_ORFs)-1): + if len(all_ORFs[i]) > minlength: + #print(all_ORFs[i]) + all_coding_ORFs.append(all_ORFs[i]) + #else: + #print('too short') + #print(all_coding_ORFs) + amino_sequences=[] + for i in range(0,len(all_coding_ORFs)): + #print(coding_strand_to_AA(all_coding_ORFs[i])) + #print(all_coding_ORFs[i]) + amino_sequences.append(coding_strand_to_AA(all_coding_ORFs[i])) + + return amino_sequences + +#print(gene_finder("ATGCCCGCTTT")) + +from load import load_seq +dna = load_seq("./data/X73525.fa") + +print(gene_finder(dna)) +#gene_finder(dna) +# +# if __name__ == "__main__": +# import doctest +# doctest.testmod(verbose=True)