From ce96de4039670ea377537285077a058ff4a2ddf5 Mon Sep 17 00:00:00 2001
From: hrrs <harris.davidson@students.olin.edu>
Date: Sun, 24 Sep 2017 17:27:30 -0400
Subject: [PATCH] Turning in my MP1

---
 gene_finder.py | 157 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 129 insertions(+), 28 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 3b1e7dd..9445d2c 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -2,7 +2,7 @@
 """
 YOUR HEADER COMMENT HERE
 
-@author: YOUR NAME HERE
+@author: Harris Davidson
 
 """
 
@@ -30,10 +30,17 @@ def get_complement(nucleotide):
     >>> get_complement('C')
     'G'
     """
-    # TODO: implement this
-    pass
+    # Watson-Crick base pairing for the four DNA nucleotides.
+    pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
+    # dict.get yields None for anything that is not one of A, T, C or G.
+    return pairs.get(nucleotide)
 
 
+# print(get_complement('A'))
+# print(get_complement('T'))
+# print(get_complement('C'))
+#print(get_complement('G'))
+
 def get_reverse_complement(dna):
     """ Computes the reverse complementary sequence of DNA for the specfied DNA
         sequence
@@ -45,9 +52,12 @@ def get_reverse_complement(dna):
     >>> get_reverse_complement("CCGCGTTCA")
     'TGAACGCGG'
     """
-    # TODO: implement this
-    pass
+    # Complement every base in order, then reverse the whole strand; join
+    # builds the string in one pass instead of repeated concatenation.
+    complement = "".join(get_complement(base) for base in dna)
+    return complement[::-1]
 
+#print(get_reverse_complement("ATGCCCGCTTT"))
 
 def rest_of_ORF(dna):
     """ Takes a DNA sequence that is assumed to begin with a start
@@ -62,9 +72,41 @@ def rest_of_ORF(dna):
     >>> rest_of_ORF("ATGAGATAGG")
     'ATGAGA'
     """
-    # TODO: implement this
-    pass
-
+    # Walk the sequence codon by codon, in frame with its first base.
+    for start in range(0, len(dna), 3):
+        # An in-frame stop codon ends the ORF; the stop itself is excluded.
+        if dna[start:start+3] in ('TAG', 'TAA', 'TGA'):
+            return dna[:start]
+    # No in-frame stop codon was found: the whole sequence is the ORF.
+    return dna
+
+#print(rest_of_ORF("ATGAGATAGG"))
+    # pos = [] #indeices of stop codons
+    # b = 0
+    # while b>-1:
+    #     b = dna.find("TAG",b+1)
+    #     pos = pos + [b]
+    # b=0
+    # while b>-1:
+    #     b = dna.find("TAA",b+1)
+    #     pos = pos + [b]
+    # b=0
+    # while b>-1:
+    #     b = dna.find("TGA",b+1)
+    #     pos = pos + [b]
+    #
+    # stops = []
+    #
+    # for x in pos:
+    #     if 0==x%3:
+    #         stops.append(x)
+    # stops.sort
+    # if len(stops) == 0:
+    #     return dna
+    # else:
+    #     return dna[0:stops[-1]]
+
+#print(rest_of_ORF('ATGAGATAGG'))
 
 def find_all_ORFs_oneframe(dna):
     """ Finds all non-nested open reading frames in the given DNA
@@ -79,9 +121,23 @@ def find_all_ORFs_oneframe(dna):
     >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
     ['ATGCATGAATGTAGA', 'ATGTGCCC']
     """
-    # TODO: implement this
-    pass
-
+    all_ORFs = []
+    i = 0
+    # A codon needs three bases, so the last valid start index is
+    # len(dna) - 3 inclusive. The previous bound (i < len(dna) - 3) skipped
+    # a start codon sitting in the final codon of the sequence.
+    while i <= len(dna) - 3:
+        if dna[i:i+3] == 'ATG':
+            orf = rest_of_ORF(dna[i:])
+            all_ORFs.append(orf)
+            # Jump past the ORF so nested start codons are not reported.
+            # When a stop codon ended the ORF its length is a multiple of
+            # three, so the scan stays in the same reading frame.
+            i = i + len(orf)
+        else:
+            i = i + 3
+    return all_ORFs
+#print(find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC"))
 
 def find_all_ORFs(dna):
     """ Finds all non-nested open reading frames in the given DNA sequence in
@@ -96,9 +152,8 @@ def find_all_ORFs(dna):
     >>> find_all_ORFs("ATGCATGAATGTAG")
     ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
     """
-    # TODO: implement this
-    pass
-
+    # Scan the three reading frames (offsets 0, 1 and 2) in that order.
+    return [orf for offset in range(3) for orf in find_all_ORFs_oneframe(dna[offset:])]
 
 def find_all_ORFs_both_strands(dna):
     """ Finds all non-nested open reading frames in the given DNA sequence on both
@@ -109,9 +164,8 @@ def find_all_ORFs_both_strands(dna):
     >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
     ['ATGCGAATG', 'ATGCTACATTCGCAT']
     """
-    # TODO: implement this
-    pass
-
+    strands = (dna, get_reverse_complement(dna))  # given strand first, then its reverse complement
+    return [orf for strand in strands for orf in find_all_ORFs(strand)]
 
 def longest_ORF(dna):
     """ Finds the longest ORF on both strands of the specified DNA and returns it
@@ -119,8 +173,13 @@ def longest_ORF(dna):
     >>> longest_ORF("ATGCGAATGTAGCATCAAA")
     'ATGCTACATTCGCAT'
     """
-    # TODO: implement this
-    pass
+    all_ORFs = find_all_ORFs_both_strands(dna)
+    # A sequence with no start codon on either strand has no ORFs (this can
+    # happen for short or shuffled DNA); return '' in that case instead of
+    # letting max() raise ValueError on an empty list.
+    # Among equally long ORFs max() returns the first one, in the order
+    # produced by find_all_ORFs_both_strands.
+    return max(all_ORFs, key=len, default='')
 
 
 def longest_ORF_noncoding(dna, num_trials):
@@ -130,9 +189,15 @@ def longest_ORF_noncoding(dna, num_trials):
         dna: a DNA sequence
         num_trials: the number of random shuffles
         returns: the maximum length longest ORF """
-    # TODO: implement this
-    pass
+    max_length = 0
+    # range(num_trials) runs exactly num_trials shuffles; the previous
+    # range(1, num_trials) ran one too few (and none for num_trials == 1).
+    for _ in range(num_trials):
+        # Evaluate longest_ORF once per shuffle instead of twice.
+        max_length = max(max_length, len(longest_ORF(shuffle_string(dna))))
+    return max_length
 
+#print(longest_ORF_noncoding('ATGCGAATGTAGCATCAAA',3))
 
 def coding_strand_to_AA(dna):
     """ Computes the Protein encoded by a sequence of DNA.  This function
@@ -148,8 +213,17 @@ def coding_strand_to_AA(dna):
         >>> coding_strand_to_AA("ATGCCCGCTTT")
         'MPA'
     """
-    # TODO: implement this
-    pass
+    # Only complete codons are translated: stopping at len(dna) - 2 drops
+    # any one or two trailing bases that do not form a full codon.
+    codon_starts = range(0, len(dna) - 2, 3)
+    # Look each codon up in aa_table, in sequence order.
+    # NOTE(review): aa_table is defined outside this hunk; presumably a
+    # dict keyed by three-letter codon strings - confirm.
+    residues = [aa_table[dna[start:start+3]] for start in codon_starts]
+    # Concatenate the per-codon results into one string.
+    return ''.join(residues)
+
+#print(coding_strand_to_AA("ATGCCCGCTTT"))
 
 
 def gene_finder(dna):
@@ -158,9 +232,36 @@ def gene_finder(dna):
         dna: a DNA sequence
         returns: a list of all amino acid sequences coded by the sequence dna.
     """
-    # TODO: implement this
-    pass
 
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
+    # Length threshold: the longest ORF seen across 1500 random shuffles of
+    # dna. Only ORFs strictly longer than this are translated below.
+    minlength = longest_ORF_noncoding(dna,1500)
+    # Report the threshold so the run can be sanity-checked.
+    print(minlength)
+
+    all_ORFs = find_all_ORFs_both_strands(dna)
+
+    # Filter over the list itself so that every ORF is examined. The
+    # previous index loop, range(0, len(all_ORFs)-1), stopped one element
+    # short and never considered the last ORF.
+    all_coding_ORFs = [orf for orf in all_ORFs if len(orf) > minlength]
+
+    # Translate each surviving ORF into its amino acid sequence, keeping
+    # the order in which the ORFs were found.
+    amino_sequences = [
+        coding_strand_to_AA(orf)
+        for orf in all_coding_ORFs
+    ]
+    return amino_sequences
+
+#print(gene_finder("ATGCCCGCTTT"))
+
+from load import load_seq
+
+if __name__ == "__main__":
+    # Guarded so that importing this module does not read the FASTA file
+    # or print results; run the doctests first, then the analysis.
+    import doctest
+    doctest.testmod()
+    dna = load_seq("./data/X73525.fa")
+    print(gene_finder(dna))