From de73a030da22b011265856f5c410c2d924ab2fd5 Mon Sep 17 00:00:00 2001
From: Vivien Chen <vivienyuwenchen@gmail.com>
Date: Wed, 13 Sep 2017 02:40:44 -0400
Subject: [PATCH 1/5] Completed gene finder

---
 gene_finder.py | 185 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 156 insertions(+), 29 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 3b1e7dd..56815e3 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -1,14 +1,16 @@
 # -*- coding: utf-8 -*-
 """
-YOUR HEADER COMMENT HERE
+Finds amino acid sequences coded by DNA input
 
-@author: YOUR NAME HERE
+@author: Vivien Chen
 
 """
 
 import random
 from amino_acids import aa, codons, aa_table   # you may find these useful
 from load import load_seq
+dna = load_seq("./data/X73525.fa")
+import doctest
 
 
 def shuffle_string(s):
@@ -25,13 +27,31 @@ def get_complement(nucleotide):
 
         nucleotide: a nucleotide (A, C, G, or T) represented as a string
         returns: the complementary nucleotide
+
+        I added two more doctests because each complementary nucleotide is in
+        its own if/else if branch. If one branch doesn't work and there is no
+        doctest for that branch, the mistake will not be caught. In fact, I
+        found out that I spelled nucleotide wrong in the G branch.
+
     >>> get_complement('A')
     'T'
     >>> get_complement('C')
     'G'
+    >>> get_complement('G')
+    'C'
+    >>> get_complement('T')
+    'A'
     """
-    # TODO: implement this
-    pass
+    if nucleotide == 'A':
+        return 'T'
+    elif nucleotide == 'C':
+        return 'G'
+    elif nucleotide == 'G':
+        return 'C'
+    elif nucleotide == 'T':
+        return 'A'
+
+#doctest.run_docstring_examples(get_complement, globals(), verbose=True)
 
 
 def get_reverse_complement(dna):
@@ -40,13 +60,27 @@ def get_reverse_complement(dna):
 
         dna: a DNA sequence represented as a string
         returns: the reverse complementary DNA sequence represented as a string
+
+        I did not add any more doctests because each string contains all the
+        nucleotides, so two doctests with random strings is sufficient.
+
     >>> get_reverse_complement("ATGCCCGCTTT")
     'AAAGCGGGCAT'
     >>> get_reverse_complement("CCGCGTTCA")
     'TGAACGCGG'
     """
-    # TODO: implement this
-    pass
+    reversed_dna = dna[::-1]
+    comp_list = []
+
+    for nuc in range(0, len(reversed_dna)):
+        comp_nuc = get_complement(reversed_dna[nuc])
+        comp_list.append(comp_nuc)
+
+    comp_str = ''.join(comp_list)
+
+    return comp_str
+
+#doctest.run_docstring_examples(get_reverse_complement, globals(), verbose=True)
 
 
 def rest_of_ORF(dna):
@@ -57,13 +91,29 @@ def rest_of_ORF(dna):
 
         dna: a DNA sequence
         returns: the open reading frame represented as a string
+
+        I added two more doctests: one in which the frame stop codon is TAA and
+        another in which there is no frame stop codon. This way, all the
+        branches are tested for error.
+
     >>> rest_of_ORF("ATGTGAA")
     'ATG'
     >>> rest_of_ORF("ATGAGATAGG")
     'ATGAGA'
+    >>> rest_of_ORF("ATGGAATAAGTA")
+    'ATGGAA'
+    >>> rest_of_ORF("ATGAGAGA")
+    'ATGAGAGA'
     """
-    # TODO: implement this
-    pass
+    #start codon: ATG
+    #stop codons: TAA, TAG, TGA
+    for i in range(3, len(dna)-2, 3):
+        if dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA':
+            return dna[:i]
+
+    return dna
+
+#doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True)
 
 
 def find_all_ORFs_oneframe(dna):
@@ -76,11 +126,29 @@ def find_all_ORFs_oneframe(dna):
 
         dna: a DNA sequence
         returns: a list of non-nested ORFs
+
+        I added another doctest that includes three nested ATGs in the default
+        frame of the sequence in order to make sure that the function does not
+        return nested ORFs in that frame.
+
     >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
     ['ATGCATGAATGTAGA', 'ATGTGCCC']
+    >>> find_all_ORFs_oneframe('ATGATGATGTAAC')
+    ['ATGATGATG']
     """
-    # TODO: implement this
-    pass
+    all_ORFs = []
+    stopped = True
+
+    for i in range(0, len(dna)-2, 3):
+        if dna[i:i+3] == 'ATG' and stopped:
+            all_ORFs.append(rest_of_ORF(dna[i:]))
+            stopped = False
+        elif dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA':
+            stopped = True
+
+    return all_ORFs
+
+#doctest.run_docstring_examples(find_all_ORFs_oneframe, globals(), verbose=True)
 
 
 def find_all_ORFs(dna):
@@ -93,11 +161,21 @@ def find_all_ORFs(dna):
         dna: a DNA sequence
         returns: a list of non-nested ORFs
 
+        I did not add any more doctests because this function depends on the
+        previous functions, so assuming the doctests of those were fruitful,
+        one doctest is sufficient for this function. The doctest includes one
+        ORF in each frame, so the function works if the doctest yields true.
+
     >>> find_all_ORFs("ATGCATGAATGTAG")
     ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
     """
-    # TODO: implement this
-    pass
+    frame1 = find_all_ORFs_oneframe(dna)
+    frame2 = find_all_ORFs_oneframe(dna[1:])
+    frame3 = find_all_ORFs_oneframe(dna[2:])
+
+    return frame1 + frame2 + frame3
+
+#doctest.run_docstring_examples(find_all_ORFs, globals(), verbose=True)
 
 
 def find_all_ORFs_both_strands(dna):
@@ -106,21 +184,40 @@ def find_all_ORFs_both_strands(dna):
 
         dna: a DNA sequence
         returns: a list of non-nested ORFs
+
+        I did not add any more doctests because this function depends on the
+        previous functions, so assuming the doctests of those were fruitful,
+        one doctest is sufficient for this function. The doctest includes one
+        ORF in each strand, so the function works if the doctest yields true.
+
     >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
     ['ATGCGAATG', 'ATGCTACATTCGCAT']
     """
-    # TODO: implement this
-    pass
+    strand1 = find_all_ORFs(dna)
+    strand2 = find_all_ORFs(get_reverse_complement(dna))
+
+    return strand1 + strand2
+
+#doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True)
 
 
 def longest_ORF(dna):
     """ Finds the longest ORF on both strands of the specified DNA and returns it
         as a string
+
     >>> longest_ORF("ATGCGAATGTAGCATCAAA")
     'ATGCTACATTCGCAT'
     """
-    # TODO: implement this
-    pass
+    all_ORFs = find_all_ORFs_both_strands(dna)
+    humongest = 0
+
+    for i in range(0, len(all_ORFs)-1):
+        if len(all_ORFs[i]) < len(all_ORFs[i+1]):
+            humongest = all_ORFs[i+1]
+
+    return humongest
+
+#doctest.run_docstring_examples(longest_ORF, globals(), verbose=True)
 
 
 def longest_ORF_noncoding(dna, num_trials):
@@ -130,8 +227,18 @@ def longest_ORF_noncoding(dna, num_trials):
         dna: a DNA sequence
         num_trials: the number of random shuffles
         returns: the maximum length longest ORF """
-    # TODO: implement this
-    pass
+    dna_shuffles = []
+
+    for i in range(0, num_trials):
+        dna_shuffles.append(shuffle_string(dna))
+
+    for i in range(0, len(dna_shuffles)-1):
+        if longest_ORF(dna_shuffles[i]) < longest_ORF(dna_shuffles[i+1]):
+            humongest = longest_ORF(dna_shuffles[i+1])
+
+    return len(humongest)
+
+#print longest_ORF_noncoding('ATGCGAATGTAGCATCAAA', 100)
 
 
 def coding_strand_to_AA(dna):
@@ -143,13 +250,22 @@ def coding_strand_to_AA(dna):
         returns: a string containing the sequence of amino acids encoded by the
                  the input DNA fragment
 
-        >>> coding_strand_to_AA("ATGCGA")
-        'MR'
-        >>> coding_strand_to_AA("ATGCCCGCTTT")
-        'MPA'
+    >>> coding_strand_to_AA("ATGCGA")
+    'MR'
+    >>> coding_strand_to_AA("ATGCCCGCTTT")
+    'MPA'
     """
-    # TODO: implement this
-    pass
+    amino_acids = []
+
+    for i in range(0, len(dna)-2, 3):
+        amino_acid = aa_table[dna[i:i+3]]
+        amino_acids.append(amino_acid)
+
+    amino_acids_str = ''.join(amino_acids)
+
+    return amino_acids_str
+
+#doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True)
 
 
 def gene_finder(dna):
@@ -158,9 +274,20 @@ def gene_finder(dna):
         dna: a DNA sequence
         returns: a list of all amino acid sequences coded by the sequence dna.
     """
-    # TODO: implement this
-    pass
+    threshold = longest_ORF_noncoding(dna, 1500)
+    all_ORFs = find_all_ORFs_both_strands(dna)
+    amino_acids = []
+
+    for i in range(0, len(all_ORFs)):
+        if len(all_ORFs[i]) > threshold:
+            amino_acid = coding_strand_to_AA(all_ORFs[i])
+            amino_acids.append(amino_acid)
+
+    return amino_acids
+
+print(gene_finder(dna))
+
 
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
+#if __name__ == "__main__":
+#    import doctest
+#    doctest.testmod()

From d7e632a5f62cba587e46a8a6e074693d16da8125 Mon Sep 17 00:00:00 2001
From: Vivien Chen <vivienyuwenchen@gmail.com>
Date: Thu, 14 Sep 2017 12:54:10 -0400
Subject: [PATCH 2/5] Updated with cooler features

---
 gene_finder.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 56815e3..de28e06 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -105,10 +105,10 @@ def rest_of_ORF(dna):
     >>> rest_of_ORF("ATGAGAGA")
     'ATGAGAGA'
     """
-    #start codon: ATG
-    #stop codons: TAA, TAG, TGA
+    stop_codons = ['TAA', 'TAG', 'TGA']
+
     for i in range(3, len(dna)-2, 3):
-        if dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA':
+        if dna[i:i+3] in stop_codons:
             return dna[:i]
 
     return dna
@@ -138,12 +138,13 @@ def find_all_ORFs_oneframe(dna):
     """
     all_ORFs = []
     stopped = True
+    stop_codons = stop_codons = ['TAA', 'TAG', 'TGA']
 
     for i in range(0, len(dna)-2, 3):
         if dna[i:i+3] == 'ATG' and stopped:
             all_ORFs.append(rest_of_ORF(dna[i:]))
             stopped = False
-        elif dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA':
+        elif dna[i:i+3] in stop_codons:
             stopped = True
 
     return all_ORFs

From 5b768c40b8480679814697ed078bdf2dc7f9273c Mon Sep 17 00:00:00 2001
From: Vivien Chen <vivienyuwenchen@gmail.com>
Date: Thu, 14 Sep 2017 17:58:19 -0400
Subject: [PATCH 3/5] Completed gene finder with pickling

---
 gene_finder.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index de28e06..4028e5d 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -11,6 +11,7 @@
 from load import load_seq
 dna = load_seq("./data/X73525.fa")
 import doctest
+from pickle import dump, load
 
 
 def shuffle_string(s):
@@ -138,7 +139,7 @@ def find_all_ORFs_oneframe(dna):
     """
     all_ORFs = []
     stopped = True
-    stop_codons = stop_codons = ['TAA', 'TAG', 'TGA']
+    stop_codons = ['TAA', 'TAG', 'TGA']
 
     for i in range(0, len(dna)-2, 3):
         if dna[i:i+3] == 'ATG' and stopped:
@@ -286,9 +287,17 @@ def gene_finder(dna):
 
     return amino_acids
 
-print(gene_finder(dna))
+genes = gene_finder(dna)
+
+#write to genes.txt file
+file_ = open('genes.txt', 'wb')
+dump(genes, file_)
+
+#read from genes.txt file
+file_ = open('genes.txt', 'rb+')
+print(load(file_))
 
 
-#if __name__ == "__main__":
-#    import doctest
-#    doctest.testmod()
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()

From 78b7d7eddd0ce2bb237d5e98a7db6d82aba85ba5 Mon Sep 17 00:00:00 2001
From: Vivien Chen <vivienyuwenchen@gmail.com>
Date: Thu, 14 Sep 2017 17:59:36 -0400
Subject: [PATCH 4/5] List of genes

---
 genes.txt | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 genes.txt

diff --git a/genes.txt b/genes.txt
new file mode 100644
index 0000000..99add06
--- /dev/null
+++ b/genes.txt
@@ -0,0 +1,18 @@
+(lp0
+S'MSLRVRQIDRREWLLAQTATECQRHGREATLEYPTRQGMWVRLSDAEKRWSAWIKPGDWLEHVSPALAGAAVSAGAEHLVVPWLAATERPFELPVPHLSCRRLCVENPVPGSALPEGKLLHIMSDRGGLWFEHLPELPAVGGGRPKMLRWPLRFVIGSSDTQRSLLGRIGIGDVLLIRTSRAEVYCYAKKLGHFNRVEGGIIVETLDIQHIEEENNTTETAETLPGLNQLPVKLEFVLYRKNVTLAELEAMGQQQLLSLPTNAELNVEIMANGVLLGNGELVQMNDTLGVEIHEWLSESGNGE'
+p1
+aS'MSSNKTEKPTKKRLEDSAKKGQSFKSKDLIIACLTLGGIAYLVSYGSFNEFMGIIKIIIADNFDQSMADYSLAVFGIGLKYLIPFMLLCLVCSALPALLQAGFVLATEALKPNLSALNPVEGAKKLFSMRTVKDTVKTLLYLSSFVVAAIICWKKYKVEIFSQLNGNIVGIAVIWRELLLALVLTCLACALIVLLLDAIAEYFLTMKDMKMDKEEVKREMKEQEGNPEVKSKRREVHMEILSEQVKSDIENSRLIVANPTHITIGIYFKPELMPIPMISVYETNQRALAVRAYAEKVGVPVIVDIKLARSLFKTHRRYDLVSLEEIDEVLRLLVWLEEVENAGKDVIQPQENEVRH'
+p2
+aS'MGIFASAGCGKTMLMHMLIEQTEADVFVIGLIGERGREVTEFVDMLRASHKKEKCVLVFATSDFPSVDRCNAAQLATTVAEYFRDQGKRVVLFIDSMTRYARALRDVALASGERPARRGYPASVFDNLPRLLERPGATSEGSITAFYTVLLESEEEADPMADEIRSILDGHLYLSRKLAGQGHYPAIDVLKSVSRVFGQVTTPTHAEQASAVRKLMTRLEELQLFIDLGEYRPGENIDNDRAMQMRDSLKAWLCQPVAQYSSFDDTLSGMNAFADQN'
+p3
+aS'MGDVSAVSSSGNILLPQQDEVGGLSEALKKAVEKHKTEYSGDKKDRDYGDAFVMHKETALPLLLAAWRHGAPAKSEHHNGNVSGLHHNGKSELRIAEKLLKVTAEKSVGLISAEAKVDKSAALLSSKNRPLESVSGKKLSADLKAVESVSEVTDNATGISDDNIKALPGDNKAIAGEGVRKEGAPLARDVAPARMAAANTGKPEDKDHKKVKDVSQLPLQPTTIADLSQLTGGDEKMPLAAQSKPMMTIFPTADGVKGEDSSLTYRFQRWGNDYSVNIQARQAGEFSLIPSNTQVEHRLHDQWQNGNPQRWHLTRDDQQNPQQQQHRQQSGEEDDA'
+p4
+aS'MGNDISLIALLAFSTLLPFIIASGTCFVKFSIVFVMVRNALGLQQIPSNMTLNGVALLLSMFVMWPIMHDAYVYFEDEDVTFNDISSLSKHVDEGLDGYRDYLIKYSDRELVQFFENAQLKRQYGEETETVKRDKDEIEKPSIFALLPAYALSEIKSAFKIGFYLYLPFVVVDLVVSSVLLALGMMMMSPVTISTPIKLVLFVALDGWTLLSKGLILQYMDIAT'
+p5
+aS'MFYALYFEIHHLVASAALGFARVAPIFFFLPFLNSGVLSGAPRNAIIILVALGVWPHALNEAPPFLSVAMIPLVLQEAAVGVMLGCLLSWPFWVMHALGCIIDNQRGATLSSSIDPANGIDTSEMANFLNMFAAVVYLQNGGLVTMVDVLNKSYQLCDPMNECTPSLPPLLTFINQVAQNALVLASPVVLVLLLSEVFLGLLSRFAPQMNAFAISLTVKSGIAVLIMLLYFSPVLPDNVLRLSFQATGLSSWFYERGATHVLE'
+p6
+aS'MCNNFPSGSALPGTGFSTHKRRQDKCGTGNSNGRSVAASQGTTRCSAPAETAAPARAGETCSSQSPGLIQADHRFSASLNRTHIPCRVGYSSVASRPWRWHSVAVCANSHSRRSICLTRNDIRRHPPRQIAVCAVAAADFVDRLASGASAGDYRFAIDHANDVQPAYLTVLTKTPLLAAPEY'
+p7
+aS'MCRRRDLSKNAAYAFQYIDCRVMSLPGQLSAQIQVTVKDRANFIRHRVRLFLAFQQYRIKGSNASLAGRPWAFQQAGQIIEYGGGITSTSRTLSRRQCHVSQSTRITGHGIDKKHDPFSLVAKIFRYGCRQLRRIAAIDRGEIGSGKNQHAFFFLMRSAQHIHEFSDLTASFTDKTDNKDIRLRLLDQHMHQHGLTASCGGKNAHSLAYATGQ'
+p8
+a.
\ No newline at end of file

From 0bf340b4743a38ecee2d8e0741d9e0adeb904792 Mon Sep 17 00:00:00 2001
From: Vivien Chen <vivienyuwenchen@gmail.com>
Date: Sun, 24 Sep 2017 23:45:57 -0400
Subject: [PATCH 5/5] Finally understood the purpose of if __name__ ==
 __main__:

---
 gene_finder.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 4028e5d..3a8dfd5 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -287,17 +287,17 @@ def gene_finder(dna):
 
     return amino_acids
 
-genes = gene_finder(dna)
 
-#write to genes.txt file
-file_ = open('genes.txt', 'wb')
-dump(genes, file_)
+if __name__ == "__main__":
+    #import doctest
+    #doctest.testmod()
 
-#read from genes.txt file
-file_ = open('genes.txt', 'rb+')
-print(load(file_))
+    genes = gene_finder(dna)
 
+    #write to genes.txt file
+    file_ = open('genes.txt', 'wb')
+    dump(genes, file_)
 
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
+    #read from genes.txt file
+    file_ = open('genes.txt', 'rb+')
+    print(load(file_))