From de73a030da22b011265856f5c410c2d924ab2fd5 Mon Sep 17 00:00:00 2001 From: Vivien Chen Date: Wed, 13 Sep 2017 02:40:44 -0400 Subject: [PATCH 1/5] Completed gene finder --- gene_finder.py | 185 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 156 insertions(+), 29 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 3b1e7dd..56815e3 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -1,14 +1,16 @@ # -*- coding: utf-8 -*- """ -YOUR HEADER COMMENT HERE +Finds amino acid sequences coded by DNA input -@author: YOUR NAME HERE +@author: Vivien Chen """ import random from amino_acids import aa, codons, aa_table # you may find these useful from load import load_seq +dna = load_seq("./data/X73525.fa") +import doctest def shuffle_string(s): @@ -25,13 +27,31 @@ def get_complement(nucleotide): nucleotide: a nucleotide (A, C, G, or T) represented as a string returns: the complementary nucleotide + + I added two more doctests because each complementary nucleotide is in + its own if/else if branch. If one branch doesn't work and there is no + doctest for that branch, the mistake will not be caught. In fact, I + found out that I spelled nucleotide wrong in the G branch. + >>> get_complement('A') 'T' >>> get_complement('C') 'G' + >>> get_complement('G') + 'C' + >>> get_complement('T') + 'A' """ - # TODO: implement this - pass + if nucleotide == 'A': + return 'T' + elif nucleotide == 'C': + return 'G' + elif nucleotide == 'G': + return 'C' + elif nucleotide == 'T': + return 'A' + +#doctest.run_docstring_examples(get_complement, globals(), verbose=True) def get_reverse_complement(dna): @@ -40,13 +60,27 @@ def get_reverse_complement(dna): dna: a DNA sequence represented as a string returns: the reverse complementary DNA sequence represented as a string + + I did not add any more doctests because each string contains all the + nucleotides, so two doctests with random strings is sufficient. + >>> get_reverse_complement("ATGCCCGCTTT") 'AAAGCGGGCAT' >>> get_reverse_complement("CCGCGTTCA") 'TGAACGCGG' """ - # TODO: implement this - pass + reversed_dna = dna[::-1] + comp_list = [] + + for nuc in range(0, len(reversed_dna)): + comp_nuc = get_complement(reversed_dna[nuc]) + comp_list.append(comp_nuc) + + comp_str = ''.join(comp_list) + + return comp_str + +#doctest.run_docstring_examples(get_reverse_complement, globals(), verbose=True) def rest_of_ORF(dna): @@ -57,13 +91,29 @@ def rest_of_ORF(dna): dna: a DNA sequence returns: the open reading frame represented as a string + + I added two more doctests: one in which the frame stop codon is TAA and + another in which there is no frame stop codon. This way, all the + branches are tested for error. + >>> rest_of_ORF("ATGTGAA") 'ATG' >>> rest_of_ORF("ATGAGATAGG") 'ATGAGA' + >>> rest_of_ORF("ATGGAATAAGTA") + 'ATGGAA' + >>> rest_of_ORF("ATGAGAGA") + 'ATGAGAGA' """ - # TODO: implement this - pass + #start codon: ATG + #stop codons: TAA, TAG, TGA + for i in range(3, len(dna)-2, 3): + if dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA': + return dna[:i] + + return dna + +#doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True) def find_all_ORFs_oneframe(dna): @@ -76,11 +126,29 @@ def find_all_ORFs_oneframe(dna): dna: a DNA sequence returns: a list of non-nested ORFs + + I added another doctest that includes three nested ATGs in the default + frame of the sequence in order to make sure that the function does not + return nested ORFs in that frame. + >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") ['ATGCATGAATGTAGA', 'ATGTGCCC'] + >>> find_all_ORFs_oneframe('ATGATGATGTAAC') + ['ATGATGATG'] """ - # TODO: implement this - pass + all_ORFs = [] + stopped = True + + for i in range(0, len(dna)-2, 3): + if dna[i:i+3] == 'ATG' and stopped: + all_ORFs.append(rest_of_ORF(dna[i:])) + stopped = False + elif dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA': + stopped = True + + return all_ORFs + +#doctest.run_docstring_examples(find_all_ORFs_oneframe, globals(), verbose=True) def find_all_ORFs(dna): @@ -93,11 +161,21 @@ def find_all_ORFs(dna): dna: a DNA sequence returns: a list of non-nested ORFs + I did not add any more doctests because this function depends on the + previous functions, so assuming the doctests of those were fruitful, + one doctest is sufficient for this function. The doctest includes one + ORF in each frame, so the function works if the doctest yields true. + >>> find_all_ORFs("ATGCATGAATGTAG") ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] """ - # TODO: implement this - pass + frame1 = find_all_ORFs_oneframe(dna) + frame2 = find_all_ORFs_oneframe(dna[1:]) + frame3 = find_all_ORFs_oneframe(dna[2:]) + + return frame1 + frame2 + frame3 + +#doctest.run_docstring_examples(find_all_ORFs, globals(), verbose=True) def find_all_ORFs_both_strands(dna): @@ -106,21 +184,40 @@ def find_all_ORFs_both_strands(dna): dna: a DNA sequence returns: a list of non-nested ORFs + + I did not add any more doctests because this function depends on the + previous functions, so assuming the doctests of those were fruitful, + one doctest is sufficient for this function. The doctest includes one + ORF in each strand, so the function works if the doctest yields true. + >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") ['ATGCGAATG', 'ATGCTACATTCGCAT'] """ - # TODO: implement this - pass + strand1 = find_all_ORFs(dna) + strand2 = find_all_ORFs(get_reverse_complement(dna)) + + return strand1 + strand2 + +#doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True) def longest_ORF(dna): """ Finds the longest ORF on both strands of the specified DNA and returns it as a string + >>> longest_ORF("ATGCGAATGTAGCATCAAA") 'ATGCTACATTCGCAT' """ - # TODO: implement this - pass + all_ORFs = find_all_ORFs_both_strands(dna) + humongest = 0 + + for i in range(0, len(all_ORFs)-1): + if len(all_ORFs[i]) < len(all_ORFs[i+1]): + humongest = all_ORFs[i+1] + + return humongest + +#doctest.run_docstring_examples(longest_ORF, globals(), verbose=True) def longest_ORF_noncoding(dna, num_trials): @@ -130,8 +227,18 @@ def longest_ORF_noncoding(dna, num_trials): dna: a DNA sequence num_trials: the number of random shuffles returns: the maximum length longest ORF """ - # TODO: implement this - pass + dna_shuffles = [] + + for i in range(0, num_trials): + dna_shuffles.append(shuffle_string(dna)) + + for i in range(0, len(dna_shuffles)-1): + if longest_ORF(dna_shuffles[i]) < longest_ORF(dna_shuffles[i+1]): + humongest = longest_ORF(dna_shuffles[i+1]) + + return len(humongest) + +#print longest_ORF_noncoding('ATGCGAATGTAGCATCAAA', 100) def coding_strand_to_AA(dna): @@ -143,13 +250,22 @@ def coding_strand_to_AA(dna): returns: a string containing the sequence of amino acids encoded by the the input DNA fragment - >>> coding_strand_to_AA("ATGCGA") - 'MR' - >>> coding_strand_to_AA("ATGCCCGCTTT") - 'MPA' + >>> coding_strand_to_AA("ATGCGA") + 'MR' + >>> coding_strand_to_AA("ATGCCCGCTTT") + 'MPA' """ - # TODO: implement this - pass + amino_acids = [] + + for i in range(0, len(dna)-2, 3): + amino_acid = aa_table[dna[i:i+3]] + amino_acids.append(amino_acid) + + amino_acids_str = ''.join(amino_acids) + + return amino_acids_str + +#doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True) def gene_finder(dna): @@ -158,9 +274,20 @@ def gene_finder(dna): dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ - # TODO: implement this - pass + threshold = longest_ORF_noncoding(dna, 1500) + all_ORFs = find_all_ORFs_both_strands(dna) + amino_acids = [] + + for i in range(0, len(all_ORFs)): + if len(all_ORFs[i]) > threshold: + amino_acid = coding_strand_to_AA(all_ORFs[i]) + amino_acids.append(amino_acid) + + return amino_acids + +print(gene_finder(dna)) + -if __name__ == "__main__": - import doctest - doctest.testmod() +#if __name__ == "__main__": +# import doctest +# doctest.testmod() From d7e632a5f62cba587e46a8a6e074693d16da8125 Mon Sep 17 00:00:00 2001 From: Vivien Chen Date: Thu, 14 Sep 2017 12:54:10 -0400 Subject: [PATCH 2/5] Updated with cooler features --- gene_finder.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 56815e3..de28e06 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -105,10 +105,10 @@ def rest_of_ORF(dna): >>> rest_of_ORF("ATGAGAGA") 'ATGAGAGA' """ - #start codon: ATG - #stop codons: TAA, TAG, TGA + stop_codons = ['TAA', 'TAG', 'TGA'] + for i in range(3, len(dna)-2, 3): - if dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA': + if dna[i:i+3] in stop_codons: return dna[:i] return dna @@ -138,12 +138,13 @@ def find_all_ORFs_oneframe(dna): """ all_ORFs = [] stopped = True + stop_codons = stop_codons = ['TAA', 'TAG', 'TGA'] for i in range(0, len(dna)-2, 3): if dna[i:i+3] == 'ATG' and stopped: all_ORFs.append(rest_of_ORF(dna[i:])) stopped = False - elif dna[i:i+3] == 'TAA' or dna[i:i+3] == 'TAG' or dna[i:i+3] == 'TGA': + elif dna[i:i+3] in stop_codons: stopped = True return all_ORFs From 5b768c40b8480679814697ed078bdf2dc7f9273c Mon Sep 17 00:00:00 2001 From: Vivien Chen Date: Thu, 14 Sep 2017 17:58:19 -0400 Subject: [PATCH 3/5] Completed gene finder with pickling --- gene_finder.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index de28e06..4028e5d 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -11,6 +11,7 @@ from load import load_seq dna = load_seq("./data/X73525.fa") import doctest +from pickle import dump, load def shuffle_string(s): @@ -138,7 +139,7 @@ def find_all_ORFs_oneframe(dna): """ all_ORFs = [] stopped = True - stop_codons = stop_codons = ['TAA', 'TAG', 'TGA'] + stop_codons = ['TAA', 'TAG', 'TGA'] for i in range(0, len(dna)-2, 3): if dna[i:i+3] == 'ATG' and stopped: @@ -286,9 +287,17 @@ def gene_finder(dna): return amino_acids -print(gene_finder(dna)) +genes = gene_finder(dna) + +#write to genes.txt file +file_ = open('genes.txt', 'wb') +dump(genes, file_) + +#read from genes.txt file +file_ = open('genes.txt', 'rb+') +print(load(file_)) -#if __name__ == "__main__": -# import doctest -# doctest.testmod() +if __name__ == "__main__": + import doctest + doctest.testmod() From 78b7d7eddd0ce2bb237d5e98a7db6d82aba85ba5 Mon Sep 17 00:00:00 2001 From: Vivien Chen Date: Thu, 14 Sep 2017 17:59:36 -0400 Subject: [PATCH 4/5] List of genes --- genes.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 genes.txt diff --git a/genes.txt b/genes.txt new file mode 100644 index 0000000..99add06 --- /dev/null +++ b/genes.txt @@ -0,0 +1,18 @@ +(lp0 +S'MSLRVRQIDRREWLLAQTATECQRHGREATLEYPTRQGMWVRLSDAEKRWSAWIKPGDWLEHVSPALAGAAVSAGAEHLVVPWLAATERPFELPVPHLSCRRLCVENPVPGSALPEGKLLHIMSDRGGLWFEHLPELPAVGGGRPKMLRWPLRFVIGSSDTQRSLLGRIGIGDVLLIRTSRAEVYCYAKKLGHFNRVEGGIIVETLDIQHIEEENNTTETAETLPGLNQLPVKLEFVLYRKNVTLAELEAMGQQQLLSLPTNAELNVEIMANGVLLGNGELVQMNDTLGVEIHEWLSESGNGE' +p1 +aS'MSSNKTEKPTKKRLEDSAKKGQSFKSKDLIIACLTLGGIAYLVSYGSFNEFMGIIKIIIADNFDQSMADYSLAVFGIGLKYLIPFMLLCLVCSALPALLQAGFVLATEALKPNLSALNPVEGAKKLFSMRTVKDTVKTLLYLSSFVVAAIICWKKYKVEIFSQLNGNIVGIAVIWRELLLALVLTCLACALIVLLLDAIAEYFLTMKDMKMDKEEVKREMKEQEGNPEVKSKRREVHMEILSEQVKSDIENSRLIVANPTHITIGIYFKPELMPIPMISVYETNQRALAVRAYAEKVGVPVIVDIKLARSLFKTHRRYDLVSLEEIDEVLRLLVWLEEVENAGKDVIQPQENEVRH' +p2 +aS'MGIFASAGCGKTMLMHMLIEQTEADVFVIGLIGERGREVTEFVDMLRASHKKEKCVLVFATSDFPSVDRCNAAQLATTVAEYFRDQGKRVVLFIDSMTRYARALRDVALASGERPARRGYPASVFDNLPRLLERPGATSEGSITAFYTVLLESEEEADPMADEIRSILDGHLYLSRKLAGQGHYPAIDVLKSVSRVFGQVTTPTHAEQASAVRKLMTRLEELQLFIDLGEYRPGENIDNDRAMQMRDSLKAWLCQPVAQYSSFDDTLSGMNAFADQN' +p3 +aS'MGDVSAVSSSGNILLPQQDEVGGLSEALKKAVEKHKTEYSGDKKDRDYGDAFVMHKETALPLLLAAWRHGAPAKSEHHNGNVSGLHHNGKSELRIAEKLLKVTAEKSVGLISAEAKVDKSAALLSSKNRPLESVSGKKLSADLKAVESVSEVTDNATGISDDNIKALPGDNKAIAGEGVRKEGAPLARDVAPARMAAANTGKPEDKDHKKVKDVSQLPLQPTTIADLSQLTGGDEKMPLAAQSKPMMTIFPTADGVKGEDSSLTYRFQRWGNDYSVNIQARQAGEFSLIPSNTQVEHRLHDQWQNGNPQRWHLTRDDQQNPQQQQHRQQSGEEDDA' +p4 +aS'MGNDISLIALLAFSTLLPFIIASGTCFVKFSIVFVMVRNALGLQQIPSNMTLNGVALLLSMFVMWPIMHDAYVYFEDEDVTFNDISSLSKHVDEGLDGYRDYLIKYSDRELVQFFENAQLKRQYGEETETVKRDKDEIEKPSIFALLPAYALSEIKSAFKIGFYLYLPFVVVDLVVSSVLLALGMMMMSPVTISTPIKLVLFVALDGWTLLSKGLILQYMDIAT' +p5 +aS'MFYALYFEIHHLVASAALGFARVAPIFFFLPFLNSGVLSGAPRNAIIILVALGVWPHALNEAPPFLSVAMIPLVLQEAAVGVMLGCLLSWPFWVMHALGCIIDNQRGATLSSSIDPANGIDTSEMANFLNMFAAVVYLQNGGLVTMVDVLNKSYQLCDPMNECTPSLPPLLTFINQVAQNALVLASPVVLVLLLSEVFLGLLSRFAPQMNAFAISLTVKSGIAVLIMLLYFSPVLPDNVLRLSFQATGLSSWFYERGATHVLE' +p6 +aS'MCNNFPSGSALPGTGFSTHKRRQDKCGTGNSNGRSVAASQGTTRCSAPAETAAPARAGETCSSQSPGLIQADHRFSASLNRTHIPCRVGYSSVASRPWRWHSVAVCANSHSRRSICLTRNDIRRHPPRQIAVCAVAAADFVDRLASGASAGDYRFAIDHANDVQPAYLTVLTKTPLLAAPEY' +p7 +aS'MCRRRDLSKNAAYAFQYIDCRVMSLPGQLSAQIQVTVKDRANFIRHRVRLFLAFQQYRIKGSNASLAGRPWAFQQAGQIIEYGGGITSTSRTLSRRQCHVSQSTRITGHGIDKKHDPFSLVAKIFRYGCRQLRRIAAIDRGEIGSGKNQHAFFFLMRSAQHIHEFSDLTASFTDKTDNKDIRLRLLDQHMHQHGLTASCGGKNAHSLAYATGQ' +p8 +a. \ No newline at end of file From 0bf340b4743a38ecee2d8e0741d9e0adeb904792 Mon Sep 17 00:00:00 2001 From: Vivien Chen Date: Sun, 24 Sep 2017 23:45:57 -0400 Subject: [PATCH 5/5] Finally understood the purpose of if __name__ == __main__: --- gene_finder.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 4028e5d..3a8dfd5 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -287,17 +287,17 @@ def gene_finder(dna): return amino_acids -genes = gene_finder(dna) -#write to genes.txt file -file_ = open('genes.txt', 'wb') -dump(genes, file_) +if __name__ == "__main__": + #import doctest + #doctest.testmod() -#read from genes.txt file -file_ = open('genes.txt', 'rb+') -print(load(file_)) + genes = gene_finder(dna) + #write to genes.txt file + file_ = open('genes.txt', 'wb') + dump(genes, file_) -if __name__ == "__main__": - import doctest - doctest.testmod() + #read from genes.txt file + file_ = open('genes.txt', 'rb+') + print(load(file_))