From d1253e692288cd2776319058638c3c5d4468a529 Mon Sep 17 00:00:00 2001 From: andrewgwalker Date: Tue, 28 Jul 2020 17:03:24 -0500 Subject: [PATCH] Add a function for reading a fasta into a dict of numpy arrays --- allel/io/fasta.py | 44 ++++++++++++++++++++++++++++++- allel/test/data/test.fa | 45 ++++++++++++++++++++++++++++++++ allel/test/data/test_comments.fa | 10 +++++++ allel/test/io/test_fasta_read.py | 21 +++++++++++++++ 4 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 allel/test/data/test.fa create mode 100644 allel/test/data/test_comments.fa create mode 100644 allel/test/io/test_fasta_read.py diff --git a/allel/io/fasta.py b/allel/io/fasta.py index 80321aa7..4c4397a5 100644 --- a/allel/io/fasta.py +++ b/allel/io/fasta.py @@ -44,5 +44,47 @@ def write_fasta(path, sequences, names, mode='w', width=80): header = b'>' + name + b'\n' fasta.write(header) for i in range(0, sequence.size, width): - line = sequence[i:i+width].tostring() + b'\n' + line = sequence[i:i + width].tostring() + b'\n' fasta.write(line) + + +def read_fasta(path): + """Read nucleotide sequences from a FASTA file and return them as a dictionary + mapping a sequence name to a sequence. + + Parameters + ---------- + + path : string + File path. + + """ + sequences = [] + names = [] + + current_sequence = [] + + with open(path, 'r') as fasta: + + # Skip any headers and get the first line + for line in fasta: + if line.startswith('>'): + name = line[1:].rstrip() + break + + for line in fasta: + if line.startswith('>'): + names.append(name) + sequences.append(np.concatenate(current_sequence)) + name = line[1:].rstrip() + current_sequence = [] + elif line.startswith(';'): + continue + else: + current_sequence.append(np.array(list(line.strip()), dtype=np.dtype('S1'))) + else: + if names: + names.append(name) + sequences.append(np.concatenate(current_sequence)) + + return dict(zip(names, sequences)) diff --git a/allel/test/data/test.fa b/allel/test/data/test.fa new file mode 100644 index 00000000..a08edf82 --- /dev/null +++ b/allel/test/data/test.fa @@ -0,0 +1,45 @@ +>crab_anapl ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDITIHNPLIRRPLFSWLAPSRIFDQIFGEHLQESELLPASPSLSPFLMR +SPIFRMPSWLETGLSEMRLEKDKFSVNLDVKHFSPEELKVKVLGDMVEIH +GKHEERQDEHGFIAREFNRKYRIPADVDPLTITSSLSLDGVLTVSAPRKQ +SDVPERSIPITREEKPAIAGAQRK +>crab_bovin ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFPASTSLSPFYLR +PPSFLRAPSWIDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV +HGKHEERQDEHGFISREFHRKYRIPADVDPLAITSSLSSDGVLTVNGPRK +QASGPERTIPITREEKPAVTAAPKK +>crab_chick ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDITIHNPLVRRPLFSWLTPSRIFDQIFGEHLQESELLPTSPSLSPFLMR +SPFFRMPSWLETGLSEMRLEKDKFSVNLDVKHFSPEELKVKVLGDMIEIH +GKHEERQDEHGFIAREFSRKYRIPADVDPLTITSSLSLDGVLTVSAPRKQ +SDVPERSIPITREEKPAIAGSQRK +>crab_human ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFPTSTSLSPFYLR +PPSFLRAPSWFDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV +HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK +QVSGPERTIPITREEKPAVTAAPKK +>crab_mesau ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFSTATSLSPFYLR +PPSFLRAPSWIDTGLSEMRMEKDRFSVNLDVKHFSPEELKVKVLGDVVEV +HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK +QASGPERTIPITREEKPAVTAAPKK +>crab_mouse ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN) (P23). +MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFSTATSLSPFYLR +PPSFLRAPSWIDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV +HGKHEERQDEHGFISREFHRKYRIPADVDPLAITSSLSSDGVLTVNGPRK +QVSGPERTIPITREEKPAVAAAPKK +>crab_rabit ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFPTSTSLSPFYLR +PPSFLRAPSWIDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV +HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK +QAPGPERTIPITREEKPAVTAAPKK +>crab_rat ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFSTATSLSPFYLR +PPSFLRAPSWIDTGLSEMRMEKDRFSVNLDVKHFSPEELKVKVLGDVIEV +HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK +QASGPERTIPITREEKPAVTAAPKK +>crab_squac ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). +MDIAIQHPWLRRPLFPSSIFPSRIFDQNFGEHFDPDLFPSFSSMLSPFYW +RMGAPMARMPSWAQTGLSELRLDKDKFAIHLDVKHFTPEELRVKILGDFI +EVQAQHEERQDEHGYVSREFHRKYKVPAGVDPLVITCSLSADGVLTITGP +RKVADVPERSVPISRDEKPAVAGPQQK diff --git a/allel/test/data/test_comments.fa b/allel/test/data/test_comments.fa new file mode 100644 index 00000000..a399ba13 --- /dev/null +++ b/allel/test/data/test_comments.fa @@ -0,0 +1,10 @@ +;Comment line +>sequence A +ggtaagtcctctagtacaaacacccccaatattgtgatataattaaaattatattcatat +tctgttgccagaaaaaacacttttaggctatattagagccatcttctttgaagcgttgtc +;Multiple comment lines +;Another comment line +>sequence B +ggtaagtgctctagtacaaacacccccaatattgtgatataattaaaattatattcatat +tctgttgccagattttacacttttaggctatattagagccatcttctttgaagcgttgtc +tatgcatcgatcgacgactg diff --git a/allel/test/io/test_fasta_read.py b/allel/test/io/test_fasta_read.py new file mode 100644 index 00000000..f9682912 --- /dev/null +++ b/allel/test/io/test_fasta_read.py @@ -0,0 +1,21 @@ +import os + +from allel.io.fasta import read_fasta + + +def fixture_path(fn): + return os.path.join(os.path.dirname(__file__), os.pardir, 'data', fn) + + +def test_fasta_read(): + path = fixture_path('test.fa') + data = read_fasta(path) + assert len(data.keys()) == 9 + + +def test_fasta_read_comments(): + path = fixture_path('test_comments.fa') + data = read_fasta(path) + assert len(data.keys()) == 2 + assert 'sequence A' in data.keys() + assert 'sequence B' in data.keys()