bioscript/gb2fasta.py at master · lijax/bioscript · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#! /usr/bin/env python
#
# Extract CDS information from a GenBank file into a FASTA amino acid (faa) or nucleic acid (fna) file,
# or simply extract all nucleotide sequences (fasta).
# Biopython is required to run this script.
# @date: 2012-11-7
# @author: yeyanbo

import re
import getopt
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein

def usage():
    """Print the command-line help text for this script to standard output."""
    # The local is deliberately not called ``usage`` so it does not shadow
    # this function inside its own body.
    help_text = """Usage: python gb2fasta.py -i [input file] -o [output file] -t [output type]

Option:
  -h,--help      Print this usage
  -v,--verbose   Print progress information
  -i,--input     The genbank file to be processed
  -o,--output    The fasta file to be output to
  -t,--type      Output type of fasta file [faa, fna, fasta]
                  faa(default)   Extract the amino acid of all CDS
                  fna            Extract the nucleotides of all CDS
                  fasta          Extract nucleotide sequence in this genbank file
"""

    # Call form with a single argument prints identically on Python 2 and 3.
    print(help_text)

def main():
    """Command-line entry point: convert a GenBank file into a FASTA file.

    Options are read from ``sys.argv`` (see ``usage()``). The input is parsed
    with ``SeqIO.parse(..., "genbank")`` and all collected records are written
    in one ``SeqIO.write(..., "fasta")`` call.

    Output types (``-t``/``--type``):
      fasta  one record per GenBank entry, holding the entry's full sequence
      fna    one record per CDS feature, holding the feature's nucleotides
      faa    one record per CDS feature, holding the translation of those
             nucleotides (default)

    CDS features that have no ``db_xref`` value starting with ``GI`` are
    skipped and do not advance the per-record CDS counter.

    Exits with status 2 after printing the usage text when an option is
    invalid, the output type is unknown, or the input/output file is missing.
    Exits with status 0 after ``-h``/``--help``.
    """
    try:
        # "verbose" is registered as a long option because the usage text
        # advertises --verbose.
        opts, _ = getopt.getopt(
            sys.argv[1:],
            "i:o:t:hv",
            ["input=", "output=", "type=", "help", "verbose"],
        )
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)

    infile = None
    outfile = None
    outtype = "faa"
    verbose = False

    types = ("faa", "fna", "fasta")

    for o, a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-i", "--input"):
            infile = a
        elif o in ("-o", "--output"):
            outfile = a
        elif o in ("-t", "--type"):
            outtype = a
        else:
            # Raised explicitly instead of ``assert False`` so the check
            # still fires when Python runs with -O.
            raise AssertionError("unhandled option")

    if outtype not in types or infile is None or outfile is None:
        usage()
        sys.exit(2)

    fasta_records = []  # every SeqRecord that will be written to the output

    # Context managers close both files even if parsing or writing raises.
    with open(infile, "r") as input_handle, open(outfile, "w") as output_handle:
        for seq_record in SeqIO.parse(input_handle, "genbank"):
            gb_id = seq_record.id
            gb_dec = seq_record.description

            if verbose:
                print("Record %s" % gb_id)

            if outtype == "fasta":
                # The record-level GI is only needed for this header; a record
                # without one gets an empty GI field instead of a KeyError.
                gb_gi = seq_record.annotations.get('gi', '')
                fasta_records.append(
                    SeqRecord(seq=seq_record.seq, id="gi|%s|gb|%s|" % (gb_gi, gb_id), description=gb_dec)
                )
                continue

            organism = seq_record.annotations["organism"]
            taxonomy = seq_record.annotations["taxonomy"]

            cds_count = 0  # number of CDS emitted so far for this record
            for seq_feature in seq_record.features:
                if seq_feature.type != "CDS":
                    continue

                qualifiers = seq_feature.qualifiers

                # Keep db_xref values that start with "GI" (same test as
                # matching the pattern 'GI.*' at the start of the string).
                # A list is built so that emptiness and indexing work on
                # Python 3, where filter() returns an iterator.
                gi_xref = [ref for ref in qualifiers.get('db_xref', []) if ref.startswith("GI")]
                if not gi_xref:
                    continue
                gi = gi_xref[0].replace("GI:", "")

                strand = seq_feature.strand

                # 1-based start and inclusive end of the feature's overall span.
                location = str(seq_feature.location.start + 1) + "-" + str(seq_feature.location.end)
                # Drop everything except digits and '-' (presumably markers of
                # fuzzy positions such as '<' or '>' -- confirm).
                location = re.sub(r"[^0-9\-]", "", location)

                # First non-empty of: product qualifier, gene qualifier, 'unknown'.
                product = (
                    qualifiers.get('product', [''])[0]
                    or qualifiers.get('gene', [''])[0]
                    or 'unknown'
                )
                # '|' is the field separator of the FASTA id built below.
                product = product.replace("|", "")

                # Falls back to 'unknown' when the CDS has no protein_id qualifier.
                protein_id = qualifiers.get('protein_id', ['unknown'])[0]

                # extract() applies the feature location to the parent
                # sequence, including strand and multi-part (join) locations,
                # which a plain start:end slice would get wrong.
                seq = seq_feature.extract(seq_record.seq)

                if outtype == "faa":
                    # NOTE(review): translate() is called without arguments,
                    # so the feature's transl_table / codon_start qualifiers
                    # are not consulted.
                    seq = seq.translate()

                seq_fasta = SeqRecord(seq, id="gi|%s|gb|%s|or|%s|st|%s|lo|%s|gn|%s|tx|%s|" % (gi, protein_id, cds_count, strand, location, seq_record.name, ";".join(taxonomy)), description="%s [%s]" % (product, organism))

                fasta_records.append(seq_fasta)

                cds_count += 1

        SeqIO.write(fasta_records, output_handle, "fasta")


# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()