-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbp_alter.py
More file actions
executable file
·156 lines (127 loc) · 4.53 KB
/
bp_alter.py
File metadata and controls
executable file
·156 lines (127 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
##########################
# Author: B.M. Anderson
# Date: 20 Jul 2020
# Version update: Aug 2024
# Description: modify sequences in a (mulit)fasta file given an input text file specifying
# contigs, locations and changes
# The modified sequences are output in the current directory as a file "output_alter.fasta"
##########################
import argparse
from Bio import SeqIO # for reading and writing sequence records
from Bio.SeqRecord import SeqRecord # for generating a sequence record from a sequence
import sys
# instantiate the parser
parser = argparse.ArgumentParser(description = 'A script to modify positions in sequences in a (multi)fasta file ' +
'using a user-generated text file specifying changes. ' +
'The text file should be in the form (tab-separated columns without a header): '
'contig_id position(1-based) existing_base alt_base(use \"-\" to delete it). '
'If wanting to make an insertion, specify two bases, e.g. C -> CT would be: contig position C CT')
# add arguments to parse
parser.add_argument('-f', type=str, dest='fasta', help='The (multi)fasta file with contigs')
parser.add_argument('-p', type=str, dest='positions', help='The user-generated text file with positions to change')
parser.add_argument('-a', type=str, dest='ambi', help='Substitute ambiguity codes instead of the alternative base pairs (yes or no [default])')
parser.add_argument('-b', type=str, dest='both', help='Generate both possible contigs as copy1 and copy2 (yes or no [default])')
# parse the command line
if len(sys.argv[1:]) == 0: # if there are no arguments
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()
fasta_file = args.fasta
positions_file = args.positions
ambiguities = args.ambi
both = args.both
use_ambi = False
do_both = False
if not fasta_file:
parser.print_help(sys.stderr)
sys.exit(0)
if not positions_file:
parser.print_help(sys.stderr)
sys.exit(0)
if ambiguities:
if ambiguities.lower() == 'yes':
use_ambi = True
else:
use_ambi = False
if both:
if both.lower() == 'yes':
do_both = True
else:
do_both = False
# Define an ambiguity dictionary
amb_dict = {
'AC': 'M',
'AG': 'R',
'AT': 'W',
'CG': 'S',
'CT': 'Y',
'GT': 'K',
'ACG': 'V',
'ACT': 'H',
'AGT': 'D',
'CGT': 'B',
'ACGT': 'N'
}
# Read the contig file and store the fastas
fasta_list = []
fastas = SeqIO.parse(open(fasta_file, 'r'), 'fasta')
for fasta in fastas:
fasta_list.append(fasta)
# Read the positions file and store as a list
change_list = []
with open(positions_file, 'r') as pos_file:
for line in pos_file:
elements = line.strip().split('\t')
con_id = elements[0]
position = elements[1]
orig_bp = elements[2]
alt_bp = elements[3]
if use_ambi:
new_bp = amb_dict[''.join(sorted(list(orig_bp) + list(alt_bp)))]
else:
new_bp = alt_bp
change_list.append([con_id, position, orig_bp, new_bp])
# Open an output file and cycle through the stored fastas, making specified changes
with open('output_alter.fasta', 'w') as outfile:
nochanges = 0
changed = 0
for fasta in fasta_list:
these_changes = []
old_fasta = SeqRecord(fasta.seq, id = fasta.id, name = fasta.name, description = fasta.description)
for change in change_list:
if change[0] == fasta.id:
these_changes.append(change)
if len(these_changes) < 1: # no changes for that contig
nochanges = nochanges + 1
SeqIO.write(fasta, outfile, 'fasta')
continue
else:
changed = changed + 1
if do_both:
old_fasta.id = fasta.id + '_copy1'
old_fasta.description = fasta.description.replace(fasta.id, fasta.id + '_copy1')
SeqIO.write(old_fasta, outfile, 'fasta')
# sort the changes in reverse so that deletions won't affect indexing
sorted_changes = sorted(these_changes, key = lambda x: int(x[1]), reverse = True)
for change in sorted_changes:
position = int(change[1])
old_bp = change[2]
new_bp = change[3]
if str(fasta.seq[position - 1]) == old_bp: # the base matches user input
if new_bp == '-': # a deletion
fasta.seq = fasta.seq[0: position - 1] + fasta.seq[position: ]
else:
fasta.seq = fasta.seq[0: position - 1] + new_bp + fasta.seq[position: ]
else:
print('Warning: incorrectly specified existing base for ' + fasta.id + '!')
print('Exiting')
sys.exit(1)
# output the new fasta
if do_both:
old_id = fasta.id
fasta.id = fasta.id + '_copy2'
fasta.description = fasta.description.replace(old_id, fasta.id)
SeqIO.write(fasta, outfile, 'fasta')
# Report completion
print('Finished altering sequence(s): ' + str(changed) + ' altered, ' + str(nochanges) + ' unaltered')