-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdatabase_build.sh
More file actions
executable file
·156 lines (112 loc) · 3.19 KB
/
database_build.sh
File metadata and controls
executable file
·156 lines (112 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/bin/bash
##########################
# Author: B. Anderson
# Date: 10 Mar 2020
# Modified: Oct 2020
# Description: using a region list and dictionary (optional), build nucleotide or protein BLAST databases from specified genbank files
##########################
# python scripts to call
parse_script=~/scripts/genbank_parse.py
comb_script=~/scripts/extract_sort_comb.py
# a function for when the script is called incorrectly or without arguments
usage ()
{
echo "This script will compile sequences and build BLAST databases from a set of regions in given genbank files."
echo
echo "Usage: $(basename $0) options(- ... - ...) gbfile1 gbfile2 ..."
echo
echo "Options:"
echo
echo " -f|--file File (required) with list of regions, one per line, e.g. atp1"
echo
echo " -d|--dict Dictionary (optional) of alternative region names, tab-delimited, one per line, e.g. atp1-1 atp1"
echo
echo " -t|--type Type of database to build, nucl (default) or prot"
echo
exit 1
}
if [ $# -eq 0 ]; then # if there are no command arguments
usage
fi
FILES=()
dict_present="FALSE"
type_specified="FALSE"
while [[ $# -gt 0 ]] # while the number of args is greater than 0
do
key="$1"
case $key in
-f|--file)
regions_list="$2"
shift
shift
;;
-d|--dict)
dict="$2"
dict_present="TRUE"
shift
shift
;;
-t|--type)
dbtype="$2"
type_specified="TRUE"
shift
shift
;;
*)
FILES+=("$1") # capture arguments not connected to options in an array
shift
;;
esac
done
set -- "${FILES[@]}" # restore the non-option args to command line args
if [ $# -eq 0 ]; then # this would be true if there were no files specified
usage
fi
if [ "$type_specified" = "FALSE" ]; then
dbtype="nucl"
elif [ "$dbtype" != "prot" ] && [ "$dbtype" != "nucl" ]; then
usage
fi
# assign working directory
work_dir="$(pwd)"
# create output top directory
out_dir="$work_dir"/databases_"$(date +%b%Y)"
if [ ! -d "$out_dir" ]; then
mkdir -p "$out_dir"
fi
# run a series of steps to extract all regions from each of the genbank files
for gbfile in "$@"; do
echo "$gbfile"
"$parse_script" -l CDS "$gbfile" > CDS.tmp
if [ "$dbtype" = "nucl" ]; then
"$parse_script" -l rRNA "$gbfile" > rRNA.tmp
"$parse_script" -l tRNA "$gbfile" > tRNA.tmp
fi
cat *.tmp > regions.list
"$parse_script" -f regions.list -t "$dbtype" "$gbfile"
rm *.tmp regions.list
done
# run a second script to deal with all the extracted fasta files
echo
echo "Combining extracted regions then deleting intermediate files"
if [ "$dict_present" = "TRUE" ]; then
"$comb_script" -f "$regions_list" -d "$dict" *extract.fasta
else
"$comb_script" -f "$regions_list" *extract.fasta
fi
rm *extract.fasta
# for each region in the input list, make a directory and a database in that directory based on the corresponding file
echo
echo "Creating folders and BLAST databases"
for region in $(cat "$regions_list"); do
if [ ! -d "$out_dir"/"$region" ] && [ -f "$region".fasta ]; then
mkdir -p "$out_dir"/"$region"
fi
if [ -f "$region".fasta ]; then
mv "$region".fasta "$out_dir"/"$region"/"$region"_"$dbtype".fasta
cd "$out_dir"/"$region"
makeblastdb -in "$region"_"$dbtype".fasta -dbtype "$dbtype" -out "$region"_"$dbtype"db -logfile temp.log
rm temp*
cd "$work_dir"
fi
done