Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions 1.5.cat.files
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

source globals

DIR=$0.dir
mkdir -p $DIR

input_arg=""
sample_arg=""
first=1

for sample in $SAMPLES
do
paired_fq=1.join_paired_ends.dir/$sample.min_overlap_$J/fastqjoin.join.fastq
if [ $first -eq 1 ]
then
input_arg="$paired_fq"
first=0
else
input_arg="$input_arg $paired_fq"
fi
done

echo cating files

mkdir 1.5.quality.filtering
cat $input_arg > 1.5.quality.filtering/cat.all.joined.bacteria.seqs.fastq


23 changes: 23 additions & 0 deletions 1.6.quality.filter
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash

source globals
DIR=$0.dir
mkdir -p $DIR

if [ ! -e $0.stats.log ]
then
$USEARCH -fastq_stats 1.5.quality.filtering/cat.all.joined.bacteria.seqs.fastq -log $0.stats.log
fi

# look at stats file

echo based on stats.log using $TRUNCLEN and $MAXEE for trunclen and maxee respectively

for sample in $SAMPLES
do
echo $sample
paired_fq=1.join_paired_ends.dir/$sample.min_overlap_$J/fastqjoin.join.fastq

$USEARCH -fastq_filter $paired_fq -fastq_maxee $MAXEE -fastq_trunclen $TRUNCLEN -fastqout $DIR/$sample.fastq

done
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ first=1

for sample in $SAMPLES
do
paired_fq=1.join_paired_ends.dir/$sample.min_overlap_$J/fastqjoin.join.fastq
paired_fq=1.6.quality.filter.dir/$sample.fastq
if [ $first -eq 1 ]
then
input_arg="$paired_fq"
Expand All @@ -25,5 +25,5 @@ done

echo running split_libraries
head -4 map.txt > fake_map.txt
split_libraries_fastq.py -i $input_arg --sample_id $sample_arg -o $DIR -m fake_map.txt -q $Q --barcode_type 'not-barcoded'
split_libraries_fastq.py -i $input_arg --sample_id $sample_arg -o $DIR -m fake_map.txt -q 0 --barcode_type 'not-barcoded'
\rm -rf fake_map.txt
20 changes: 20 additions & 0 deletions 3.QIIME.analysis
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
### Remove Chimeras and Assign Taxonomy

mkdir "3.QIIME_analysis"

##1. Use USEARCH61 to identify Chimeras
#navigate to directory with seq.fna
identify_chimeric_seqs.py -i $PWD/2.split_libraries_fastq.usearch.filtered.dir/seqs.fna -m usearch61 -o $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/ -r rdp_gold.fa

##2. Filter the input seq.fna file
filter_fasta.py -f $PWD/2.split_libraries_fastq.usearch.filtered.dir/seqs.fna -o $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/seqs_chimeras_filtered.fna -s $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/chimeras.txt -n

## 3. use pick_otus.py to cluster sequences using usearch61
pick_otus.py -m usearch61 -i $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/seqs_chimeras_filtered.fna -o $PWD/3.QIIME_analysis/USEARCH_picked_otus/

## 4. Make representative sequences
pick_rep_set.py -i $PWD/3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus.txt -f $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/seqs_chimeras_filtered.fna -o $PWD/3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus_rep_set.fna

## 5. Tabulate the number of times an OTU is found in each sample (note that taxonomy assigning is not needed, as it is analyzed in 4.RDP_to_table
make_otu_table.py -i $PWD/3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus.txt -o $PWD/3.QIIME_analysis/USEARCH_picked_otus/otu_table_non_chimeric.biom -e $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/chimeras.txt
15 changes: 15 additions & 0 deletions 4.1.filter_master_otu_table
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

SQL=$0.sql
XLS=4.1.otu_table.no_Archaea.xls

cat << EOF > $SQL
.mode tabs
SELECT *
FROM master_otu_table
WHERE
domain <> "Archaea"
;
EOF

sqlite3 -header database.sqlite3 < $SQL > $XLS
28 changes: 28 additions & 0 deletions 4.2.filter_master_otu_table_by_abundance
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

FRAC_CUTOFF=0.01

SQL=$0.sql
XLS=4.2.master_otu_table.no_Archaea.$FRAC_CUTOFF.cutoff.xls
FA=4.2.master_otu_table.no_Archaea.$FRAC_CUTOFF.cutoff.fasta

AB=otu_table_non_chimeric_rdp.tab

cat << EOF > $SQL
.mode tabs
SELECT *
FROM master_otu_table
WHERE
domain <> "Archaea"
AND
( 0
EOF
head -2 $AB | tail -1 | awk -F'\t' -v frac_cutoff=$FRAC_CUTOFF '{ for (i = 2; i <= NF; ++i) { printf("\t\t\t\tOR (CAST(%s AS REAL) / (SELECT SUM(%s) FROM master_otu_table WHERE domain <> \"Archaea\")) > %f \n", $i, $i, frac_cutoff); } }' >> $SQL
cat << EOF >> $SQL
)
;
EOF

sqlite3 -header database.sqlite3 < $SQL | sed "s/denovo/OTU_/g" > $XLS

awk '{ if (line > 0) printf(">%s\n%s\n", $1, $NF); ++line; }' 4.2.master_otu_table.no_Archaea.0.01.cutoff.xls > $FA
28 changes: 28 additions & 0 deletions 4.3.read_count
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

SQL=$0.read_sums.sql
XLS=$0.read_sums.xls

AB=otu_table_non_chimeric_rdp.tab

otu_table=master_otu_table

cat << EOF > $SQL
.mode tabs
SELECT ot.OTUId
EOF

head -2 $AB | tail -1 | awk -F'\t' -v frac_cutoff=$FRAC_CUTOFF '{ for (i = 2; i <= NF; ++i) { printf("\t\t, SUM(%s) AS %s\n", $i, $i); } }' >> $SQL

cat << EOF >> $SQL
FROM $otu_table AS ot
WHERE domain <> "Archaea"
;
EOF

sqlite3 -header database.sqlite3 < $SQL > $XLS

#awk -f $0.reshape.awk -v expt="H1" 44.master_otu_table.no_Archaea.0.01.xls > $0.persistent_otus.reshaped.tab
#awk -f $0.reshape.awk -v expt="L1" 44.master_otu_table.no_Archaea.0.01.xls >> $0.persistent_otus.reshaped.tab

R --no-save < $0.R
22 changes: 22 additions & 0 deletions 4.4.filter_master_otu_summary_genus
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

SQL=$0.sql
XLS=4.4.master_otu_table.no_Archaea.summary.genus.xls

AB=otu_table_non_chimeric_rdp.tab

cat << EOF > $SQL
.mode tabs
SELECT genus
EOF
head -2 $AB | tail -1 | awk -F'\t' '{ for (i = 2; i <= NF; ++i) { printf("\t\t\t\t, SUM(%s) AS %s\n", $i, $i); } }' >> $SQL

cat << EOF >> $SQL
FROM master_otu_table
WHERE
domain <> "Archaea"
GROUP BY genus
;
EOF

sqlite3 -header database.sqlite3 < $SQL | sed "s/denovo/OTU_/g" > $XLS
141 changes: 141 additions & 0 deletions 4.RDP_to_table
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/bin/bash

BIOM=3.QIIME_analysis/USEARCH_picked_otus/otu_table_non_chimeric.biom
AB=otu_table_non_chimeric_rdp.tab
ABTMP=$AB.tmp
FA=3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus_rep_set.fna
FATMP=$FA.tmp
RDP=seqs_chimeras_filtered_otus_rep_set.RDP_classifier.xls
RDPFLAT=$RDP.flat
XLS=otu_table.xls
DB=database.sqlite3

source globals

SQL=$0.sql

if [ ! -e $AB ]
then
echo converting biom to tab
biom convert -i $BIOM -o $AB -b
fi
awk '{ if (line >= 2) print $0; ++line; }' $AB > $ABTMP

if [ ! -e $RDP ]
then
echo running RDP classifier

java -Xmx1g -jar $RDPPATH classify -q $FA -o $RDP

echo flattening $RDP for loading to the database
awk -F'\t' '{ gsub(/"/, "", $0); otu = $1; rank = 0; for (i = 3; i <= NF; i += 3) { rank++; printf("%s\t%d\t%s\t%s\t%s\n", otu, rank, $i, $(i + 1), $(i + 2)); } }' $RDP > $RDPFLAT
fi

echo flatting $FA for loading to the database
awk -f fa2tab.awk $FA > $FATMP

echo using the one ring to bind them all, muahahah

cat << EOF > $SQL
.mode tabs

-- load abundances from $AB
DROP TABLE otu_table;
CREATE TABLE otu_table (

EOF


head -2 $AB | tail -1 | awk -F'\t' '{ printf("\tOTUId VARCHAR(32) PRIMARY KEY NOT NULL,\n"); for (i = 2; i <= NF; ++i) { printf("\t%s INTEGER%s\n", $i, (i == NF ? "" : ", ")); } }' >> $SQL

cat << EOF >> $SQL
);
.import $ABTMP otu_table

-- load the tax hierarchy from $RDP
DROP TABLE class_table;
CREATE TABLE class_table (
OTUId VARCHAR(32) NOT NULL,
rank_no INTEGER NOT NULL,
name VARCHAR(32) NOT NULL,
rank VARCHAR(32) NOT NULL,
confidence FLOAT NOT NULL
);
.import $RDPFLAT class_table

-- load the sequences from $FA
DROP TABLE IF EXISTS sequences;
CREATE TABLE sequences ( locus VARCHAR(32) PRIMARY KEY, description VARCHAR(256), sequence LONGTEXT );
.import $FATMP sequences

-- create flattened class table
DROP TABLE IF EXISTS class_flat;
CREATE TABLE class_flat
AS SELECT otus.OTUId,
-- the following 7 lines are how this statement should be, but a bug in sqlite3
-- causes the AS named fields to return empty results
-- d.name AS domain, d.confidence AS domain_confidence,
-- p.name AS phylum, p.confidence AS phylum_confidence,
-- c.name AS class, c.confidence AS class_confidence,
-- o.name AS order_, o.confidence AS order_confidence,
-- f.name AS family, f.confidence AS family_confidence,
-- g.name AS genus, g.confidence AS genus_confidence,
-- s.name AS species, s.confidence AS species_confidence
d.name, d.confidence,
p.name, p.confidence,
c.name, c.confidence,
o.name, o.confidence,
f.name, f.confidence,
g.name, g.confidence,
s.name, s.confidence
FROM (SELECT DISTINCT OTUId FROM class_table) AS otus
LEFT JOIN class_table AS d ON otus.OTUId = d.OTUId AND d.rank = "domain"
LEFT JOIN class_table AS p ON otus.OTUId = p.OTUId AND p.rank = "phylum"
LEFT JOIN class_table AS c ON otus.OTUId = c.OTUId AND c.rank = "class"
LEFT JOIN class_table AS o ON otus.OTUId = o.OTUId AND o.rank = "order"
LEFT JOIN class_table AS f ON otus.OTUId = f.OTUId AND f.rank = "family"
LEFT JOIN class_table AS g ON otus.OTUId = g.OTUId AND g.rank = "genus"
LEFT JOIN class_table AS s ON otus.OTUId = s.OTUId AND s.rank = "species"
;

-- uber join
DROP VIEW master_otu_table;
CREATE VIEW master_otu_table AS
SELECT ot.OTUId,
EOF

head -2 $AB | tail -1 | awk -F'\t' '{ for (i = 2; i <= NF; ++i) { printf("\t\t%s ,\n", $i); } }' >> $SQL

cat << EOF >> $SQL
-- the following 2 lines are how this statement should be, but a bug in sqlite3
-- cases the above commented out 7 lines to return empty result sets so we have to use the crappy names
-- domain, domain_confidence, phylum, phylum_confidence, class, class_confidence,
-- order_, order_confidence, family, family_confidence, genus, genus_confidence, species, species_confidence,
name AS domain, confidence AS domain_confidence,
"name:1" AS phylum, "confidence:1" AS phylum_confidence,
"name:2" AS class , "confidence:2" AS class_confidence,
"name:3" AS order_, "confidence:3" AS order_confidence,
"name:4" AS family, "confidence:4" AS family_confidence,
"name:5" AS genus, "confidence:5" AS genus_confidence,
"name:6" AS species, "confidence:6" AS species_confidence,
sequence
FROM otu_table AS ot
INNER JOIN class_flat AS cf ON ot.OTUId = cf.OTUId
INNER JOIN sequences AS st ON ot.OTUId = st.locus
ORDER BY ( 0
EOF

head -2 $AB | tail -1 | awk -F'\t' '{ for (i = 2; i <= NF; ++i) { printf("\t\t\t+ %s\n", $i); } }' >> $SQL

cat << EOF >> $SQL
) DESC
;

SELECT * FROM master_otu_table;
EOF

sqlite3 -header -separator $'\t' $DB < $SQL > $XLS

exit

\rm -rf $ABTMP $FATMP
Empty file modified LICENSE
100644 → 100755
Empty file.
23 changes: 23 additions & 0 deletions fa2tab.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
BEGIN {
s = -1;
}

/^>/ {
++s;
l=$1;
sub(">", "", l);
locus[s] = l;
description[s] = $2;
for (i = 3; i <= NF; ++i)
description[s] = description[s] " " $i;
sequence[s] = "";
}
{
if (substr($1, 1, 1) != ">")
sequence[s] = sequence[s] $1;
}
END {
for (i = 0; i <= s; ++i) {
printf("%s\t%s\t%s\n", locus[i], description[i], sequence[i]);
}
}
10 changes: 8 additions & 2 deletions globals
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# global variables for RZ16S 2015
SAMPLES="Bacteria10_S10 Bacteria11_S11 Bacteria12_S12 Bacteria133_S64 Bacteria138_S67 Bacteria1_S1 Bacteria2_S2 Bacteria3_S3 Bacteria4_S4 Bacteria5_S5 Bacteria6_S6 Bacteria73_S37 Bacteria74_S38 Bacteria75_S39 Bacteria76_S40 Bacteria77_S41 Bacteria78_S42 Bacteria79_S43 Bacteria7_S7 Bacteria80_S44 Bacteria81_S45 Bacteria8_S8 Bacteria9_S9"
SAMPLES="Bacteria49_S25 Bacteria50_S26 Bacteria51_S27 Bacteria52_S28 Bacteria53_S29 Bacteria54_S30 Bacteria55_S31 Bacteria56_S32 Bacteria57_S33 Bacteria58_S34 Bacteria59_S35 Bacteria60_S36 Bacteria73_S37 Bacteria74_S38 Bacteria75_S39 Bacteria76_S40 Bacteria77_S41 Bacteria78_S42 Bacteria79_S43 Bacteria80_S44 Bacteria81_S45"
SAMPLE_PREFIX="Linkoeping-A-"
SAMPLE_MIDFIX="_L001_R"
SAMPLE_POSTFIX="_001.fastq"
Expand All @@ -12,8 +12,14 @@ M=fastq-join
# overlap length
J=100

#USEARCH quality filtering
USEARCH="/macqiime/QIIME/bin/usearch61"
RDPPATH="/rdp_classifier_2.10.1/dist/classifier.jar"
TRUNCLEN=298
MAXEE=.5

# split_libraries_fastq
# filtering quality
Q=19
Q=20


Empty file modified map.txt
100644 → 100755
Empty file.
Loading