dacb · ziels · Apr 6, 2015
diff --git a/1.5.cat.files b/1.5.cat.files
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+source globals
+
+DIR=$0.dir
+mkdir -p $DIR
+
+input_arg=""
+sample_arg=""
+first=1
+
+for sample in $SAMPLES
+do
+	paired_fq=1.join_paired_ends.dir/$sample.min_overlap_$J/fastqjoin.join.fastq
+	if [ $first -eq 1 ]
+	then
+		input_arg="$paired_fq"
+		first=0
+	else
+		input_arg="$input_arg $paired_fq"
+	fi
+done
+
+echo cating files 
+
+mkdir 1.5.quality.filtering
+cat $input_arg > 1.5.quality.filtering/cat.all.joined.bacteria.seqs.fastq
+
+
diff --git a/1.6.quality.filter b/1.6.quality.filter
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+source globals
+DIR=$0.dir
+mkdir -p $DIR
+
+if [ ! -e $0.stats.log ]
+then
+        $USEARCH -fastq_stats 1.5.quality.filtering/cat.all.joined.bacteria.seqs.fastq -log $0.stats.log
+fi
+
+# look at stats file
+
+echo based on stats.log using $TRUNCLEN and $MAXEE for trunclen and maxee respectively
+
+for sample in $SAMPLES
+do
+	echo $sample
+	paired_fq=1.join_paired_ends.dir/$sample.min_overlap_$J/fastqjoin.join.fastq
+
+	$USEARCH -fastq_filter $paired_fq -fastq_maxee $MAXEE -fastq_trunclen $TRUNCLEN -fastqout $DIR/$sample.fastq
+
+done
diff --git a/2.split_libraries_fastq → 2.split_libraries_fastq.usearch.filtered b/2.split_libraries_fastq → 2.split_libraries_fastq.usearch.filtered
@@ -11,7 +11,7 @@ first=1
 
 for sample in $SAMPLES
 do
-	paired_fq=1.join_paired_ends.dir/$sample.min_overlap_$J/fastqjoin.join.fastq
+	paired_fq=1.6.quality.filter.dir/$sample.fastq
 	if [ $first -eq 1 ]
 	then
 		input_arg="$paired_fq"
@@ -25,5 +25,5 @@ done
 
 echo running split_libraries
 head -4 map.txt > fake_map.txt
-split_libraries_fastq.py -i $input_arg --sample_id $sample_arg -o $DIR -m fake_map.txt -q $Q --barcode_type 'not-barcoded'
+split_libraries_fastq.py -i $input_arg --sample_id $sample_arg -o $DIR -m fake_map.txt -q 0 --barcode_type 'not-barcoded'
 \rm -rf fake_map.txt
diff --git a/3.QIIME.analysis b/3.QIIME.analysis
@@ -0,0 +1,20 @@
+#!/bin/bash
+### Remove Chimeras and Assign Taxonomy
+
+mkdir "3.QIIME_analysis"
+
+##1. Use USEARCH61 to identify Chimeras
+#navigate to directory with seq.fna
+identify_chimeric_seqs.py -i $PWD/2.split_libraries_fastq.usearch.filtered.dir/seqs.fna -m usearch61 -o $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/ -r rdp_gold.fa
+
+##2. Filter the input seq.fna file
+filter_fasta.py -f $PWD/2.split_libraries_fastq.usearch.filtered.dir/seqs.fna -o $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/seqs_chimeras_filtered.fna -s $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/chimeras.txt -n
+
+## 3. use pick_otus.py to cluster sequences using usearch61 
+pick_otus.py -m usearch61 -i $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/seqs_chimeras_filtered.fna -o $PWD/3.QIIME_analysis/USEARCH_picked_otus/
+
+## 4. Make representative sequences 
+pick_rep_set.py -i $PWD/3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus.txt -f $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/seqs_chimeras_filtered.fna -o $PWD/3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus_rep_set.fna
+
+## 5. Tabulate the number of times an OTU is found in each sample (note that taxonomy assigning is not needed, as it is analyzed in 4.RDP_to_table
+make_otu_table.py -i $PWD/3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus.txt -o $PWD/3.QIIME_analysis/USEARCH_picked_otus/otu_table_non_chimeric.biom -e $PWD/3.QIIME_analysis/Chimera_filtering_USEARCH/chimeras.txt
diff --git a/4.1.filter_master_otu_table b/4.1.filter_master_otu_table
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+SQL=$0.sql
+XLS=4.1.otu_table.no_Archaea.xls
+
+cat << EOF > $SQL
+.mode tabs
+SELECT * 
+	FROM master_otu_table
+	WHERE
+			domain <> "Archaea"
+;		
+EOF
+
+sqlite3 -header database.sqlite3 < $SQL > $XLS
diff --git a/4.2.filter_master_otu_table_by_abundance b/4.2.filter_master_otu_table_by_abundance
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+FRAC_CUTOFF=0.01
+
+SQL=$0.sql
+XLS=4.2.master_otu_table.no_Archaea.$FRAC_CUTOFF.cutoff.xls
+FA=4.2.master_otu_table.no_Archaea.$FRAC_CUTOFF.cutoff.fasta
+
+AB=otu_table_non_chimeric_rdp.tab
+
+cat << EOF > $SQL
+.mode tabs
+SELECT * 
+	FROM master_otu_table
+	WHERE
+			domain <> "Archaea"
+		AND
+			( 0
+EOF
+head -2 $AB | tail -1 | awk -F'\t' -v frac_cutoff=$FRAC_CUTOFF '{ for (i = 2; i <= NF; ++i) { printf("\t\t\t\tOR (CAST(%s AS REAL) / (SELECT SUM(%s) FROM master_otu_table WHERE domain <> \"Archaea\")) > %f \n", $i, $i, frac_cutoff); } }' >> $SQL
+cat << EOF >> $SQL
+			)
+;		
+EOF
+
+sqlite3 -header database.sqlite3 < $SQL | sed "s/denovo/OTU_/g" > $XLS
+
+awk '{ if (line > 0) printf(">%s\n%s\n", $1, $NF); ++line; }' 4.2.master_otu_table.no_Archaea.0.01.cutoff.xls  > $FA
diff --git a/4.3.read_count b/4.3.read_count
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+SQL=$0.read_sums.sql
+XLS=$0.read_sums.xls
+
+AB=otu_table_non_chimeric_rdp.tab
+
+otu_table=master_otu_table
+
+cat << EOF > $SQL
+.mode tabs
+SELECT ot.OTUId
+EOF
+
+head -2 $AB | tail -1 | awk -F'\t' -v frac_cutoff=$FRAC_CUTOFF '{ for (i = 2; i <= NF; ++i) { printf("\t\t, SUM(%s) AS %s\n", $i, $i); } }' >> $SQL
+
+cat << EOF >> $SQL
+	FROM $otu_table AS ot
+	WHERE domain <> "Archaea"
+;
+EOF
+
+sqlite3 -header database.sqlite3 < $SQL > $XLS
+
+#awk -f $0.reshape.awk -v expt="H1" 44.master_otu_table.no_Archaea.0.01.xls > $0.persistent_otus.reshaped.tab
+#awk -f $0.reshape.awk -v expt="L1" 44.master_otu_table.no_Archaea.0.01.xls >> $0.persistent_otus.reshaped.tab
+
+R --no-save < $0.R
diff --git a/4.4.filter_master_otu_summary_genus b/4.4.filter_master_otu_summary_genus
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+SQL=$0.sql
+XLS=4.4.master_otu_table.no_Archaea.summary.genus.xls
+
+AB=otu_table_non_chimeric_rdp.tab
+
+cat << EOF > $SQL
+.mode tabs
+SELECT genus
+EOF
+head -2 $AB | tail -1 | awk -F'\t' '{ for (i = 2; i <= NF; ++i) { printf("\t\t\t\t, SUM(%s) AS %s\n", $i, $i); } }' >> $SQL
+
+cat << EOF >> $SQL
+	FROM master_otu_table
+	WHERE
+			domain <> "Archaea"
+	GROUP BY genus
+;		
+EOF
+
+sqlite3 -header database.sqlite3 < $SQL | sed "s/denovo/OTU_/g" > $XLS
diff --git a/4.RDP_to_table b/4.RDP_to_table
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+BIOM=3.QIIME_analysis/USEARCH_picked_otus/otu_table_non_chimeric.biom
+AB=otu_table_non_chimeric_rdp.tab
+ABTMP=$AB.tmp
+FA=3.QIIME_analysis/USEARCH_picked_otus/seqs_chimeras_filtered_otus_rep_set.fna
+FATMP=$FA.tmp
+RDP=seqs_chimeras_filtered_otus_rep_set.RDP_classifier.xls
+RDPFLAT=$RDP.flat
+XLS=otu_table.xls
+DB=database.sqlite3
+
+source globals 
+
+SQL=$0.sql
+
+if [ ! -e $AB ]
+then
+	echo converting biom to tab
+	biom convert -i $BIOM -o $AB -b
+fi
+awk '{ if (line >= 2) print $0; ++line; }' $AB > $ABTMP
+
+if [ ! -e $RDP ]
+then
+	echo running RDP classifier
+
+	java -Xmx1g -jar $RDPPATH classify -q $FA -o $RDP	
+
+	echo flattening $RDP for loading to the database
+	awk -F'\t' '{ gsub(/"/, "", $0); otu = $1; rank = 0; for (i = 3; i <= NF; i += 3) { rank++; printf("%s\t%d\t%s\t%s\t%s\n", otu, rank, $i, $(i + 1), $(i + 2)); } }' $RDP > $RDPFLAT
+fi
+
+echo flatting $FA for loading to the database
+awk -f fa2tab.awk $FA > $FATMP
+
+echo using the one ring to bind them all, muahahah
+
+cat << EOF > $SQL
+.mode tabs
+
+-- load abundances from $AB
+DROP TABLE otu_table;
+CREATE TABLE otu_table (
+
+EOF
+
+
+head -2 $AB | tail -1 | awk -F'\t' '{ printf("\tOTUId VARCHAR(32) PRIMARY KEY NOT NULL,\n"); for (i = 2; i <= NF; ++i) { printf("\t%s INTEGER%s\n", $i, (i == NF ? "" : ", ")); } }' >> $SQL
+
+cat << EOF >> $SQL
+);
+.import $ABTMP otu_table
+
+-- load the tax hierarchy from $RDP
+DROP TABLE class_table;
+CREATE TABLE class_table (
+	OTUId	VARCHAR(32) NOT NULL,
+	rank_no	INTEGER NOT NULL,
+	name	VARCHAR(32) NOT NULL,
+	rank	VARCHAR(32) NOT NULL,
+	confidence FLOAT NOT NULL
+);
+.import $RDPFLAT class_table
+
+-- load the sequences from $FA
+DROP TABLE IF EXISTS sequences;
+CREATE TABLE sequences ( locus VARCHAR(32) PRIMARY KEY, description VARCHAR(256), sequence LONGTEXT );
+.import $FATMP sequences
+
+-- create flattened class table
+DROP TABLE IF EXISTS class_flat;
+CREATE TABLE class_flat
+AS SELECT otus.OTUId, 
+-- the following 7 lines are how this statement should be, but a bug in sqlite3
+-- causes the AS named fields to return empty results
+--		d.name AS domain, d.confidence AS domain_confidence, 
+--		p.name AS phylum, p.confidence AS phylum_confidence, 
+--		c.name AS class, c.confidence AS class_confidence, 
+--		o.name AS order_, o.confidence AS order_confidence, 
+--		f.name AS family, f.confidence AS family_confidence, 
+--		g.name AS genus, g.confidence AS genus_confidence, 
+--		s.name AS species, s.confidence AS species_confidence
+		d.name, d.confidence, 
+		p.name, p.confidence, 
+		c.name, c.confidence, 
+		o.name, o.confidence, 
+		f.name, f.confidence, 
+		g.name, g.confidence, 
+		s.name, s.confidence
+	FROM (SELECT DISTINCT OTUId FROM class_table) AS otus
+		LEFT JOIN class_table AS d ON otus.OTUId = d.OTUId AND d.rank = "domain"
+		LEFT JOIN class_table AS p ON otus.OTUId = p.OTUId AND p.rank = "phylum"
+		LEFT JOIN class_table AS c ON otus.OTUId = c.OTUId AND c.rank = "class"
+		LEFT JOIN class_table AS o ON otus.OTUId = o.OTUId AND o.rank = "order"
+		LEFT JOIN class_table AS f ON otus.OTUId = f.OTUId AND f.rank = "family"
+		LEFT JOIN class_table AS g ON otus.OTUId = g.OTUId AND g.rank = "genus"
+		LEFT JOIN class_table AS s ON otus.OTUId = s.OTUId AND s.rank = "species"
+;
+
+-- uber join
+DROP VIEW master_otu_table;
+CREATE VIEW master_otu_table AS 
+SELECT ot.OTUId,
+EOF
+
+head -2 $AB | tail -1 | awk -F'\t' '{ for (i = 2; i <= NF; ++i) { printf("\t\t%s ,\n", $i); } }' >> $SQL
+
+cat << EOF >> $SQL
+-- the following 2 lines are how this statement should be, but a bug in sqlite3
+-- cases the above commented out 7 lines to return empty result sets so we have to use the crappy names
+--		domain, domain_confidence, phylum, phylum_confidence, class, class_confidence,
+--			order_, order_confidence, family, family_confidence, genus, genus_confidence, species, species_confidence,
+		name AS domain, confidence AS domain_confidence,
+		"name:1" AS phylum, "confidence:1" AS phylum_confidence,
+		"name:2" AS class , "confidence:2" AS class_confidence,
+		"name:3" AS order_, "confidence:3" AS order_confidence,
+		"name:4" AS family, "confidence:4" AS family_confidence,
+		"name:5" AS genus, "confidence:5" AS genus_confidence,
+		"name:6" AS species, "confidence:6" AS species_confidence,
+		sequence
+	FROM otu_table AS ot
+		INNER JOIN class_flat AS cf ON ot.OTUId = cf.OTUId
+		INNER JOIN sequences AS st ON ot.OTUId = st.locus
+	ORDER BY ( 0
+EOF
+
+head -2 $AB | tail -1 | awk -F'\t' '{ for (i = 2; i <= NF; ++i) { printf("\t\t\t+ %s\n", $i); } }' >> $SQL
+
+cat << EOF >> $SQL
+		) DESC
+;
+
+SELECT * FROM master_otu_table;
+EOF
+
+sqlite3 -header -separator $'\t' $DB < $SQL > $XLS
+
+exit
+
+\rm -rf $ABTMP $FATMP
diff --git a/LICENSE b/LICENSE
diff --git a/fa2tab.awk b/fa2tab.awk
@@ -0,0 +1,23 @@
+BEGIN {
+	s = -1;
+}
+
+/^>/ {
+	++s;
+	l=$1;
+	sub(">", "", l);
+	locus[s] = l;
+	description[s] = $2;
+	for (i = 3; i <= NF; ++i)
+		description[s] = description[s] " " $i; 
+	sequence[s] = "";
+}
+{
+	if (substr($1, 1, 1) != ">")
+		sequence[s] = sequence[s] $1;
+}
+END {
+	for (i = 0; i <= s; ++i) { 
+		printf("%s\t%s\t%s\n", locus[i], description[i], sequence[i]);
+	}
+} 
diff --git a/globals b/globals
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # global variables for RZ16S 2015
-SAMPLES="Bacteria10_S10 Bacteria11_S11 Bacteria12_S12 Bacteria133_S64 Bacteria138_S67 Bacteria1_S1 Bacteria2_S2 Bacteria3_S3 Bacteria4_S4 Bacteria5_S5 Bacteria6_S6 Bacteria73_S37 Bacteria74_S38 Bacteria75_S39 Bacteria76_S40 Bacteria77_S41 Bacteria78_S42 Bacteria79_S43 Bacteria7_S7 Bacteria80_S44 Bacteria81_S45 Bacteria8_S8 Bacteria9_S9"
+SAMPLES="Bacteria49_S25 Bacteria50_S26 Bacteria51_S27 Bacteria52_S28 Bacteria53_S29 Bacteria54_S30 Bacteria55_S31 Bacteria56_S32 Bacteria57_S33 Bacteria58_S34 Bacteria59_S35 Bacteria60_S36 Bacteria73_S37 Bacteria74_S38 Bacteria75_S39 Bacteria76_S40 Bacteria77_S41 Bacteria78_S42 Bacteria79_S43 Bacteria80_S44 Bacteria81_S45"
 SAMPLE_PREFIX="Linkoeping-A-"
 SAMPLE_MIDFIX="_L001_R"
 SAMPLE_POSTFIX="_001.fastq"
@@ -12,8 +12,14 @@ M=fastq-join
 # overlap length
 J=100
 
+#USEARCH quality filtering
+USEARCH="/macqiime/QIIME/bin/usearch61"
+RDPPATH="/rdp_classifier_2.10.1/dist/classifier.jar" 
+TRUNCLEN=298
+MAXEE=.5
+
 # split_libraries_fastq
 # filtering quality
-Q=19
+Q=20
 
 
diff --git a/map.txt b/map.txt