Skip to content
This repository was archived by the owner on Nov 28, 2020. It is now read-only.

Coverage experiments

Marek Wiewiórka edited this page Nov 18, 2018 · 20 revisions

Using SeQuiLa SQL

#cluster
#cleanup
# Purge any cached SeQuiLa artifacts so --packages resolves a fresh SNAPSHOT build
rm -rf ~/.ivy2/cache/org.biodatageeks/bdg-sequila_2.11/*
rm -rf ~/.ivy2/jars/org.biodatageeks_bdg-sequila_2.11*
# Point Spark at the cluster's Hadoop configuration (YARN + HDFS)
export HADOOP_CONF_DIR=/etc/hadoop/conf
cd /data/local/opt/spark-2.2.1-bin-hadoop2.7/bin
# Launch on YARN with an in-memory catalog (no Hive metastore), 40 executors x 4g,
# pulling bdg-sequila 0.4.1-SNAPSHOT from the zsibio snapshot repository; -v = verbose
./spark-shell  --conf "spark.sql.catalogImplementation=in-memory" --master=yarn-client --driver-memory=4g --executor-memory=4g --num-executors=40 --packages org.biodatageeks:bdg-sequila_2.11:0.4.1-SNAPSHOT --repositories https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/ -v

#local testing
#cleanup
# Same Ivy cache purge as the cluster variant, then a local 2-core shell
rm -rf ~/.ivy2/cache/org.biodatageeks/bdg-sequila_2.11/*
rm -rf ~/.ivy2/jars/org.biodatageeks_bdg-sequila_2.11*
# local[2]: two worker threads in the driver JVM — enough for functional testing
spark-shell --master=local[2] --driver-memory=4g --packages org.biodatageeks:bdg-sequila_2.11:0.4.1-SNAPSHOT --repositories https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/ -v

Exome

import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}

// Wrap the vanilla SparkSession so SeQuiLa's extensions are available as `ss`
val ss = SequilaSession(spark)
/*inject bdg-granges strategy*/
SequilaRegister.register(ss)


// Expose the exome BAM file as a SQL table via the SeQuiLa BAM data source
ss.sql("""
CREATE TABLE reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/data/granges/NA12878.ga2.exome.maq.recal.bam')""")


//builtin
// Baseline: stock JDK inflater, spark-bam disabled
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","false")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")


// Time coverage computation in 'blocks' mode; .count forces full evaluation
spark.time{
ss.sql("select * from bdg_coverage('reads_exome','NA12878.ga2.exome','blocks')").count 
}

//gkl
// NOTE(review): this section also flips useSparkBAM to "true", so the timing below
// changes two variables at once (GKL inflater AND spark-bam) relative to the
// baseline above — confirm that was intentional
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","true")

// Same table, 'mosdepth' coverage mode, for comparison with the baseline
spark.time{
ss.sql("select * from bdg_coverage('reads_exome','NA12878.ga2.exome','mosdepth')").count 
}

// Persist coverage output as a Parquet-backed table instead of just counting
spark.time{
ss.sql("create table cov_ex stored as parquet as select * from bdg_coverage('reads_exome','NA12878.ga2.exome','bdg','blocks')")
}

Genome

// Register the whole-genome BAM and time coverage in 'bdg' mode
ss.sql("""
CREATE TABLE reads_genome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/data/granges/NA12878.hiseq.wgs.bwa.recal.bam')""")

spark.time{
ss.sql("select * from bdg_coverage('reads_genome','NA12878.hiseq','bdg')").count }
###piping
# Stream the BAM straight from the 1000 Genomes FTP into HDFS ("-" reads stdin),
# avoiding an intermediate local copy
curl ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20101201_cg_NA12878/NA12878.hiseq.wgs.bwa.recal.bam | hdfs dfs -put - /tmp/fp16yq/data/genome/NA12878.bam

Parquet vs ORC (Snappy compression)

/*orc vs parquet - snappy*/
// Parquet run: fresh session, GKL inflater on, spark-bam off
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_genome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/genome/*.bam')""")
 // Write the coverage result as Snappy-compressed Parquet and time the whole job
 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_genome','NA12878', 'blocks')").write.format("parquet").option("compression","snappy").save("/tmp/fp16yq/data/50snappy.parquet")}
// ORC run: identical setup and query, only the output format differs
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_genome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/genome/*.bam')""")
 // Same query written as Snappy-compressed ORC for the format comparison
 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_genome','NA12878', 'blocks')").write.format("orc").option("compression","snappy").save("/tmp/fp16yq/data/50snappy.orc")}

Block size tuning

#32MB
# Re-upload the exome BAM with a 32 MB HDFS block size (134217728 B = 128 MB default, /4 = 32 MB)
hadoop fs -mkdir -p  /tmp/fp16yq/data/exome/32MB/
hadoop fs -D dfs.block.size=$((134217728/4)) -put NA12878_exome.bam /tmp/fp16yq/data/exome/32MB/
// Coverage in 'blocks' mode over the 32 MB-block copy of the exome BAM
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
/*bases-blocks*/
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/32MB/*.bam')""")
 // Timed write of the blocks-mode coverage to Parquet
 spark.time{
 ss.sql(s"SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks')").write.format("parquet").save("/tmp/fp16yq/data/32MB_200_4.parquet")}

/*windows - 500*/
// Same setup, but fixed-width 500 bp windows via the extra bdg_coverage argument
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
/*bases-blocks*/
  ss.sql("""
    CREATE TABLE IF NOT EXISTS reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/32MB/*.bam')""")
// Timed write of the 500 bp-window coverage to Parquet
spark.time{
ss.sql(s"SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks', '500')").write.format("parquet").save("/tmp/fp16yq/data/32MB_w500_3.parquet") }

Build mosdepth from source

# Build mosdepth from source inside the official Nim container
# (repo mounted at /usr/src/app; build is run from the interactive shell)
export LD_LIBRARY_PATH=/usr/local/lib
git clone https://github.com/brentp/mosdepth.git
cd mosdepth
docker run --rm  -it -v `pwd`:/usr/src/app -w /usr/src/app nimlang/nim bash

Clone this wiki locally