This repository was archived by the owner on Nov 28, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
Coverage experiments
Marek Wiewiórka edited this page Nov 18, 2018
·
20 revisions
# Cluster run (YARN).
# Cleanup: purge cached SNAPSHOT artifacts so spark-shell re-resolves the
# latest bdg-sequila build from the snapshot repository.
rm -rf ~/.ivy2/cache/org.biodatageeks/bdg-sequila_2.11/*
rm -rf ~/.ivy2/jars/org.biodatageeks_bdg-sequila_2.11*
# Point Spark at the cluster's Hadoop/YARN configuration.
export HADOOP_CONF_DIR=/etc/hadoop/conf
cd /data/local/opt/spark-2.2.1-bin-hadoop2.7/bin
# Start spark-shell on YARN (40 executors, 4g driver/executor), pulling
# bdg-sequila 0.4.1-SNAPSHOT from the zsibio snapshot repo; the in-memory
# catalog avoids a Hive metastore dependency.
./spark-shell --conf "spark.sql.catalogImplementation=in-memory" --master=yarn-client --driver-memory=4g --executor-memory=4g --num-executors=40 --packages org.biodatageeks:bdg-sequila_2.11:0.4.1-SNAPSHOT --repositories https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/ -v
# Local testing (single machine).
# Cleanup: purge cached SNAPSHOT artifacts first (same as the cluster run).
rm -rf ~/.ivy2/cache/org.biodatageeks/bdg-sequila_2.11/*
rm -rf ~/.ivy2/jars/org.biodatageeks_bdg-sequila_2.11*
spark-shell --master=local[2] --driver-memory=4g --packages org.biodatageeks:bdg-sequila_2.11:0.4.1-SNAPSHOT --repositories https://zsibio.ii.pw.edu.pl/nexus/repository/maven-snapshots/ -v

import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
// Wrap the plain SparkSession so SeQuiLa's SQL extensions are available.
val ss = SequilaSession(spark)
/* Inject the bdg-granges strategy (custom physical plans) into the session. */
SequilaRegister.register(ss)
// Expose the exome BAM as a SQL table via the BAM data source.
ss.sql("""
CREATE TABLE reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/data/granges/NA12878.ga2.exome.maq.recal.bam')""")
// Baseline timing: built-in JDK inflater, no spark-bam reader.
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","false")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
spark.time{
ss.sql("select * from bdg_coverage('reads_exome','NA12878.ga2.exome','blocks')").count
}
// GKL-inflater timing, 'mosdepth'-style coverage.
// NOTE(review): useSparkBAM is also flipped to true here, so this timing
// changes two variables (inflater AND reader) vs. the baseline — confirm intended.
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","true")
spark.time{
ss.sql("select * from bdg_coverage('reads_exome','NA12878.ga2.exome','mosdepth')").count
}
// Persist block-level coverage into a Parquet-backed table.
spark.time{
ss.sql("create table cov_ex stored as parquet as select * from bdg_coverage('reads_exome','NA12878.ga2.exome','bdg','blocks')")
}
// Whole-genome BAM registered the same way, then a timed coverage scan.
ss.sql("""
CREATE TABLE reads_genome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/data/granges/NA12878.hiseq.wgs.bwa.recal.bam')""")
spark.time{
ss.sql("select * from bdg_coverage('reads_genome','NA12878.hiseq','bdg')").count }
### Piping
# Stream the WGS BAM from the 1000 Genomes FTP directly into HDFS (no local temp file).
curl ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20101201_cg_NA12878/NA12878.hiseq.wgs.bwa.recal.bam | hdfs dfs -put - /tmp/fp16yq/data/genome/NA12878.bam
/* ORC vs Parquet comparison — Parquet + Snappy run. */
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
// Fresh SeQuiLa session with extensions registered.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
// GKL inflater on, spark-bam reader off.
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
ss.sql("""
CREATE TABLE IF NOT EXISTS reads_genome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/genome/*.bam')""")
// Time: compute block coverage and write it out as Snappy-compressed Parquet.
spark.time{
ss.sql(s"SELECT * FROM bdg_coverage('reads_genome','NA12878', 'blocks')").write.format("parquet").option("compression","snappy").save("/tmp/fp16yq/data/50snappy.parquet")}
/* ORC vs Parquet comparison — ORC + Snappy run (same pipeline, different format). */
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
// Same reader settings as the Parquet run so only the output format differs.
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
ss.sql("""
CREATE TABLE IF NOT EXISTS reads_genome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/genome/*.bam')""")
spark.time{
ss.sql(s"SELECT * FROM bdg_coverage('reads_genome','NA12878', 'blocks')").write.format("orc").option("compression","snappy").save("/tmp/fp16yq/data/50snappy.orc")}

#32MB
# HDFS target directory for the 32 MB-block copy of the exome BAM.
hadoop fs -mkdir -p /tmp/fp16yq/data/exome/32MB/
hadoop fs -D dfs.block.size=$((134217728/4)) -put NA12878_exome.bam /tmp/fp16yq/data/exome/32MB/

import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
// SeQuiLa session over the 32 MB-block copy of the exome BAM.
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
/* Blocks-mode coverage over the 32 MB HDFS-block layout. */
ss.sql("""
CREATE TABLE IF NOT EXISTS reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/32MB/*.bam')""")
// Time the coverage computation and write the result as Parquet.
spark.time{
ss.sql(s"SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks')").write.format("parquet").save("/tmp/fp16yq/data/32MB_200_4.parquet")}
/* Fixed windows of 500 bp. */
import org.apache.spark.sql.SequilaSession
import org.biodatageeks.utils.{SequilaRegister, UDFRegister}
val ss = SequilaSession(spark)
SequilaRegister.register(ss)
ss.sqlContext.setConf("spark.biodatageeks.bam.useGKLInflate","true")
ss.sqlContext.setConf("spark.biodatageeks.bam.useSparkBAM","false")
/* Same table/settings as the blocks run; only the coverage mode differs. */
ss.sql("""
CREATE TABLE IF NOT EXISTS reads_exome USING org.biodatageeks.datasources.BAM.BAMDataSource OPTIONS(path '/tmp/fp16yq/data/exome/32MB/*.bam')""")
spark.time{
ss.sql(s"SELECT * FROM bdg_coverage('reads_exome','NA12878', 'blocks', '500')").write.format("parquet").save("/tmp/fp16yq/data/32MB_w500_3.parquet") }

export LD_LIBRARY_PATH=/usr/local/lib
# Build the mosdepth coverage tool from source for reference/comparison runs.
git clone https://github.com/brentp/mosdepth.git
cd mosdepth
# Interactive Nim build container with the checkout mounted at /usr/src/app.
docker run --rm -it -v `pwd`:/usr/src/app -w /usr/src/app nimlang/nim bash