diff --git a/doc/plots/iGenVar_only/results.DEL.png b/doc/plots/iGenVar_only/results.DEL.png index 0b08a4bf..ee332b74 100644 Binary files a/doc/plots/iGenVar_only/results.DEL.png and b/doc/plots/iGenVar_only/results.DEL.png differ diff --git a/doc/plots/iGenVar_only/results.INS.png b/doc/plots/iGenVar_only/results.INS.png index cccb12ad..0a982226 100644 Binary files a/doc/plots/iGenVar_only/results.INS.png and b/doc/plots/iGenVar_only/results.INS.png differ diff --git a/doc/plots/iGenVar_only/results.all.png b/doc/plots/iGenVar_only/results.all.png index e46a5681..cd36aeb3 100644 Binary files a/doc/plots/iGenVar_only/results.all.png and b/doc/plots/iGenVar_only/results.all.png differ diff --git a/doc/plots/iGenVar_only/results.no_DUP_and_INV.png b/doc/plots/iGenVar_only/results.no_DUP_and_INV.png index 69f61d2b..a3422c4d 100644 Binary files a/doc/plots/iGenVar_only/results.no_DUP_and_INV.png and b/doc/plots/iGenVar_only/results.no_DUP_and_INV.png differ diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/callers.smk b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/callers.smk index 300ba8e4..b37a14ed 100644 --- a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/callers.smk +++ b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/callers.smk @@ -1,4 +1,5 @@ -sample = config["parameters"]["sample"], +sample_HG002 = config["parameters"]["sample_HG002"], +sample_NA12878 = config["parameters"]["sample_NA12878"], min_var_length = config["parameters"]["min_var_length"], max_var_length = config["parameters"]["max_var_length"] @@ -13,14 +14,28 @@ rule run_iGenVar: short_bam = config["short_read_bam"]["s1"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'S2': # Illumina Mate Pair short_bam = config["short_read_bam"]["s2"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ + --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} + """) + elif wildcards.input_combination == 'S3': # Illumina HiSeq + short_bam = config["short_read_bam"]["s3"] + shell(""" + /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ + --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} + """) + elif wildcards.input_combination == 'S4': # NA12878 WGS GRCh38 + short_bam = config["short_read_bam"]["s4"] + shell(""" + /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ + --output {output.vcf} --vcf_sample_name {sample_NA12878} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'S1L1': # Illumina Paired End & MtSinai PacBio @@ -28,7 +43,7 @@ rule run_iGenVar: long_bam = config["long_read_bam"]["l1"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'S2L1': # Illumina Mate Pair & MtSinai PacBio @@ -36,7 +51,15 @@ rule run_iGenVar: long_bam = config["long_read_bam"]["l1"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ + --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} + """) + elif wildcards.input_combination == 'S3L1': # Illumina HiSeq & MtSinai PacBio + short_bam = config["short_read_bam"]["s3"], + long_bam = config["long_read_bam"]["l1"] + shell(""" + /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'S1L2': # Illumina Paired End & PacBio CCS @@ -44,7 +67,7 @@ rule run_iGenVar: long_bam = config["long_read_bam"]["l2"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'S2L2': # Illumina Mate Pair & PacBio CCS @@ -52,7 +75,15 @@ rule run_iGenVar: long_bam = config["long_read_bam"]["l2"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ + --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} + """) + elif wildcards.input_combination == 'S3L2': # Illumina HiSeq & PacBio CCS + short_bam = config["short_read_bam"]["s3"], + long_bam = config["long_read_bam"]["l2"] + shell(""" + /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'S1L3': # Illumina Paired End & 10X Genomics @@ -60,7 +91,7 @@ rule run_iGenVar: long_bam = config["long_read_bam"]["l3"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'S2L3': # Illumina Mate Pair & 10X Genomics @@ -68,56 +99,64 @@ rule run_iGenVar: long_bam = config["long_read_bam"]["l3"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ + --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} + """) + elif wildcards.input_combination == 'S3L3': # Illumina HiSeq & 10X Genomics + short_bam = config["short_read_bam"]["s3"], + long_bam = config["long_read_bam"]["l3"] + shell(""" + /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} --input_long_reads {long_bam} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'L1': # MtSinai PacBio long_bam = config["long_read_bam"]["l1"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'L2': # PacBio CCS long_bam = config["long_read_bam"]["l2"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'L3': # 10X Genomics long_bam = config["long_read_bam"]["l3"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_long_reads {long_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'hg38_Sim_default': # Illumina short_bam = config["simulated_short_read_bam"]["sim1"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'hg38_Sim_InDel': # Illumina short_bam = config["simulated_short_read_bam"]["sim2"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) elif wildcards.input_combination == 'hg38_Sim_noSNP': # Illumina short_bam = config["simulated_short_read_bam"]["sim3"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) else: # wildcards.input_combination == 'hg38_Sim_SNPandSV': # Illumina short_bam = config["simulated_short_read_bam"]["sim4"] shell(""" /usr/bin/time -v ./build/iGenVar/bin/iGenVar --input_short_reads {short_bam} \ - --output {output.vcf} --vcf_sample_name {sample} --threads {threads} \ + --output {output.vcf} --vcf_sample_name {sample_HG002} --threads {threads} \ --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1 &>> {log} """) # Defaults: diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.DEL.smk b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.DEL.smk index 9c97f2da..fecef7b8 100644 --- a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.DEL.smk +++ b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.DEL.smk @@ -30,8 +30,8 @@ rule truvari_DEL: "logs/truvari/truvari_output.{input_combination}.{min_qual}.log" params: output_dir = "results/caller_comparison_iGenVar_only/eval/{input_combination}/DEL.min_qual_{min_qual}", - truth_set_gz = config["truth_set_DEL_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_DEL_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] shell: """ rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ @@ -58,6 +58,10 @@ rule cat_truvari_results_DEL: min_qual = min_qual_iGenVar), S2 = expand("results/caller_comparison_iGenVar_only/eval/S2/DEL.min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), + S3 = expand("results/caller_comparison_iGenVar_only/eval/S3/min_qual_{min_qual}/pr_rec.txt", + min_qual = min_qual_iGenVar), + # S4 = expand("results/caller_comparison_iGenVar_only/eval/S4/min_qual_{min_qual}/pr_rec.txt", + # min_qual = min_qual_iGenVar), L1 = expand("results/caller_comparison_iGenVar_only/eval/L1/DEL.min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), L2 = expand("results/caller_comparison_iGenVar_only/eval/L2/DEL.min_qual_{min_qual}/pr_rec.txt", @@ -67,6 +71,8 @@ rule cat_truvari_results_DEL: output: S1 = temp("results/caller_comparison_iGenVar_only/eval/S1.DEL_results.txt"), S2 = temp("results/caller_comparison_iGenVar_only/eval/S2.DEL_results.txt"), + S3 = temp("results/caller_comparison_iGenVar_only/eval/S3.all_results.txt"), + # S4 = temp("results/caller_comparison_iGenVar_only/eval/S4.all_results.txt"), L1 = temp("results/caller_comparison_iGenVar_only/eval/L1.DEL_results.txt"), L2 = temp("results/caller_comparison_iGenVar_only/eval/L2.DEL_results.txt"), L3 = temp("results/caller_comparison_iGenVar_only/eval/L3.DEL_results.txt"), @@ -75,10 +81,16 @@ rule cat_truvari_results_DEL: run: shell("cat {input.S1} > {output.S1}") shell("cat {input.S2} > {output.S2}") + shell("cat {input.S3} > {output.S3}") + # shell("cat {input.S4} > {output.S4}") shell("cat {input.L1} > {output.L1}") shell("cat {input.L2} > {output.L2}") shell("cat {input.L3} > {output.L3}") shell(""" - cat {output.S1} {output.S2} \ + cat {output.S1} {output.S2} {output.S3} \ {output.L1} {output.L2} {output.L3} > {output.all} """) + # shell(""" + # cat {output.S1} {output.S2} {output.S3} {output.S4} \ + # {output.L1} {output.L2} {output.L3} > {output.all} + # """) diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.INS.smk b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.INS.smk index 47421159..261ab916 100644 --- a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.INS.smk +++ b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.INS.smk @@ -30,8 +30,8 @@ rule truvari_INS: "logs/truvari/truvari_output.{input_combination}.{min_qual}.log" params: output_dir = "results/caller_comparison_iGenVar_only/eval/{input_combination}/INS.min_qual_{min_qual}", - truth_set_gz = config["truth_set_INS_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_INS_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] shell: """ rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ @@ -58,6 +58,10 @@ rule cat_truvari_results_INS: min_qual = min_qual_iGenVar), S2 = expand("results/caller_comparison_iGenVar_only/eval/S2/INS.min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), + S3 = expand("results/caller_comparison_iGenVar_only/eval/S3/min_qual_{min_qual}/pr_rec.txt", + min_qual = min_qual_iGenVar), + # S4 = expand("results/caller_comparison_iGenVar_only/eval/S4/min_qual_{min_qual}/pr_rec.txt", + # min_qual = min_qual_iGenVar), L1 = expand("results/caller_comparison_iGenVar_only/eval/L1/INS.min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), L2 = expand("results/caller_comparison_iGenVar_only/eval/L2/INS.min_qual_{min_qual}/pr_rec.txt", @@ -67,6 +71,8 @@ rule cat_truvari_results_INS: output: S1 = temp("results/caller_comparison_iGenVar_only/eval/S1.INS_results.txt"), S2 = temp("results/caller_comparison_iGenVar_only/eval/S2.INS_results.txt"), + S3 = temp("results/caller_comparison_iGenVar_only/eval/S3.all_results.txt"), + # S4 = temp("results/caller_comparison_iGenVar_only/eval/S4.all_results.txt"), L1 = temp("results/caller_comparison_iGenVar_only/eval/L1.INS_results.txt"), L2 = temp("results/caller_comparison_iGenVar_only/eval/L2.INS_results.txt"), L3 = temp("results/caller_comparison_iGenVar_only/eval/L3.INS_results.txt"), @@ -75,10 +81,16 @@ rule cat_truvari_results_INS: run: shell("cat {input.S1} > {output.S1}") shell("cat {input.S2} > {output.S2}") + shell("cat {input.S3} > {output.S3}") + # shell("cat {input.S4} > {output.S4}") shell("cat {input.L1} > {output.L1}") shell("cat {input.L2} > {output.L2}") shell("cat {input.L3} > {output.L3}") shell(""" - cat {output.S1} {output.S2} \ + cat {output.S1} {output.S2} {output.S3} \ {output.L1} {output.L2} {output.L3} > {output.all} """) + # shell(""" + # cat {output.S1} {output.S2} {output.S3} {output.S4} \ + # {output.L1} {output.L2} {output.L3} > {output.all} + # """) diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.no_DUP_and_INV.smk b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.no_DUP_and_INV.smk index bfff251b..6c022452 100644 --- a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.no_DUP_and_INV.smk +++ b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.no_DUP_and_INV.smk @@ -57,8 +57,8 @@ rule no_DUP_and_INV_truvari: "logs/truvari/truvari_output.{input_combination}.{min_qual}.log" params: output_dir = "results/caller_comparison_iGenVar_only/eval/{input_combination}/no_DUP_and_INV.min_qual_{min_qual}", - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] shell: """ rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ @@ -85,6 +85,10 @@ rule no_DUP_and_INV_cat_truvari_results_all: min_qual = min_qual_iGenVar), S2 = expand("results/caller_comparison_iGenVar_only/eval/S2/no_DUP_and_INV.min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), + S3 = expand("results/caller_comparison_iGenVar_only/eval/S3/min_qual_{min_qual}/pr_rec.txt", + min_qual = min_qual_iGenVar), + # S4 = expand("results/caller_comparison_iGenVar_only/eval/S4/min_qual_{min_qual}/pr_rec.txt", + # min_qual = min_qual_iGenVar), L1 = expand("results/caller_comparison_iGenVar_only/eval/L1/no_DUP_and_INV.min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), L2 = expand("results/caller_comparison_iGenVar_only/eval/L2/no_DUP_and_INV.min_qual_{min_qual}/pr_rec.txt", @@ -94,6 +98,8 @@ rule no_DUP_and_INV_cat_truvari_results_all: output: S1 = temp("results/caller_comparison_iGenVar_only/eval/S1.no_DUP_and_INV_results.txt"), S2 = temp("results/caller_comparison_iGenVar_only/eval/S2.no_DUP_and_INV_results.txt"), + S3 = temp("results/caller_comparison_iGenVar_only/eval/S3.all_results.txt"), + # S4 = temp("results/caller_comparison_iGenVar_only/eval/S4.all_results.txt"), L1 = temp("results/caller_comparison_iGenVar_only/eval/L1.no_DUP_and_INV_results.txt"), L2 = temp("results/caller_comparison_iGenVar_only/eval/L2.no_DUP_and_INV_results.txt"), L3 = temp("results/caller_comparison_iGenVar_only/eval/L3.no_DUP_and_INV_results.txt"), @@ -102,10 +108,16 @@ rule no_DUP_and_INV_cat_truvari_results_all: run: shell("cat {input.S1} > {output.S1}") shell("cat {input.S2} > {output.S2}") + shell("cat {input.S3} > {output.S3}") + # shell("cat {input.S4} > {output.S4}") shell("cat {input.L1} > {output.L1}") shell("cat {input.L2} > {output.L2}") shell("cat {input.L3} > {output.L3}") shell(""" - cat {output.S1} {output.S2} \ + cat {output.S1} {output.S2} {output.S3} \ {output.L1} {output.L2} {output.L3} > {output.all} """) + # shell(""" + # cat {output.S1} {output.S2} {output.S3} {output.S4} \ + # {output.L1} {output.L2} {output.L3} > {output.all} + # """) diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.smk b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.smk index 126baa40..603f875d 100644 --- a/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.smk +++ b/test/benchmark/caller_comparison_iGenVar_only/workflow/rules/eval.smk @@ -37,15 +37,18 @@ rule truvari: log: "logs/truvari/truvari_output.{input_combination}.{min_qual}.log" params: - output_dir = "results/caller_comparison_iGenVar_only/eval/{input_combination}/min_qual_{min_qual}", - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] - shell: - """ - rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ - --passonly --includebed {params.truth_set_bed} &>> {log} - """ - # -f data/reference/hs37d5.fa + output_dir = "results/caller_comparison_iGenVar_only/eval/{input_combination}/min_qual_{min_qual}" + run: + if wildcards.input_combination == 'S4': # NA12878 WGS GRCh38 + truth_set_gz = config["truth_set_NA12878"]["gz"], + truth_set_bed = config["truth_set_NA12878"]["bed"] + else: + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] + shell(""" + rm -rf {params.output_dir} && truvari bench -b {truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ + --passonly --includebed {truth_set_bed} &>> {log} + """) rule reformat_truvari_results: input: @@ -66,18 +69,28 @@ rule cat_truvari_results_all: min_qual = min_qual_iGenVar), S2 = expand("results/caller_comparison_iGenVar_only/eval/S2/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), + S3 = expand("results/caller_comparison_iGenVar_only/eval/S3/min_qual_{min_qual}/pr_rec.txt", + min_qual = min_qual_iGenVar), + # S4 = expand("results/caller_comparison_iGenVar_only/eval/S4/min_qual_{min_qual}/pr_rec.txt", + # min_qual = min_qual_iGenVar), S1L1 = expand("results/caller_comparison_iGenVar_only/eval/S1L1/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), S2L1 = expand("results/caller_comparison_iGenVar_only/eval/S2L1/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), + S3L1 = expand("results/caller_comparison_iGenVar_only/eval/S3L1/min_qual_{min_qual}/pr_rec.txt", + min_qual = min_qual_iGenVar), S1L2 = expand("results/caller_comparison_iGenVar_only/eval/S1L2/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), S2L2 = expand("results/caller_comparison_iGenVar_only/eval/S2L2/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), + S3L2 = expand("results/caller_comparison_iGenVar_only/eval/S3L2/min_qual_{min_qual}/pr_rec.txt", + min_qual = min_qual_iGenVar), S1L3 = expand("results/caller_comparison_iGenVar_only/eval/S1L3/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), S2L3 = expand("results/caller_comparison_iGenVar_only/eval/S2L3/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), + S3L3 = expand("results/caller_comparison_iGenVar_only/eval/S3L3/min_qual_{min_qual}/pr_rec.txt", + min_qual = min_qual_iGenVar), L1 = expand("results/caller_comparison_iGenVar_only/eval/L1/min_qual_{min_qual}/pr_rec.txt", min_qual = min_qual_iGenVar), L2 = expand("results/caller_comparison_iGenVar_only/eval/L2/min_qual_{min_qual}/pr_rec.txt", @@ -87,12 +100,17 @@ rule cat_truvari_results_all: output: S1 = temp("results/caller_comparison_iGenVar_only/eval/S1.all_results.txt"), S2 = temp("results/caller_comparison_iGenVar_only/eval/S2.all_results.txt"), + S3 = temp("results/caller_comparison_iGenVar_only/eval/S3.all_results.txt"), + # S4 = temp("results/caller_comparison_iGenVar_only/eval/S4.all_results.txt"), S1L1 = temp("results/caller_comparison_iGenVar_only/eval/S1L1.all_results.txt"), S2L1 = temp("results/caller_comparison_iGenVar_only/eval/S2L1.all_results.txt"), + S3L1 = temp("results/caller_comparison_iGenVar_only/eval/S3L1.all_results.txt"), S1L2 = temp("results/caller_comparison_iGenVar_only/eval/S1L2.all_results.txt"), S2L2 = temp("results/caller_comparison_iGenVar_only/eval/S2L2.all_results.txt"), + S3L2 = temp("results/caller_comparison_iGenVar_only/eval/S3L2.all_results.txt"), S1L3 = temp("results/caller_comparison_iGenVar_only/eval/S1L3.all_results.txt"), S2L3 = temp("results/caller_comparison_iGenVar_only/eval/S2L3.all_results.txt"), + S3L3 = temp("results/caller_comparison_iGenVar_only/eval/S3L3.all_results.txt"), L1 = temp("results/caller_comparison_iGenVar_only/eval/L1.all_results.txt"), L2 = temp("results/caller_comparison_iGenVar_only/eval/L2.all_results.txt"), L3 = temp("results/caller_comparison_iGenVar_only/eval/L3.all_results.txt"), @@ -101,17 +119,31 @@ rule cat_truvari_results_all: run: shell("cat {input.S1} > {output.S1}") shell("cat {input.S2} > {output.S2}") + shell("cat {input.S3} > {output.S3}") + # shell("cat {input.S4} > {output.S4}") shell("cat {input.S1L1} > {output.S1L1}") shell("cat {input.S2L1} > {output.S2L1}") + shell("cat {input.S3L1} > {output.S3L1}") shell("cat {input.S1L2} > {output.S1L2}") shell("cat {input.S2L2} > {output.S2L2}") + shell("cat {input.S3L2} > {output.S3L2}") shell("cat {input.S1L3} > {output.S1L3}") shell("cat {input.S2L3} > {output.S2L3}") + shell("cat {input.S3L3} > {output.S3L3}") shell("cat {input.L1} > {output.L1}") shell("cat {input.L2} > {output.L2}") shell("cat {input.L3} > {output.L3}") shell(""" - cat {output.S1} {output.S2} {output.S1L1} {output.S2L1} \ - {output.S1L2} {output.S2L2} {output.S1L3} {output.S2L3} \ + cat {output.S1} {output.S2} {output.S3} \ + {output.S1L1} {output.S2L1} \ + {output.S1L2} {output.S2L2} \ + {output.S1L3} {output.S2L3} \ {output.L1} {output.L2} {output.L3} > {output.all} """) + # shell(""" + # cat {output.S1} {output.S2} {output.S3} {output.S4} \ + # {output.S1L1} {output.S2L1} \ + # {output.S1L2} {output.S2L2} \ + # {output.S1L3} {output.S2L3} \ + # {output.L1} {output.L2} {output.L3} > {output.all} + # """) diff --git a/test/benchmark/caller_comparison_iGenVar_only/workflow/scripts/plot_all_results.R b/test/benchmark/caller_comparison_iGenVar_only/workflow/scripts/plot_all_results.R index 7175f6db..0de5ee7a 100644 --- a/test/benchmark/caller_comparison_iGenVar_only/workflow/scripts/plot_all_results.R +++ b/test/benchmark/caller_comparison_iGenVar_only/workflow/scripts/plot_all_results.R @@ -22,9 +22,10 @@ f1 <- add_column(f1, min_qual = NA, .after="input_combination") # Combine res & f1 total <- rbind(res, f1) total$input_combination = factor(total$input_combination, - levels=c('S1', 'S2', 'S1L1', 'S2L1', 'S1L2', 'S2L2', 'S1L3', 'S2L3', 'L1', 'L2', 'L3', + levels=c('S1', 'S2', 'S3', #'S4', + 'S1L1', 'S2L1', 'S1L2', 'S2L2', 'S1L3', 'S2L3', 'L1', 'L2', 'L3', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9'), - labels=c('Illumina Paired End', 'Illumina Mate Pair', + labels=c('Illumina Paired End', 'Illumina Mate Pair', 'Illumina HiSeq', #'NA12878 WGS GRCh38', 'Illumina Paired End & MtSinai PacBio', 'Illumina Mate Pair & MtSinai PacBio', 'Illumina Paired End & PacBio CCS', 'Illumina Mate Pair & PacBio CCS', 'Illumina Paired End & 10X Genomics', 'Illumina Mate Pair & 10X Genomics', @@ -36,12 +37,12 @@ scale_custom <- list( scale_shape_manual(values = c(15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 46, 46, 46, 46, 46, 46, 46, 46, 46)), # https://www.farb-tabelle.de/de/farbtabelle.htm - scale_color_manual(breaks = c('Illumina Paired End', 'Illumina Mate Pair', + scale_color_manual(breaks = c('Illumina Paired End', 'Illumina Mate Pair', 'Illumina HiSeq', 'NA12878 WGS GRCh38', 'Illumina Paired End & MtSinai PacBio', 'Illumina Mate Pair & MtSinai PacBio', 'Illumina Paired End & PacBio CCS', 'Illumina Mate Pair & PacBio CCS', 'Illumina Paired End & 10X Genomics', 'Illumina Mate Pair & 10X Genomics', 'MtSinai PacBio', 'PacBio CCS', '10X Genomics'), - values = c("chartreuse3", "chartreuse1", + values = c("chartreuse4", "chartreuse3", "chartreuse1", #"green3", "deepskyblue", "dodgerblue1", "dodgerblue2", "dodgerblue3", "dodgerblue4", "darkblue", "firebrick1", "firebrick3", "firebrick4", "tan", "tan", "tan", "tan", "tan", "tan", "tan", "tan", "tan"), diff --git a/test/benchmark/caller_comparison_long_read/workflow/rules/callers.smk b/test/benchmark/caller_comparison_long_read/workflow/rules/callers.smk index 51c863ba..21731e49 100644 --- a/test/benchmark/caller_comparison_long_read/workflow/rules/callers.smk +++ b/test/benchmark/caller_comparison_long_read/workflow/rules/callers.smk @@ -1,4 +1,4 @@ -sample = config["parameters"]["sample"], +sample = config["parameters"]["sample_HG002"], min_var_length = config["parameters"]["min_var_length"], max_var_length = config["parameters"]["max_var_length"] diff --git a/test/benchmark/caller_comparison_long_read/workflow/rules/eval.smk b/test/benchmark/caller_comparison_long_read/workflow/rules/eval.smk index 1fafefea..e0a43763 100644 --- a/test/benchmark/caller_comparison_long_read/workflow/rules/eval.smk +++ b/test/benchmark/caller_comparison_long_read/workflow/rules/eval.smk @@ -46,14 +46,14 @@ rule truvari: output_dir = "results/caller_comparison_long_read/{dataset}/eval/{caller}/min_qual_{min_qual}" run: if wildcards.dataset == 'MtSinai_PacBio': - truth_set_gz = config["truth_set"]["gz"], - truth_set_bed = config["truth_set"]["bed"] + truth_set_gz = config["truth_set_HG002"]["gz"], + truth_set_bed = config["truth_set_HG002"]["bed"] elif wildcards.dataset == 'PacBio_CCS': - truth_set_gz = config["truth_set"]["gz"], - truth_set_bed = config["truth_set"]["bed"] + truth_set_gz = config["truth_set_HG002"]["gz"], + truth_set_bed = config["truth_set_HG002"]["bed"] else: # wildcards.dataset == '10X_Genomics' - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] shell(""" rm -rf {params.output_dir} && truvari bench -b {truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ --passonly --includebed {truth_set_bed} &>> {log} diff --git a/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/callers.smk b/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/callers.smk index 8b0b73a0..ef2b4c89 100644 --- a/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/callers.smk +++ b/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/callers.smk @@ -1,4 +1,5 @@ -sample = config["parameters"]["sample"], +sample = config["parameters"]["sample_HG002"], +sample_NA12878 = config["parameters"]["sample_NA12878"], min_var_length = config["parameters"]["min_var_length"], max_var_length = config["parameters"]["max_var_length"] diff --git a/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.DUP_as_INS.smk b/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.DUP_as_INS.smk index b7daa93e..55103426 100644 --- a/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.DUP_as_INS.smk +++ b/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.DUP_as_INS.smk @@ -30,8 +30,8 @@ rule DUP_as_INS_truvari: "logs/truvari/truvari_output.{dataset}.{long_read_enhancement}.{min_qual}.log" params: output_dir = "results/caller_comparison_long_read_enhancement/{dataset}/eval/{long_read_enhancement}/DUP_as_INS.min_qual_{min_qual}", - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] shell: """ rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ diff --git a/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.smk b/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.smk index 0cbb138e..65771977 100644 --- a/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.smk +++ b/test/benchmark/caller_comparison_long_read_enhancement/workflow/rules/eval.smk @@ -38,8 +38,8 @@ rule truvari: "logs/truvari/truvari_output.{dataset}.{long_read_enhancement}.{min_qual}.log" params: output_dir = "results/caller_comparison_long_read_enhancement/{dataset}/eval/{long_read_enhancement}/min_qual_{min_qual}", - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] shell: """ rm -rf {params.output_dir} && truvari bench -b {params.truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ diff --git a/test/benchmark/caller_comparison_short_read/workflow/rules/callers.smk b/test/benchmark/caller_comparison_short_read/workflow/rules/callers.smk index 290730cc..1ef0f856 100644 --- a/test/benchmark/caller_comparison_short_read/workflow/rules/callers.smk +++ b/test/benchmark/caller_comparison_short_read/workflow/rules/callers.smk @@ -1,4 +1,5 @@ -sample = config["parameters"]["sample"], +sample = config["parameters"]["sample_HG002"], +sample_NA12878 = config["parameters"]["sample_NA12878"], min_var_length = config["parameters"]["min_var_length"], max_var_length = config["parameters"]["max_var_length"] diff --git a/test/benchmark/caller_comparison_short_read/workflow/rules/eval.smk b/test/benchmark/caller_comparison_short_read/workflow/rules/eval.smk index 1e3e0b57..445fcbf3 100644 --- a/test/benchmark/caller_comparison_short_read/workflow/rules/eval.smk +++ b/test/benchmark/caller_comparison_short_read/workflow/rules/eval.smk @@ -71,14 +71,14 @@ rule truvari: elif (wildcards.dataset == 'hg38_Sim_SNPandSV'): truth_set_gz = config["truth_set_simulation_SNPandSV"]["gz"] elif (wildcards.caller == 'GRIDSS'): - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] elif wildcards.dataset == 'Illumina_Paired_End': - truth_set_gz = config["truth_set"]["gz"], - truth_set_bed = config["truth_set"]["bed"] + truth_set_gz = config["truth_set_HG002"]["gz"], + truth_set_bed = config["truth_set_HG002"]["bed"] else: # wildcards.dataset == 'Illumina_Mate_Pair' - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] if (wildcards.dataset == 'Illumina_Paired_End') | (wildcards.dataset == 'Illumina_Mate_Pair'): shell(""" rm -rf {params.output_dir} && truvari bench -b {truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ diff --git a/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/callers.smk b/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/callers.smk index 4ae948cd..79f8f0fc 100644 --- a/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/callers.smk +++ b/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/callers.smk @@ -1,4 +1,5 @@ -sample = config["parameters"]["sample"], +sample = config["parameters"]["sample_HG002"], +sample_NA12878 = config["parameters"]["sample_NA12878"], min_var_length = config["parameters"]["min_var_length"] rule run_Vaquita_LR: diff --git a/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.DUP_as_INS.smk b/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.DUP_as_INS.smk index 6c684d5c..ea694478 100644 --- a/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.DUP_as_INS.smk +++ b/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.DUP_as_INS.smk @@ -33,11 +33,11 @@ rule DUP_as_INS_truvari: output_dir = "results/caller_comparison_vaquita_lr/eval/{input_combination}/DUP_as_INS.min_qual_{min_qual}" run: if (wildcards.input_combination == 'S2') | (wildcards.input_combination == 'L3') | (wildcards.input_combination == 'S2L3'): - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] else: # S1, L1, L2, S1L1, S1L2 - truth_set_gz = config["truth_set"]["gz"], - truth_set_bed = config["truth_set"]["bed"] + truth_set_gz = config["truth_set_HG002"]["gz"], + truth_set_bed = config["truth_set_HG002"]["bed"] shell(""" rm -rf {params.output_dir} && truvari bench -b {truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ --passonly --includebed {truth_set_bed} &>> {log} diff --git a/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.smk b/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.smk index 3cdbbb8f..46b33c40 100644 --- a/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.smk +++ b/test/benchmark/caller_comparison_vaquita_lr/workflow/rules/eval.smk @@ -40,11 +40,11 @@ rule truvari: output_dir = "results/caller_comparison_vaquita_lr/eval/{input_combination}/min_qual_{min_qual}" run: if (wildcards.input_combination == 'S2') | (wildcards.input_combination == 'L3') | (wildcards.input_combination == 'S2L3'): - truth_set_gz = config["truth_set_DEL_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_DEL_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] else: # S1, L1, L2, S1L1, S1L2 - truth_set_gz = config["truth_set_DEL"]["gz"], - truth_set_bed = config["truth_set"]["bed"] + truth_set_gz = config["truth_set_HG002_DEL"]["gz"], + truth_set_bed = config["truth_set_HG002"]["bed"] shell(""" rm -rf {params.output_dir} && truvari bench -b {truth_set_gz} -c {input.vcf} -o {params.output_dir} -p 0 \ --passonly --includebed {truth_set_bed} &>> {log} diff --git a/test/benchmark/config/caller_comparison_config.yaml b/test/benchmark/config/caller_comparison_config.yaml index 04d5643f..24b4d8ca 100644 --- a/test/benchmark/config/caller_comparison_config.yaml +++ b/test/benchmark/config/caller_comparison_config.yaml @@ -3,15 +3,18 @@ ## NIST Stanford Illumina 6kb Mate Pair ### GRCh37 short_read_bam: - s1: data/short_reads/GRCh37/HG002.hs37d5.2x250.sorted.bam # ~40-50x coverage (Average = 65.8713) - s2: data/short_reads/GRCh37/HG002.mate_pair.sorted.bam # 13-17x coverage (Average = 14.1621) + s1: data/short_reads/GRCh37/HG002/HG002.hs37d5.2x250.sorted.bam # ~40-50x coverage (Average = 65.8713) + s2: data/short_reads/GRCh37/HG002/HG002.mate_pair.sorted.bam # 13-17x coverage (Average = 14.1621) + s3: data/short_reads/GRCh37/HG002/HG002.hs37d5.60x.1.sorted.bam # 60x coverage (Average = 59.0516) ### GRCh38 -# short_read_bam: # s1: data/short_reads/GRCh38/HG002.GRCh38.2x250.sorted.bam # s2: data/short_reads/GRCh38/HG002.sorted.bam + s4: data/short_reads/GRCh38/NA12878/NA12878_S1.bam # 50x coverage (Average = 47.6849) short_bam_name: s1: "AshkenazimTrio - HG002 NA24385 Son - NIST Illumina 2x250bps Paired-end" s2: "AshkenazimTrio - HG002 NA24385 Son - NIST Stanford Illumina 6kb Mate Pair" + s3: "AshkenazimTrio - HG002 NA24385 Son - ..." + s4: "NA12878 - ..." # long read data ## MtSinai PacBio (minimap2) @@ -64,7 +67,8 @@ simulated_short_bam_name: ### GRCh37 reference_fa: Illumina_Paired_End: data/reference/GRCh37/hs37d5.fa - Illumina_Mate_Pair: data/reference/GRCh37/hg19.reordered.short.fa + Illumina_Mate_Pair: data/reference/GRCh37/hg19.reordered.fa + Illumina_HiSeq: data/reference/GRCh37/hs37d5.fa MtSinai_PacBio: data/reference/GRCh37/hs37d5.fa PacBio_CCS: data/reference/GRCh37/hs37d5.fa 10X_Genomics: data/reference/GRCh37/hg19.reordered.fa @@ -72,23 +76,24 @@ reference_fa: # Illumina_Paired_End: data/reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fa # Illumina_Mate_Pair: data/reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fa # MtSinai_PacBio: data/reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa + NA12878_WGS_GRCh38: data/reference/GRCh38/hg38.fa # PacBio_CCS: data/reference/GRCh38/??? # 10X_Genomics: data/reference/GRCh38/??? simulation_ref: data/simulation_GRCh38/hg38_chr21.fa -truth_set: +truth_set_HG002: gz: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.vcf.gz bed: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.bed -truth_set_renamed_chr: +truth_set_HG002_renamed_chr: gz: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.renamed_chr.vcf.gz bed: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.renamed_chr.bed -truth_set_INS: +truth_set_HG002_INS: gz: data/truth_set/HG002/INS/HG002_SVs_Tier1_v0.6.INS.vcf.gz -truth_set_INS_renamed_chr: +truth_set_HG002_INS_renamed_chr: gz: data/truth_set/HG002/INS/HG002_SVs_Tier1_v0.6.INS.renamed_chr.vcf.gz -truth_set_DEL: +truth_set_HG002_DEL: gz: data/truth_set/HG002/DEL/HG002_SVs_Tier1_v0.6.DEL.vcf.gz -truth_set_DEL_renamed_chr: +truth_set_HG002_DEL_renamed_chr: gz: data/truth_set/HG002/DEL/HG002_SVs_Tier1_v0.6.DEL.renamed_chr.vcf.gz truth_set_simulation_default: gz: data/simulation_GRCh38/hg38_SV_simulation_default_sorted_fixed.vcf.gz @@ -99,17 +104,21 @@ truth_set_simulation_noSNP: truth_set_simulation_SNPandSV: gz: data/simulation_GRCh38/hg38_SV_simulation_SNPandSV_sorted_fixed.vcf.gz # truth_set: -# gz: data/truth_set/HG002_SVs_Tier1_v0.6.Hg38.vcf.gz -# bed: data/truth_set/HG002_SVs_Tier1_v0.6.Hg38.bed +# gz: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.Hg38.vcf.gz +# bed: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.Hg38.bed # truth_set_renamed_chr: -# gz: data/truth_set/HG002_SVs_Tier1_v0.6.Hg38.renamed_chr.vcf.gz -# bed: data/truth_set/HG002_SVs_Tier1_v0.6.Hg38.renamed_chr.bed +# gz: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.Hg38.renamed_chr.vcf.gz +# bed: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.Hg38.renamed_chr.bed +truth_set_NA12878: + gz: data/truth_set/NA12878/NA12878_S1.vcf.gz + bed: data/truth_set/NA12878/NA12878_S1.ROH.bed # GRIDSS: for hg38 (UCSC chr notation) # blacklist: data/blacklist/ENCFF356LFX.bed parameters: - sample: HG002 + sample_HG002: HG002 + sample_NA12878: NA12878 min_var_length: 40 max_var_length: 1000000 diff --git a/test/benchmark/dataset_downloads.sh b/test/benchmark/dataset_downloads.sh index 4d0f66ab..977f0b56 100644 --- a/test/benchmark/dataset_downloads.sh +++ b/test/benchmark/dataset_downloads.sh @@ -9,13 +9,15 @@ mkdir -p short_reads && cd short_reads # HG002 - NA24385: mkdir -p GRCh37 && cd GRCh37 +mkdir -p HG002 && cd HG002 ## NIST Illumina 2x250bps Paired-end ### reference: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz ### README: .../NIST_Illumina_2x250bps/novoalign_bams/README_update_feb2019 ### coverage: ~40-50x (samtools depth average = 65.8713) ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Illumina_2x250bps/novoalign_bams/HG002.hs37d5.2x250.bam ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Illumina_2x250bps/novoalign_bams/HG002.hs37d5.2x250.bam.bai -cd .. && mkdir -p GRCh38 && cd GRCh38 +cd ../.. && mkdir -p GRCh38 && cd GRCh38 +mkdir -p HG002 && cd HG002 ### reference: GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna ### README: .../NIST_Illumina_2x250bps/novoalign_bams/README_update_feb2019 ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Illumina_2x250bps/novoalign_bams/HG002.GRCh38.2x250.bam @@ -24,19 +26,42 @@ ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Illumina_2x250bps/novoalign_ba ## NIST Stanford Illumina 6kb matepair ### Reads were then mapped to the hg19 reference genome from ucsc or the GRCh38 reference genome with decoy but no alts ### using bwa mem (Li 2013) with default settings, and duplicates were marked using samblaster (Faust 2014). -cd .. && cd GRCh37 +cd ../.. && cd GRCh37/HG002 ### reference: hg19 from ucsc ### README: .../NIST_Stanford_Illumina_6kb_matepair/README.NIST_Stanford_Illumina_6kb_matepair ### coverage: 13-17x (samtools depth average = 14.1621) ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Stanford_Illumina_6kb_matepair/bams/HG002.mate_pair.sorted.bam ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Stanford_Illumina_6kb_matepair/bams/HG002.mate_pair.sorted.bam.bai -cd .. && cd GRCh38 +cd ../.. && cd GRCh38/HG002 ### reference: GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna ### README: .../NIST_Stanford_Illumina_6kb_matepair/README.NIST_Stanford_Illumina_6kb_matepair ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Stanford_Illumina_6kb_matepair/bams/GRCh38/HG002.sorted.bam ${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_Stanford_Illumina_6kb_matepair/bams/GRCh38/HG002.sorted.bam.bai -cd ../.. +## NIST HiSeq - NHGRI Illumina 300X +cd ../.. && cd GRCh37/HG002 +### README: .../NIST_HiSeq_HG002_Homogeneity-10953946/README_NIST_Illumina_pairedend_HG002.txt +### coverage: 60x (samtools depth average = 59.0516) +${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.hs37d5.60x.1.bam +${WGET} ${NCBI}/${ReferenceSamples}/${HG002}/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.hs37d5.60x.1.bam.bai +cd ../.. && cd GRCh38/HG002 +### README: .../NIST_HiSeq_HG002_Homogeneity-10953946/README_NIST_Illumina_pairedend_HG002.txt +${WGET} ${NCBI}/${ReferenceSamples}/${HG002}NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.GRCh38.60x.1.bam +${WGET} ${NCBI}/${ReferenceSamples}/${HG002}NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.GRCh38.60x.1.bam.bai + +cd .. + +mkdir -p NA12878 && cd NA12878 + +## NA12878 WGS GRCh38 - Platinum Genomes +### Project HiSeq 2000: TruSeq PCR-Free (Platinum Genomes) +### coverage: 50x (samtools depth average = 47.6849) +curl 'https://basespace-data-east.s3-external-1.amazonaws.com/47c60b82082847db8cf4a7d900e530b4/NA12878_S1.bam?AWSAccessKeyId=AKIARPYQJSWQRDJKAWUT&Expires=1665567801&response-content-disposition=attachment%3Bfilename%3DNA12878_S1.bam&response-content-type=application%2Foctet-stream&Signature=ca6d7EvQOvop49gEh%2Fh6pTs2HIQ%3D' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:102.0) Gecko/20100101 Firefox/102.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: de,en-US;q=0.7,en;q=0.3' -H 'Accept-Encoding: gzip, deflate, br' -H 'Referer: https://basespace.illumina.com/' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' -H 'Sec-Fetch-User: ?1' \ + --output NA12878_S1.bam +curl 'https://basespace-data-east.s3-external-1.amazonaws.com/47c60b82082847db8cf4a7d900e530b4/NA12878_S1.bam.bai?AWSAccessKeyId=AKIARPYQJSWQRDJKAWUT&Expires=1665568255&response-content-disposition=attachment%3Bfilename%3DNA12878_S1.bam.bai&response-content-type=application%2Foctet-stream&Signature=PQNrr3brzcLKkqZRgh1zw3AJcuM%3D' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:102.0) Gecko/20100101 Firefox/102.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: de,en-US;q=0.7,en;q=0.3' -H 'Accept-Encoding: gzip, deflate, br' -H 'Referer: https://basespace.illumina.com/' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' -H 'Sec-Fetch-User: ?1' \ +--output NA12878_S1.bam.bai + +cd ../../.. echo "$(tput setaf 1)$(tput setab 7)------- short reads downloaded (3.1/3.4) --------$(tput sgr 0)" 1>&3 @@ -120,6 +145,7 @@ echo "$(tput setaf 1)$(tput setab 7)------- reference downloaded (3.3/3.4) ----- # -------- -------- get truth set ../../data -------- -------- # mkdir -p truth_set && cd truth_set +mkdir -p HG002 && cd HG002 # HG002 - NA24385: ## GRCh37 @@ -131,10 +157,22 @@ ${WGET} ${NCBI}/${ReferenceSamples}/AshkenazimTrio/analysis/NIST_SVs_Integration # ${WGET} ${NCBI}/${ReferenceSamples}/AshkenazimTrio/analysis/10XGenomics_ChromiumGenome_LongRanger2.2_Supernova2.0.1_04122018/GRCh38/NA24385_300G/NA24385.GRCh38.large_svs.vcf.gz # ${WGET} ${NCBI}/${ReferenceSamples}/AshkenazimTrio/analysis/10XGenomics_ChromiumGenome_LongRanger2.2_Supernova2.0.1_04122018/GRCh38/NA24385_300G/NA24385.GRCh38.large_svs.vcf.gz.tbi +cd .. +mkdir -p NA12878 && cd NA12878 + +## NA12878 WGS GRCh38 - Platinum Genomes +### Project HiSeq 2000: TruSeq PCR-Free (Platinum Genomes) +curl 'https://basespace-data-east.s3-external-1.amazonaws.com/47c60b82082847db8cf4a7d900e530b4/NA12878_S1.vcf.gz?AWSAccessKeyId=AKIARPYQJSWQRDJKAWUT&Expires=1665568301&response-content-disposition=attachment%3Bfilename%3DNA12878_S1.vcf.gz&response-content-type=application%2Fx-gzip&Signature=yF%2Fk2li9bNaCdbQxqoX0BKtIdww%3D' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:102.0) Gecko/20100101 Firefox/102.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: de,en-US;q=0.7,en;q=0.3' -H 'Accept-Encoding: gzip, deflate, br' -H 'Referer: https://basespace.illumina.com/' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' -H 'Sec-Fetch-User: ?1' \ +--output NA12878_S1.vcf.gz +curl 'https://basespace-data-east.s3-external-1.amazonaws.com/47c60b82082847db8cf4a7d900e530b4/NA12878_S1.vcf.gz.tbi?AWSAccessKeyId=AKIARPYQJSWQRDJKAWUT&Expires=1665568395&response-content-disposition=attachment%3Bfilename%3DNA12878_S1.vcf.gz.tbi&response-content-type=application%2Foctet-stream&Signature=nS2QjrLPMsvAgJFWn%2BHjmXfTdWw%3D' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:102.0) Gecko/20100101 Firefox/102.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: de,en-US;q=0.7,en;q=0.3' -H 'Accept-Encoding: gzip, deflate, br' -H 'Referer: https://basespace.illumina.com/' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' -H 'Sec-Fetch-User: ?1' \ +--output NA12878_S1.vcf.gz.tbi +curl 'https://basespace-data-east.s3-external-1.amazonaws.com/47c60b82082847db8cf4a7d900e530b4/NA12878_S1.ROH.bed?AWSAccessKeyId=AKIARPYQJSWQRDJKAWUT&Expires=1665568438&response-content-disposition=attachment%3Bfilename%3DNA12878_S1.ROH.bed&response-content-type=application%2Foctet-stream&Signature=yijtINIcFla%2FLMGZPCKLospTIWI%3D' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:102.0) Gecko/20100101 Firefox/102.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: de,en-US;q=0.7,en;q=0.3' -H 'Accept-Encoding: gzip, deflate, br' -H 'Referer: https://basespace.illumina.com/' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' -H 'Sec-Fetch-User: ?1' \ +--output NA12878_S1.ROH.bed + # HG005 - NA24631: (there is no truth set so far) # ${WGET} ${NCBI}/${ReferenceSamples}/... .vcf.gz # ${WGET} ${NCBI}/${ReferenceSamples}/... .vcf.gz.tbi # ${WGET} ${NCBI}/${ReferenceSamples}/... .bed -cd .. +cd ../.. echo "$(tput setaf 1)$(tput setab 7)------- truth set downloaded (3.4/3.4) --------$(tput sgr 0)" 1>&3 diff --git a/test/benchmark/parameter_benchmarks/Snakefile b/test/benchmark/parameter_benchmarks/Snakefile index 9d386d58..aa31596f 100644 --- a/test/benchmark/parameter_benchmarks/Snakefile +++ b/test/benchmark/parameter_benchmarks/Snakefile @@ -96,8 +96,8 @@ rule truvari: summary = "results/parameter_benchmarks/{dataset}/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}/summary.txt" params: output_dir = "results/parameter_benchmarks/{dataset}/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}", - truth_set_gz = config["truth_set_renamed_chr"]["gz"], - truth_set_bed = config["truth_set_renamed_chr"]["bed"] + truth_set_gz = config["truth_set_HG002_renamed_chr"]["gz"], + truth_set_bed = config["truth_set_HG002_renamed_chr"]["bed"] log: "logs/parameter_benchmarks/truvari/{dataset}/{parameter_name}_{parameter_value}_min_qual_{min_qual}_output.log" run: diff --git a/test/benchmark/prepare_BAM_with_crossmap.sh b/test/benchmark/prepare_BAM_with_crossmap.sh index b216acb4..e2c69043 100644 --- a/test/benchmark/prepare_BAM_with_crossmap.sh +++ b/test/benchmark/prepare_BAM_with_crossmap.sh @@ -26,17 +26,21 @@ less data/reference/GRCh37/hg19.reordered.fa | sed -e '/>chr_1/,$d' >> data/refe echo "$(tput setaf 1)$(tput setab 7)------- reference files prepared (5.2/9.6) --------$(tput sgr 0)" 1>&3 # Illumina Paired End -samtools sort short_reads/GRCh37/HG002.hs37d5.2x250.bam -o short_reads/GRCh37/HG002.hs37d5.2x250.sorted.bam -samtools sort short_reads/GRCh38/HG002.GRCh38.2x250.bam -o short_reads/GRCh38/HG002.GRCh38.2x250.sorted.bam -samtools index short_reads/GRCh37/HG002.hs37d5.2x250.sorted.bam -samtools index short_reads/GRCh38/HG002.GRCh38.2x250.sorted.bam +samtools sort short_reads/GRCh37/HG002/HG002.hs37d5.2x250.bam -o short_reads/GRCh37/HG002/HG002.hs37d5.2x250.sorted.bam +samtools sort short_reads/GRCh38/HG002/HG002.GRCh38.2x250.bam -o short_reads/GRCh38/HG002/HG002.GRCh38.2x250.sorted.bam +samtools index short_reads/GRCh37/HG002/HG002.hs37d5.2x250.sorted.bam +samtools index short_reads/GRCh38/HG002/HG002.GRCh38.2x250.sorted.bam + +# Illumina HiSeq +samtools sort data/short_reads/GRCh37/HG002/HG002.hs37d5.60x.1.bam -o data/short_reads/GRCh37/HG002/HG002.hs37d5.60x.1.sorted.bam +samtools index data/short_reads/GRCh37/HG002/HG002.hs37d5.60x.1.sorted.bam # MtSinai PacBio ## Add MD tag for Sniffles -samtools calmd --threads 16 -Q -b long_reads/GRCh37/HG002_PacBio_GRCh37.bam \ - > long_reads/GRCh37/HG002_PacBio_GRCh37.md.bam reference/GRCh37/hs37d5.fa -samtools calmd --threads 16 -Q -b long_reads/GRCh38/HG002_PacBio_GRCh38.bam \ - > long_reads/GRCh38/HG002_PacBio_GRCh38.md.bam reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa +samtools calmd --threads 16 -Q -b long_reads/GRCh37/HG002/HG002_PacBio_GRCh37.bam \ + > long_reads/GRCh37/HG002/HG002_PacBio_GRCh37.md.bam reference/GRCh37/hs37d5.fa +samtools calmd --threads 16 -Q -b long_reads/GRCh38/HG002/HG002_PacBio_GRCh38.bam \ + > long_reads/GRCh38/HG002/HG002_PacBio_GRCh38.md.bam reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa # PacBio CCS 10kb ## Add MD tag for Sniffles