-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathprocess_vcf.sh
More file actions
121 lines (100 loc) · 3.96 KB
/
process_vcf.sh
File metadata and controls
121 lines (100 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash
set -eo pipefail
declare -a DEPENDENCIES=(parallel bcftools bgzip tabix gatk)
for DEP in ${DEPENDENCIES[@]}; do $(command -v ${DEP} > /dev/null 2> /dev/null) || (echo "Cannot find ${DEP}" >&2; exit 1); done
OUTDIR_DEFAULT="$(pwd -P)/sample_vcfs"
function Usage() {
echo -e "\
1. Filter VCF to only contain biallelic variants
2. Split a VCF containing records for multiple individuals into one VCF per individual and tabix the files \n\
3. For each sample create two fasta ref files for each haplotype
4. For each sample VCF only keep het variants
\n\
Usage: $(basename $0) -v|--vcf in.vcf -f|--fasta genome.fa [-o|--outdir /path/to/outdir] [-s|--samples list_samples.txt]\n\
Where: -v|--vcf is an input VCF file to be split into sample VCF files \n\
-f|--fasta is the genome reference fasta file \n\
[-o|--outdir] is an optional path to an output directory to put sample VCF files \n\
defaults to ${OUTDIR_DEFAULT} \n\
[-s|--samples] is an optional file containing the list of samples to keep from the vcf \n\
defaults to ${SAMPLES_DEFAULT} \n\
" >&2
exit 1
}
export -f Usage
function SplitVCF() {
local vcf="$1"
local outdir="$2"
local donor="$3"
local outname="${outdir}/${donor}.vcf.gz"
(set -x; bcftools view --force-samples -s "${donor}" "${vcf}" | bgzip -c > "${outname}")
(set -x; tabix -p vcf "${outname}")
}
export -f SplitVCF
function generateFA() {
local fasta="$1"
local outdir="$2"
local donor="$3"
local hap="$4"
local faname="${outdir}/${donor}_hap${hap}.fa"
local dictname="${outdir}/${donor}_hap${hap}.dict"
local vcfname="${outdir}/${donor}.vcf.gz"
(set -x; bcftools consensus -f ${fasta} -H ${hap} -o ${faname} ${vcfname})
(set -x; gatk CreateSequenceDictionary -R ${faname} -O ${dictname})
}
export -f generateFA
function keepHet() {
local donor="$1"
local outdir="$2"
local samplename="${outdir}/${donor}.vcf.gz"
local outname="${outdir}/${donor}_het.vcf.gz"
(set -x; bcftools view -m2 -M2 -v snps -Ov -g het ${samplename} | bgzip -c > ${outname})
(set -x; tabix -p vcf ${outname})
}
export -f keepHet
[[ "$#" -lt 1 ]] && Usage
while [[ "$#" -gt 1 ]]; do
case "$1" in
-v|--vcf)
VCF="$2"
shift
;;
-f|--fasta)
GENOME=$2
shift
;;
-o|--outdir)
OUTDIR="$2"
shift
;;
-s|--samples)
SAMPLES="$2"
shift
;;
*)
Usage
;;
esac
shift
done
[[ -z "${VCF}" ]] && Usage
[[ -f "${VCF}" ]] || (echo "Cannot find VCF file ${VCF}" >&2; exit 1)
[[ -z "${GENOME}" ]] && Usage
[[ -f "${GENOME}" ]] || (echo "Cannot find genome fasta reference file ${GENOME}" >&2; exit 1)
[[ -z "${SAMPLES}" ]] && Usage
[[ -f "${SAMPLES}" ]] || (echo "Cannot find samples file ${SAMPLES}" >&2; exit 1)
[[ -z "${OUTDIR}" ]] && OUTDIR="${OUTDIR_DEFAULT}"
(set -x; bcftools norm -d all -Ou -N ${VCF} | \
bcftools view -m2 -M2 -v snps -S ${SAMPLES} -Ov --force-samples | \
bgzip -c > ${VCF/.vcf.gz/_biallelic.vcf.gz}
)
(set -x; tabix ${VCF/.vcf.gz/_biallelic.vcf.gz})
(set -x; mkdir -vp "${OUTDIR}")
#declare -a ALL_SAMPLES=($(set -x; bcftools query --list-samples "${VCF}"))
#[[ ${#ALL_SAMPLES[@]} -lt 1 ]] && (echo "Fewer than 1 sample found in ${VCF/.vcf.gz/_biallelic.vcf.gz}" >&2; exit 1)
declare -a NEW_SAMPLES=($(set -x; bcftools query --list-samples "${VCF/.vcf.gz/_biallelic.vcf.gz}"))
[[ ${#NEW_SAMPLES[@]} -lt 2 ]] && (echo "Fewer than 2 samples found in ${VCF}" >&2; exit 1)
echo "Splitting ${VCF} into ${#NEW_SAMPLES[@]} sample VCF files" >&2
parallel --verbose "SplitVCF ${VCF/.vcf.gz/_biallelic.vcf.gz} ${OUTDIR} {}" ::: ${NEW_SAMPLES[@]}
parallel --verbose "generateFA ${GENOME} ${OUTDIR} {} 1" ::: ${NEW_SAMPLES[@]}
parallel --verbose "generateFA ${GENOME} ${OUTDIR} {} 2" ::: ${NEW_SAMPLES[@]}
parallel --verbose "keepHet {} ${OUTDIR}" ::: ${NEW_SAMPLES[@]}