From 48e9f8e59a9eb3b69571c2f9ce0f03a6008e514f Mon Sep 17 00:00:00 2001 From: "rafiga.masmaliyeva" Date: Tue, 27 Jan 2026 16:05:03 +0100 Subject: [PATCH 1/3] Update README.md to include project overview, features, usage instructions, and output file details --- README.md | 377 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 376 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 16376f2..d98ea65 100644 --- a/README.md +++ b/README.md @@ -1 +1,376 @@ -# genomics-scripts \ No newline at end of file +# genomics-scripts + +A comprehensive toolkit for analyzing genomic structural variants (SVs) across cohorts. This project provides tools for processing VCF files containing SV information and generating unified cohort-level SV catalogs. + +--- + +## Table of Contents + +- [Overview](#overview) +- [Scripts](#scripts) + - [cohort_vcf_SV.py](#cohort_vcf_svpy) +- [Installation](#installation) +- [Usage](#usage) +- [Output Files](#output-files) +- [Functions Reference](#functions-reference) +- [Author](#author) + +--- + +## Overview + +This project processes structural variant (SV) data from multiple VCF files to identify unique deletions across a cohort, calculate allele frequencies, and generate standardized output files for further analysis. + +### Key Features + +- **Multi-sample VCF processing**: Reads VCF files from a directory recursively +- **SV deduplication**: Identifies unique deletions accounting for overlapping coordinates (≥80% overlap) +- **Allele frequency calculation**: Computes allele counts (AC), allele frequencies (AF), and distinguishes heterozygous (het) vs. homozygous (hom) calls +- **Multiple output formats**: Generates VCF, BED, and TSV files for downstream analysis +- **Deletion-specific filtering**: Focuses on deletions (DEL) ≥50 bp + +--- + +## Scripts + +### cohort_vcf_SV.py + +**Purpose**: Processes a cohort of VCF files containing structural variant (SV) data, identifies unique deletions across samples, and generates unified SV catalogs in multiple formats. + +**Author**: Rafiga Masmaliyeva (@RafigaM) +**Email**: rmasmaliyeva@gmail.com +**Date Created**: Tue Nov 19 2024 + +#### Dependencies + +``` +pysam +scikit-allel (imported as allel) +pandas +numpy +``` + +Install dependencies: +```bash +pip install pysam scikit-allel pandas numpy +``` + +#### Main Functions + +##### `get_samples(input)` +Recursively searches a directory for all VCF files. + +**Parameters:** +- `input` (str): Path to directory containing VCF files + +**Returns:** +- `list`: List of file paths to all `.vcf` files found + +--- + +##### `convert_array_to_scalar(arr)` +Converts a genotype array to VCF genotype format (e.g., `[0, 1]` → `"0/1"`). + +**Parameters:** +- `arr` (array): Array with two genotype values + +**Returns:** +- `str`: Genotype string in format "allele1/allele2" + +--- + +##### `find_unique_thresholds_with_indexes(thresholds, chr)` +Identifies unique SV positions by clustering overlapping intervals on the same chromosome. Uses 80% overlap threshold to group similar deletions across samples. + +**Parameters:** +- `thresholds` (list): List of tuples `(start, end)` representing deletion positions +- `chr` (str): Chromosome name + +**Returns:** +- `tuple`: + - `unique_thresholds` (list): List of non-overlapping representative intervals + - `threshold_map` (dict): Maps unique intervals to their "mate" variants with indices + +**Overlap Logic:** +- Computes overlap percentage for each interval pair +- Groups intervals if either has ≥80% overlap with existing unique interval +- Allows clustering of SVs that are similar across samples + +--- + +##### `generate_cohort(input)` +Reads all VCF files in a directory and extracts PASS deletions ≥50 bp. Creates an initial dictionary with variant information. + +**Parameters:** +- `input` (str): Path to directory with VCF files + +**Returns:** +- `defaultdict`: Dictionary with structure: + ``` + { + "chrom:pos:end": { + "CHROM": str, + "POS": int, + "ID": str, + "REF": str, + "ALT": str, + "QUAL": int, + "FILTER": str, + "INFO": str, + "sample": [list of genotypes], + "num": [list of VCF filenames] + } + } + ``` + +**Filtering Criteria:** +- Only PASS variants +- Type: DEL (deletion) +- Length: ≥50 bp + +--- + +##### `unique_sv(my_dict, n)` +Groups similar deletions across the cohort and calculates allele frequency statistics. This is the core function for deduplication and frequency calculation. + +**Parameters:** +- `my_dict` (dict): Output from `generate_cohort()` +- `n` (int): Total number of samples in the cohort + +**Returns:** +- `defaultdict`: Unique SV dictionary with structure: + ``` + { + "chrom:start:end": { + "CHROM": str, + "POS": int, + "end": int, + "ID": str, + "SVTYPE": str, + "SVLEN": int, + "FILTER": str, + "QUAL": int, + "GT": [list of genotypes], + "sample": [list of sample filenames], + "mates": [list of overlapping variants], + "ac": int (allele count), + "ac_het": int (heterozygous count), + "ac_hom": int (homozygous count), + "af": float (allele frequency) + } + } + ``` + +**Filtering:** +- Processes all 24 chromosomes (chr1-chr22, chrX, chrY) +- Only includes deletions <1,000,000 bp (1 Mb) +- Clusters overlapping variants + +**Calculations:** +- **AC_het**: Number of heterozygous samples (0/1 genotype) +- **AC_hom**: Number of homozygous samples (1/1 genotype) +- **AC**: Total allele count = AC_het + (2 × AC_hom) +- **AF**: Allele frequency = AC / (n_samples × 2) + +--- + +##### `find_pos(mates)` +Finds the longest interval among mate variants (overlapping SVs representing the same deletion). + +**Parameters:** +- `mates` (list/tuple): List of `(start, end)` coordinate pairs + +**Returns:** +- `tuple`: `(longest_start, longest_end)` - the longest interval + +--- + +##### `find_gt(gt_list)` +Determines the most common genotype across all samples for a variant. + +**Parameters:** +- `gt_list` (list): List of genotype strings + +**Returns:** +- `str`: Most frequently occurring genotype + +--- + +##### `write2vcf(output_file, converted_dict)` +Writes unique SVs to a VCF format file with proper headers. + +**Parameters:** +- `output_file` (str): Output VCF filename +- `converted_dict` (list): List of SV dictionaries (one per variant) + +**Output Format:** VCF 4.2 with INFO fields: +- `AF`: Allele frequency +- `AC`: Allele count +- `AC_het`: Heterozygous count +- `AC_hom`: Homozygous count +- `END`: Deletion end position +- `SVTYPE`: Structural variant type +- `SVLEN`: Structural variant length + +--- + +##### `write2bed(output_file, inner_dict)` +Writes SV data to a TSV (tab-separated) file. + +**Parameters:** +- `output_file` (str): Output TSV filename +- `inner_dict` (list): List of SV dictionaries + +--- + +##### `write2bed2(output_file, converted_dict)` +Writes a simplified BED-format file with core SV information. + +**Parameters:** +- `output_file` (str): Output BED filename +- `converted_dict` (list): List of SV dictionaries + +**BED Format Columns:** +1. CHROM: Chromosome +2. POS: Start position +3. end: End position +4. SVTYPE: Type of SV (e.g., DEL) +5. af: Allele frequency + +--- + +#### Usage + +##### Basic Usage + +1. **Prepare your VCF files** in a directory (e.g., `vcf_files/`) +2. **Edit the main() function** to configure: + - `folder`: Directory containing VCF files + - `n`: Total number of samples in your cohort + - `output_*`: Output file names + +3. **Run the script**: +```bash +python cohort_vcf_SV.py +``` + +##### Example Configuration + +```python +def main(): + folder = "vcf_files" # Directory with VCF files + output_vcf = "cohort_del.vcf" # Output VCF file + output_bed0 = "cohort_del.bed" # Output TSV file + output_bed = "cohort_del_all.bed" # Output BED file + + my_dict = generate_cohort(folder) + n = 1000 # Set to your actual cohort size + unique_dict = unique_sv(my_dict, n) + + # Convert to records format for writing + unique_normal_dict = {key: value for key, value in unique_dict.items()} + unique_df = pd.DataFrame.from_dict(unique_normal_dict, orient='index') + unique_converted_dict = unique_df.to_dict(orient='records') + + # Generate output files + write2vcf(output_vcf, unique_converted_dict) + write2bed(output_bed0, unique_converted_dict) + write2bed2(output_bed, unique_converted_dict) +``` + +--- + +## Output Files + +The script generates three output files: + +### 1. **cohort_del.vcf** +Standard VCF 4.2 format containing unique deletions with population statistics. + +**Fields:** +- CHROM, POS, ID, REF, ALT, QUAL, FILTER +- INFO: AF, AC, AC_het, AC_hom, END, SVTYPE, SVLEN +- FORMAT: GT, FT, GQ +- Sample genotypes + +### 2. **cohort_del.bed** (TSV format) +Tab-separated file with all SV information converted from dictionaries. + +### 3. **cohort_del_all.bed** (BED format) +Simplified BED format with essential SV coordinates and frequency: +``` +chr1 1000 5000 DEL 0.025 +chr2 20000 22000 DEL 0.010 +... +``` + +--- + +## Input Requirements + +### VCF File Format + +Input VCF files should contain: +- **FILTER field**: Marked as PASS for variants to include +- **SVTYPE**: Must be "DEL" for deletion +- **SVLEN**: Deletion length (must be ≥50 bp) +- **END**: Deletion end position +- **Genotype data**: GT field with call data + +**Supported VCF versions**: 4.1+ + +--- + +## Algorithm Summary + +### Workflow + +1. **Read VCFs**: Recursively scan directory for `.vcf` files +2. **Filter variants**: Keep only PASS, DEL, ≥50 bp +3. **Cluster deletions**: Group overlapping variants (≥80% overlap) per chromosome +4. **Calculate statistics**: + - Count heterozygous and homozygous carriers + - Compute allele counts and frequencies + - Identify longest interval among mates +5. **Output results**: Write to VCF, BED, and TSV formats + +### Overlap Threshold + +Two intervals are considered the same deletion if: +- Overlap percentage for interval 1 ≥ 80%, OR +- Overlap percentage for interval 2 ≥ 80% + +This allows flexible clustering of imprecise SV boundaries. + +--- + +## Notes and Limitations + +- **Maximum SV size**: 1,000,000 bp (filtered in `unique_sv()`) +- **Chromosome handling**: Currently supports hg38-style chromosome names (chr1-chr22, chrX, chrY) +- **Genotype parsing**: Expects standard VCF genotype format (0/0, 0/1, 1/1, etc.) +- **Sample count**: Must specify correct `n` value for accurate AF calculation +- **Performance**: Runtime scales with number of VCF files and total variants + +--- + +## Installation & Setup + +1. Clone the repository +2. Install Python 3.6+ +3. Install dependencies: + ```bash + pip install pysam scikit-allel pandas numpy + ``` +4. Organize your VCF files in a single directory +5. Update file paths in `main()` function +6. Run: `python cohort_SV/cohort_vcf_SV.py` + +--- + +## Author + +**Rafiga Masmaliyeva** +Twitter/GitHub: @RafigaM +Email: rmasmaliyeva@gmail.com +Created: November 19, 2024 From 77be3635861d0ea9ea9ee8ce70aa0bbff3195092 Mon Sep 17 00:00:00 2001 From: "rafiga.masmaliyeva" Date: Tue, 27 Jan 2026 16:07:25 +0100 Subject: [PATCH 2/3] Update README.md to include project overview, features, usage instructions, and output file details --- .idea/.gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .idea/.gitignore diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..ab1f416 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,10 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Ignored default folder with query files +/queries/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ From 162a34c6e106e6afaf23d67616ad6113a46bdfcc Mon Sep 17 00:00:00 2001 From: "rafiga.masmaliyeva" Date: Tue, 27 Jan 2026 16:44:37 +0100 Subject: [PATCH 3/3] Enhance cohort_vcf_SV.py with detailed function docstrings and comments for clarity --- cohort_SV/cohort_vcf_SV.py | 40 +++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/cohort_SV/cohort_vcf_SV.py b/cohort_SV/cohort_vcf_SV.py index 6e7af0e..2b6da12 100644 --- a/cohort_SV/cohort_vcf_SV.py +++ b/cohort_SV/cohort_vcf_SV.py @@ -25,24 +25,25 @@ def convert_array_to_scalar(arr): + # Convert a diploid GT array into a "0/1" style string. gt = str(arr[0]) + "/" + str(arr[1]) return gt def get_samples(input): - #print(input) + # Collect all VCF file paths under the input directory. samples = [] for subdir, dirs, files in os.walk(input): for file in files: filepath = subdir + os.sep + file if filepath.endswith(".vcf"): - #print(filepath) samples.append(filepath) return samples def find_unique_thresholds_with_indexes(thresholds, chr): + # Merge near-overlapping (>=80% overlap) intervals and keep mates per merged interval. unique_thresholds = [] threshold_map = {} @@ -57,7 +58,7 @@ def has_significant_overlap(t1, t2): percent_overlap_1 = overlap / (high1 - low1) percent_overlap_2 = overlap / (high2 - low2) - # If either percentage is >= 20%, return True + # If either percentage is >= 80%, consider as the same SV event. return percent_overlap_1 >= 0.8 or percent_overlap_2 >= 0.8 for index, threshold in enumerate(thresholds): @@ -80,6 +81,21 @@ def has_significant_overlap(t1, t2): def generate_cohort(input): + """Parse VCFs under a directory and collect passing DEL calls. + + This function scans the input directory recursively for `.vcf` files, + reads each VCF with scikit-allel, and aggregates passing deletion (DEL) + records longer than 50 bp into a single dictionary keyed by + "{CHROM}:{POS}:{END}". + + Args: + input (str): Directory containing VCF files (recursively searched). + + Returns: + collections.defaultdict: Mapping of variant keys to a dict with fields: + CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, sample (GTs), num (VCF name). + """ + # Parse all VCFs and collect passing DEL calls into a unified dictionary. samples = get_samples(input) my_dict = defaultdict( lambda: {"CHROM": [], "POS": [], "ID": [], "REF": [], "ALT": [], "QUAL": [], "FILTER": [], "INFO": [], "sample": [], "num": []}) @@ -89,27 +105,20 @@ def generate_cohort(input): l = len(callset['variants/POS']) for record in range(l): FT = callset['variants/FILTER_PASS'][record] - #imprecise = callset['variants/IMPRECISE'][record] if FT == True: - #if imprecise == False: pos = callset['variants/POS'][record] sv_len = callset['variants/SVLEN'][record] end = pos + abs(sv_len) - #end = callset['variants/END'][record] - #sv_len = abs(int(pos) - int(end)) ### if len(callset) > 0: - #sv_len = callset['variants/SVLEN'][record] SV_type = str(callset['variants/SVTYPE'][record]) if SV_type == "DEL" and abs(sv_len) > 50: chrom = callset['variants/CHROM'][record] - #pos = callset['variants/POS'][record] ref = callset['variants/REF'][record] alt = str(callset['variants/ALT'][record][0]) - #end = callset['variants/END'][record] variant_key = f"{chrom}:{pos}:{end}" id = chrom + "_" + str(pos) + "_" + str(end) qual = callset['variants/QUAL'][record] - # Dict to store all varinats + # Store per-variant details and sample GT. my_dict[variant_key]['CHROM'] = chrom my_dict[variant_key]['POS'] = pos my_dict[variant_key]['ID'] = id @@ -128,6 +137,7 @@ def generate_cohort(input): return my_dict def write2vcf(output_file, converted_dict): + # Write a minimal cohort VCF containing aggregated SV records. vcf_header = [ "##fileformat=VCFv4.2", "##source=custom_script", @@ -166,12 +176,14 @@ def write2vcf(output_file, converted_dict): def write2bed(output_file, inner_dict): + # Dump all records as BED-like TSV without header. print(output_file) my_df = pd.DataFrame.from_dict(inner_dict) my_df.to_csv(output_file, sep='\t', index=False, header=False) def find_pos(mates): + # Pick the widest mate interval to represent the merged SV. mates = list(mates) longest_threshold = 0 longest_pair = None @@ -184,6 +196,7 @@ def find_pos(mates): return(longest_pair) def find_gt(gt_list): + # Return the most common genotype among samples. counts = Counter(gt_list) most_popular = counts.most_common(1)[0] @@ -191,6 +204,7 @@ def find_gt(gt_list): def write2bed2(output_file, converted_dict): + # Write a compact BED with CHROM, POS, END, SVTYPE, AF. with open(output_file, 'w') as file: for inner_dict in converted_dict: vcf_row = [ @@ -204,6 +218,7 @@ def write2bed2(output_file, converted_dict): def unique_sv(my_dict, n): + # Merge overlapping SVs across samples and compute cohort-level stats. n_samples = n dict_unique = defaultdict( lambda: {"CHROM": [], "POS": [], "end": [], "ID": [], "SVTYPE": [], "SVLEN": [], "FILTER": [], "QUAL": [], @@ -239,6 +254,7 @@ def unique_sv(my_dict, n): l = len(mychr[i][1]) for ii in range(l): index = mychr[i][1][ii][0] + # Aggregate GTs and sample names from all mate SVs. dict_unique[key]['GT'].append(my_dict[index]['sample']) dict_unique[key]['sample'].append(my_dict[index]['num']) dict_unique[key]['mates'].append(mychr[i][1][ii][1]) @@ -258,6 +274,7 @@ def unique_sv(my_dict, n): continue dict_unique[key]['ac_het'] = n_het dict_unique[key]['ac_hom'] = n_hom + # Allele frequency: (hets + 2*homs) / (2 * n_samples). af = (n_het + (2 * n_hom)) / (n_samples * 2) dict_unique[key]['af'] = af dict_unique[key]['ac'] = (n_het + (2 * n_hom)) @@ -272,6 +289,7 @@ def unique_sv(my_dict, n): def main(): + # Input/output configuration. folder = "vcf_files" output_vcf = "cohort_del.vcf" output_bed0 = "cohort_del.bed"