From 48e9f8e59a9eb3b69571c2f9ce0f03a6008e514f Mon Sep 17 00:00:00 2001
From: "rafiga.masmaliyeva" <rmasmaliyeva@gmail.com>
Date: Tue, 27 Jan 2026 16:05:03 +0100
Subject: [PATCH 1/3] Update README.md to include project overview, features,
 usage instructions, and output file details

---
 README.md | 377 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 376 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 16376f2..d98ea65 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,376 @@
-# genomics-scripts
\ No newline at end of file
+# genomics-scripts
+
+A comprehensive toolkit for analyzing genomic structural variants (SVs) across cohorts. This project provides tools for processing VCF files containing SV information and generating unified cohort-level SV catalogs.
+
+---
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Scripts](#scripts)
+  - [cohort_vcf_SV.py](#cohort_vcf_svpy)
+- [Installation](#installation)
+- [Usage](#usage)
+- [Output Files](#output-files)
+- [Functions Reference](#functions-reference)
+- [Author](#author)
+
+---
+
+## Overview
+
+This project processes structural variant (SV) data from multiple VCF files to identify unique deletions across a cohort, calculate allele frequencies, and generate standardized output files for further analysis.
+
+### Key Features
+
+- **Multi-sample VCF processing**: Reads VCF files from a directory recursively
+- **SV deduplication**: Identifies unique deletions accounting for overlapping coordinates (≥80% overlap)
+- **Allele frequency calculation**: Computes allele counts (AC), allele frequencies (AF), and distinguishes heterozygous (het) vs. homozygous (hom) calls
+- **Multiple output formats**: Generates VCF, BED, and TSV files for downstream analysis
+- **Deletion-specific filtering**: Focuses on deletions (DEL) ≥50 bp
+
+---
+
+## Scripts
+
+### cohort_vcf_SV.py
+
+**Purpose**: Processes a cohort of VCF files containing structural variant (SV) data, identifies unique deletions across samples, and generates unified SV catalogs in multiple formats.
+
+**Author**: Rafiga Masmaliyeva (@RafigaM)  
+**Email**: rmasmaliyeva@gmail.com  
+**Date Created**: Tue Nov 19 2024
+
+#### Dependencies
+
+```
+pysam
+scikit-allel (imported as allel)
+pandas
+numpy
+```
+
+Install dependencies:
+```bash
+pip install pysam scikit-allel pandas numpy
+```
+
+#### Main Functions
+
+##### `get_samples(input)`
+Recursively searches a directory for all VCF files.
+
+**Parameters:**
+- `input` (str): Path to directory containing VCF files
+
+**Returns:**
+- `list`: List of file paths to all `.vcf` files found
+
+---
+
+##### `convert_array_to_scalar(arr)`
+Converts a genotype array to VCF genotype format (e.g., `[0, 1]` → `"0/1"`).
+
+**Parameters:**
+- `arr` (array): Array with two genotype values
+
+**Returns:**
+- `str`: Genotype string in format "allele1/allele2"
+
+---
+
+##### `find_unique_thresholds_with_indexes(thresholds, chr)`
+Identifies unique SV positions by clustering overlapping intervals on the same chromosome. Uses 80% overlap threshold to group similar deletions across samples.
+
+**Parameters:**
+- `thresholds` (list): List of tuples `(start, end)` representing deletion positions
+- `chr` (str): Chromosome name
+
+**Returns:**
+- `tuple`: 
+  - `unique_thresholds` (list): List of non-overlapping representative intervals
+  - `threshold_map` (dict): Maps unique intervals to their "mate" variants with indices
+
+**Overlap Logic:**
+- Computes overlap percentage for each interval pair
+- Groups intervals if either has ≥80% overlap with existing unique interval
+- Allows clustering of SVs that are similar across samples
+
+---
+
+##### `generate_cohort(input)`
+Reads all VCF files in a directory and extracts PASS deletions ≥50 bp. Creates an initial dictionary with variant information.
+
+**Parameters:**
+- `input` (str): Path to directory with VCF files
+
+**Returns:**
+- `defaultdict`: Dictionary with structure:
+  ```
+  {
+    "chrom:pos:end": {
+      "CHROM": str,
+      "POS": int,
+      "ID": str,
+      "REF": str,
+      "ALT": str,
+      "QUAL": int,
+      "FILTER": str,
+      "INFO": str,
+      "sample": [list of genotypes],
+      "num": [list of VCF filenames]
+    }
+  }
+  ```
+
+**Filtering Criteria:**
+- Only PASS variants
+- Type: DEL (deletion)
+- Length: ≥50 bp
+
+---
+
+##### `unique_sv(my_dict, n)`
+Groups similar deletions across the cohort and calculates allele frequency statistics. This is the core function for deduplication and frequency calculation.
+
+**Parameters:**
+- `my_dict` (dict): Output from `generate_cohort()`
+- `n` (int): Total number of samples in the cohort
+
+**Returns:**
+- `defaultdict`: Unique SV dictionary with structure:
+  ```
+  {
+    "chrom:start:end": {
+      "CHROM": str,
+      "POS": int,
+      "end": int,
+      "ID": str,
+      "SVTYPE": str,
+      "SVLEN": int,
+      "FILTER": str,
+      "QUAL": int,
+      "GT": [list of genotypes],
+      "sample": [list of sample filenames],
+      "mates": [list of overlapping variants],
+      "ac": int (allele count),
+      "ac_het": int (heterozygous count),
+      "ac_hom": int (homozygous count),
+      "af": float (allele frequency)
+    }
+  }
+  ```
+
+**Filtering:**
+- Processes all 24 chromosomes (chr1-chr22, chrX, chrY)
+- Only includes deletions <1,000,000 bp (1 Mb)
+- Clusters overlapping variants
+
+**Calculations:**
+- **AC_het**: Number of heterozygous samples (0/1 genotype)
+- **AC_hom**: Number of homozygous samples (1/1 genotype)
+- **AC**: Total allele count = AC_het + (2 × AC_hom)
+- **AF**: Allele frequency = AC / (n_samples × 2)
+
+---
+
+##### `find_pos(mates)`
+Finds the longest interval among mate variants (overlapping SVs representing the same deletion).
+
+**Parameters:**
+- `mates` (list/tuple): List of `(start, end)` coordinate pairs
+
+**Returns:**
+- `tuple`: `(longest_start, longest_end)` - the longest interval
+
+---
+
+##### `find_gt(gt_list)`
+Determines the most common genotype across all samples for a variant.
+
+**Parameters:**
+- `gt_list` (list): List of genotype strings
+
+**Returns:**
+- `str`: Most frequently occurring genotype
+
+---
+
+##### `write2vcf(output_file, converted_dict)`
+Writes unique SVs to a VCF format file with proper headers.
+
+**Parameters:**
+- `output_file` (str): Output VCF filename
+- `converted_dict` (list): List of SV dictionaries (one per variant)
+
+**Output Format:** VCF 4.2 with INFO fields:
+- `AF`: Allele frequency
+- `AC`: Allele count
+- `AC_het`: Heterozygous count
+- `AC_hom`: Homozygous count
+- `END`: Deletion end position
+- `SVTYPE`: Structural variant type
+- `SVLEN`: Structural variant length
+
+---
+
+##### `write2bed(output_file, inner_dict)`
+Writes SV data to a TSV (tab-separated) file.
+
+**Parameters:**
+- `output_file` (str): Output TSV filename
+- `inner_dict` (list): List of SV dictionaries
+
+---
+
+##### `write2bed2(output_file, converted_dict)`
+Writes a simplified BED-format file with core SV information.
+
+**Parameters:**
+- `output_file` (str): Output BED filename
+- `converted_dict` (list): List of SV dictionaries
+
+**BED Format Columns:**
+1. CHROM: Chromosome
+2. POS: Start position
+3. end: End position
+4. SVTYPE: Type of SV (e.g., DEL)
+5. af: Allele frequency
+
+---
+
+#### Usage
+
+##### Basic Usage
+
+1. **Prepare your VCF files** in a directory (e.g., `vcf_files/`)
+2. **Edit the main() function** to configure:
+   - `folder`: Directory containing VCF files
+   - `n`: Total number of samples in your cohort
+   - `output_*`: Output file names
+
+3. **Run the script**:
+```bash
+python cohort_vcf_SV.py
+```
+
+##### Example Configuration
+
+```python
+def main():
+    folder = "vcf_files"           # Directory with VCF files
+    output_vcf = "cohort_del.vcf"  # Output VCF file
+    output_bed0 = "cohort_del.bed" # Output TSV file
+    output_bed = "cohort_del_all.bed"  # Output BED file
+    
+    my_dict = generate_cohort(folder)
+    n = 1000  # Set to your actual cohort size
+    unique_dict = unique_sv(my_dict, n)
+    
+    # Convert to records format for writing
+    unique_normal_dict = {key: value for key, value in unique_dict.items()}
+    unique_df = pd.DataFrame.from_dict(unique_normal_dict, orient='index')
+    unique_converted_dict = unique_df.to_dict(orient='records')
+    
+    # Generate output files
+    write2vcf(output_vcf, unique_converted_dict)
+    write2bed(output_bed0, unique_converted_dict)
+    write2bed2(output_bed, unique_converted_dict)
+```
+
+---
+
+## Output Files
+
+The script generates three output files:
+
+### 1. **cohort_del.vcf**
+Standard VCF 4.2 format containing unique deletions with population statistics.
+
+**Fields:**
+- CHROM, POS, ID, REF, ALT, QUAL, FILTER
+- INFO: AF, AC, AC_het, AC_hom, END, SVTYPE, SVLEN
+- FORMAT: GT, FT, GQ
+- Sample genotypes
+
+### 2. **cohort_del.bed** (TSV format)
+Tab-separated file with all SV information converted from dictionaries.
+
+### 3. **cohort_del_all.bed** (BED format)
+Simplified BED format with essential SV coordinates and frequency:
+```
+chr1    1000    5000    DEL     0.025
+chr2    20000   22000   DEL     0.010
+...
+```
+
+---
+
+## Input Requirements
+
+### VCF File Format
+
+Input VCF files should contain:
+- **FILTER field**: Marked as PASS for variants to include
+- **SVTYPE**: Must be "DEL" for deletion
+- **SVLEN**: Deletion length (must be ≥50 bp)
+- **END**: Deletion end position
+- **Genotype data**: GT field with call data
+
+**Supported VCF versions**: 4.1+
+
+---
+
+## Algorithm Summary
+
+### Workflow
+
+1. **Read VCFs**: Recursively scan directory for `.vcf` files
+2. **Filter variants**: Keep only PASS, DEL, ≥50 bp
+3. **Cluster deletions**: Group overlapping variants (≥80% overlap) per chromosome
+4. **Calculate statistics**:
+   - Count heterozygous and homozygous carriers
+   - Compute allele counts and frequencies
+   - Identify longest interval among mates
+5. **Output results**: Write to VCF, BED, and TSV formats
+
+### Overlap Threshold
+
+Two intervals are considered the same deletion if:
+- Overlap percentage for interval 1 ≥ 80%, OR
+- Overlap percentage for interval 2 ≥ 80%
+
+This allows flexible clustering of imprecise SV boundaries.
+
+---
+
+## Notes and Limitations
+
+- **Maximum SV size**: 1,000,000 bp (filtered in `unique_sv()`)
+- **Chromosome handling**: Currently supports hg38-style chromosome names (chr1-chr22, chrX, chrY)
+- **Genotype parsing**: Expects standard VCF genotype format (0/0, 0/1, 1/1, etc.)
+- **Sample count**: Must specify correct `n` value for accurate AF calculation
+- **Performance**: Runtime scales with number of VCF files and total variants
+
+---
+
+## Installation & Setup
+
+1. Clone the repository
+2. Install Python 3.6+
+3. Install dependencies:
+   ```bash
+   pip install pysam scikit-allel pandas numpy
+   ```
+4. Organize your VCF files in a single directory
+5. Update file paths in `main()` function
+6. Run: `python cohort_SV/cohort_vcf_SV.py`
+
+---
+
+## Author
+
+**Rafiga Masmaliyeva**  
+Twitter/GitHub: @RafigaM  
+Email: rmasmaliyeva@gmail.com  
+Created: November 19, 2024

From 77be3635861d0ea9ea9ee8ce70aa0bbff3195092 Mon Sep 17 00:00:00 2001
From: "rafiga.masmaliyeva" <rmasmaliyeva@gmail.com>
Date: Tue, 27 Jan 2026 16:07:25 +0100
Subject: [PATCH 2/3] Update README.md to include project overview, features,
 usage instructions, and output file details

---
 .idea/.gitignore | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .idea/.gitignore

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..ab1f416
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,10 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Ignored default folder with query files
+/queries/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/

From 162a34c6e106e6afaf23d67616ad6113a46bdfcc Mon Sep 17 00:00:00 2001
From: "rafiga.masmaliyeva" <rmasmaliyeva@gmail.com>
Date: Tue, 27 Jan 2026 16:44:37 +0100
Subject: [PATCH 3/3] Enhance cohort_vcf_SV.py with detailed function
 docstrings and comments for clarity

---
 cohort_SV/cohort_vcf_SV.py | 40 +++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/cohort_SV/cohort_vcf_SV.py b/cohort_SV/cohort_vcf_SV.py
index 6e7af0e..2b6da12 100644
--- a/cohort_SV/cohort_vcf_SV.py
+++ b/cohort_SV/cohort_vcf_SV.py
@@ -25,24 +25,25 @@
 
 
 def convert_array_to_scalar(arr):
+    # Convert a diploid GT array into a "0/1" style string.
     gt = str(arr[0]) + "/" + str(arr[1])
     return gt
 
 
 def get_samples(input):
-    #print(input)
+    # Collect all VCF file paths under the input directory.
     samples = []
     for subdir, dirs, files in os.walk(input):
         for file in files:
             filepath = subdir + os.sep + file
             if filepath.endswith(".vcf"):
-                #print(filepath)
                 samples.append(filepath)
 
     return samples
 
 
 def find_unique_thresholds_with_indexes(thresholds, chr):
+    # Merge near-overlapping (>=80% overlap) intervals and keep mates per merged interval.
     unique_thresholds = []
     threshold_map = {}
 
@@ -57,7 +58,7 @@ def has_significant_overlap(t1, t2):
         percent_overlap_1 = overlap / (high1 - low1)
         percent_overlap_2 = overlap / (high2 - low2)
 
-        # If either percentage is >= 20%, return True
+        # If either percentage is >= 80%, consider as the same SV event.
         return percent_overlap_1 >= 0.8 or percent_overlap_2 >= 0.8
 
     for index, threshold in enumerate(thresholds):
@@ -80,6 +81,21 @@ def has_significant_overlap(t1, t2):
 
 
 def generate_cohort(input):
+    """Parse VCFs under a directory and collect passing DEL calls.
+
+    This function scans the input directory recursively for `.vcf` files,
+    reads each VCF with scikit-allel, and aggregates passing deletion (DEL)
+    records longer than 50 bp into a single dictionary keyed by
+    "{CHROM}:{POS}:{END}".
+
+    Args:
+        input (str): Directory containing VCF files (recursively searched).
+
+    Returns:
+        collections.defaultdict: Mapping of variant keys to a dict with fields:
+        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, sample (GTs), num (VCF name).
+    """
+    # Parse all VCFs and collect passing DEL calls into a unified dictionary.
     samples = get_samples(input)
     my_dict = defaultdict(
         lambda: {"CHROM": [], "POS": [], "ID": [], "REF": [], "ALT": [], "QUAL": [], "FILTER": [], "INFO": [], "sample": [], "num": []})
@@ -89,27 +105,20 @@ def generate_cohort(input):
             l = len(callset['variants/POS'])
             for record in range(l):
                 FT = callset['variants/FILTER_PASS'][record]
-                #imprecise = callset['variants/IMPRECISE'][record]
                 if FT == True:
-                    #if imprecise == False:
                     pos = callset['variants/POS'][record]
                     sv_len = callset['variants/SVLEN'][record]
                     end = pos + abs(sv_len)
-                    #end = callset['variants/END'][record]
-                    #sv_len = abs(int(pos) - int(end))  ###
                     if len(callset) > 0:
-                        #sv_len = callset['variants/SVLEN'][record]
                         SV_type = str(callset['variants/SVTYPE'][record])
                         if SV_type == "DEL" and abs(sv_len) > 50:
                             chrom = callset['variants/CHROM'][record]
-                            #pos = callset['variants/POS'][record]
                             ref = callset['variants/REF'][record]
                             alt = str(callset['variants/ALT'][record][0])
-                            #end = callset['variants/END'][record]
                             variant_key = f"{chrom}:{pos}:{end}"
                             id = chrom + "_" + str(pos) + "_" + str(end)
                             qual = callset['variants/QUAL'][record]
-                            # Dict to store all varinats
+                            # Store per-variant details and sample GT.
                             my_dict[variant_key]['CHROM'] = chrom
                             my_dict[variant_key]['POS'] = pos
                             my_dict[variant_key]['ID'] = id
@@ -128,6 +137,7 @@ def generate_cohort(input):
     return my_dict
 
 def write2vcf(output_file, converted_dict):
+    # Write a minimal cohort VCF containing aggregated SV records.
     vcf_header = [
         "##fileformat=VCFv4.2",
         "##source=custom_script",
@@ -166,12 +176,14 @@ def write2vcf(output_file, converted_dict):
 
 
 def write2bed(output_file, inner_dict):
+    # Dump all records as BED-like TSV without header.
     print(output_file)
     my_df = pd.DataFrame.from_dict(inner_dict)
     my_df.to_csv(output_file, sep='\t', index=False, header=False)
 
 
 def find_pos(mates):
+    # Pick the widest mate interval to represent the merged SV.
     mates = list(mates)
     longest_threshold = 0
     longest_pair = None
@@ -184,6 +196,7 @@ def find_pos(mates):
     return(longest_pair)
 
 def find_gt(gt_list):
+    # Return the most common genotype among samples.
     counts = Counter(gt_list)
     most_popular = counts.most_common(1)[0]
 
@@ -191,6 +204,7 @@ def find_gt(gt_list):
 
 
 def write2bed2(output_file, converted_dict):
+    # Write a compact BED with CHROM, POS, END, SVTYPE, AF.
     with open(output_file, 'w') as file:
         for inner_dict in converted_dict:
             vcf_row = [
@@ -204,6 +218,7 @@ def write2bed2(output_file, converted_dict):
 
 
 def unique_sv(my_dict, n):
+    # Merge overlapping SVs across samples and compute cohort-level stats.
     n_samples = n
     dict_unique = defaultdict(
         lambda: {"CHROM": [], "POS": [], "end": [], "ID": [], "SVTYPE": [], "SVLEN": [], "FILTER": [], "QUAL": [],
@@ -239,6 +254,7 @@ def unique_sv(my_dict, n):
                 l = len(mychr[i][1])
                 for ii in range(l):
                     index = mychr[i][1][ii][0]
+                    # Aggregate GTs and sample names from all mate SVs.
                     dict_unique[key]['GT'].append(my_dict[index]['sample'])
                     dict_unique[key]['sample'].append(my_dict[index]['num'])
                     dict_unique[key]['mates'].append(mychr[i][1][ii][1])
@@ -258,6 +274,7 @@ def unique_sv(my_dict, n):
                         continue
                     dict_unique[key]['ac_het'] = n_het
                 dict_unique[key]['ac_hom'] = n_hom
+                # Allele frequency: (hets + 2*homs) / (2 * n_samples).
                 af = (n_het + (2 * n_hom)) / (n_samples * 2)
                 dict_unique[key]['af'] = af
                 dict_unique[key]['ac'] = (n_het + (2 * n_hom))
@@ -272,6 +289,7 @@ def unique_sv(my_dict, n):
 
 
 def main():
+    # Input/output configuration.
     folder = "vcf_files"
     output_vcf = "cohort_del.vcf"
     output_bed0 = "cohort_del.bed"