From 1867e1fb6fd2169e645375fa4d84b45d89a46d5d Mon Sep 17 00:00:00 2001
From: azimgivron <azimgivron@gmail.com>
Date: Fri, 7 Feb 2025 09:12:57 +0100
Subject: [PATCH 1/2] make a script that outputs a directly usable binary file

---
 README.md                   |  49 +++++++++----
 genemap2_parser/__init__.py |   0
 genemap2_parser/script.py   | 137 ++++++++++++++++++++++++++++++++++++
 parseGeneMap2.py            |  91 ------------------------
 pyproject.toml              |  21 ++++++
 5 files changed, 195 insertions(+), 103 deletions(-)
 create mode 100644 genemap2_parser/__init__.py
 create mode 100644 genemap2_parser/script.py
 delete mode 100644 parseGeneMap2.py
 create mode 100644 pyproject.toml

diff --git a/README.md b/README.md
index eddbb80..52b2dc3 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,45 @@
-GeneMap2.txt Parser:
-====================
+# GeneMap2-Parser
 
-This script demonstrates how to parse the genemap2.txt file that can be downloaded from
-[OMIM](https://omim.org/). Note that while it parses out every field in the file it 
-doesn't output anything.
+## Overview
 
-The genemap2.txt file can downloaded from [OMIM Downloads](https://omim.org/downloads)
-(registration required).
+This script parses the `genemap2.txt` file from [OMIM](https://omim.org/) and extracts gene-related data, including phenotypes and inheritance information. The parsed output is serialized into a `.pickle` file.
 
-Sample Use:
------------
+## Installation
 
-`cat genemap2.txt | ./parseGeneMap2.py`
+```shell
+pip install git+https://github.com/OMIM-org/genemap2-parser.git
+```
 
-Userful Links:
---------------
+
+## Usage
+
+```bash
+parseGeneMap2 -i path/to/genemap2.txt -o path/to/output/
+```
+
+### Arguments
+
+- `-i, --input_file`  (Required) Path to `genemap2.txt`
+- `-o, --output_path` (Optional) Output directory (default: current directory)
+
+### Example:
+
+```bash
+parseGeneMap2 -i genemap2.txt -o output_dir/
+```
+
+## Output
+
+A `output.pickle` file containing extracted gene and phenotype data is created in the specified directory. To read:
+
+```python
+import pickle
+with open("output_dir/output.pickle", "rb") as f:
+    data = pickle.load(f)
+print(data[:5])  # First 5 entries
+```
+
+## Useful Links
 
 - [OMIM](https://omim.org/)
 - [OMIM Downloads](https://omim.org/downloads)
diff --git a/genemap2_parser/__init__.py b/genemap2_parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/genemap2_parser/script.py b/genemap2_parser/script.py
new file mode 100644
index 0000000..d586860
--- /dev/null
+++ b/genemap2_parser/script.py
@@ -0,0 +1,137 @@
+import re
+import argparse
+import multiprocessing
+import pickle
+from pathlib import Path
+from typing import Optional, Dict, List, Any
+
+def process_line(line: str) -> Optional[Dict[str, Any]]:
+    """
+    Parses one tab-separated genemap2 record; comments, blank lines, short records and records without phenotypes yield None.
+    :param line: A single line of text from the file
+    :return: A dictionary with the 13 gene fields plus a 'phenotypes' list, or None if the line is skipped
+    :raises ValueError: If the line has more than 14 tab-separated columns
+    """
+    if line.startswith('#') or not line.strip():
+        return None
+    # Remove only the line terminator: strip() would also eat the tabs of empty trailing columns (e.g. no mouse gene).
+    value_list = line.rstrip('\r\n').split('\t')
+    if len(value_list) < 14:
+        return None
+    
+    try:
+        chromosome, genomic_position_start, genomic_position_end, cyto_location, computed_cyto_location, mim_number, \
+        gene_symbols, gene_name, approved_gene_symbol, entrez_gene_id, ensembl_gene_id, comments, phenotype_string, mouse = value_list
+    except ValueError:
+        raise ValueError(f"Unexpected number of columns in line: {line}")
+    
+    if not phenotype_string:
+        return None
+    # Each ';'-separated entry is tried as 'name, MIM (key)[, inheritance]' first, then as 'name (key)[, inheritance]'.
+    phenotypes = []
+    for phenotype in phenotype_string.split(';'):
+        phenotype = phenotype.strip()
+        matcher = re.match(r'^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$', phenotype)
+        
+        if matcher:
+            phenotype_name = matcher.group(1)
+            phenotype_mim_number = matcher.group(2)
+            phenotype_mapping_key = matcher.group(3)
+            inheritances = matcher.group(5).split(', ') if matcher.group(5) else []
+        else:
+            matcher = re.match(r'^(.*)\((\d)\)(|, (.*))$', phenotype)
+            if matcher:
+                phenotype_name = matcher.group(1).strip()  # the short form captures the space before "(key)"
+                phenotype_mim_number = None
+                phenotype_mapping_key = matcher.group(2)
+                inheritances = matcher.group(4).split(', ') if matcher.group(4) else []
+            else:
+                continue
+        
+        phenotypes.append({
+            'name': phenotype_name,
+            'mim_number': phenotype_mim_number,
+            'mapping_key': phenotype_mapping_key,
+            'inheritance': inheritances
+        })
+    
+    return {
+        'chromosome': chromosome,
+        'genomic_position_start': genomic_position_start,
+        'genomic_position_end': genomic_position_end,
+        'cyto_location': cyto_location,
+        'computed_cyto_location': computed_cyto_location,
+        'mim_number': mim_number,
+        'gene_symbols': gene_symbols,
+        'gene_name': gene_name,
+        'approved_gene_symbol': approved_gene_symbol,
+        'entrez_gene_id': entrez_gene_id,
+        'ensembl_gene_id': ensembl_gene_id,
+        'comments': comments,
+        'mouse': mouse,
+        'phenotypes': phenotypes
+    }
+
+def parse_genemap2(filename: Path) -> List[Dict[str, Any]]:
+    """
+    Parses the genemap2 file: reads all lines into memory, then maps process_line over them with a multiprocessing.Pool.
+    :param filename: Path to the genemap2.txt file
+    :return: A list of the dictionaries returned by process_line, with falsy (None) results removed
+    :raises FileNotFoundError, IOError: Re-raised with the file name in the message if the file is missing or unreadable
+    """
+    try:
+        with filename.open('r', encoding='utf-8') as file:
+            lines = file.readlines()
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Input file '{filename}' not found.")
+    except IOError as e:
+        raise IOError(f"Error reading file '{filename}': {e}")
+    
+    with multiprocessing.Pool() as pool:
+        parsed_data = pool.map(process_line, lines)
+    
+    return [entry for entry in parsed_data if entry]
+
+def main():
+    """Command-line entry point: validates the input file and output directory, parses the file and pickles the result to output.pickle."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "This is a simple script to parse the genemap2.txt file that "
+            "can be downloaded from https://omim.org/\n\n"
+            "The file can be downloaded from https://omim.org/downloads "
+            "(registration required)."
+        )
+    )
+    parser.add_argument("--input_file", "-i", type=str, required=True, help="Path to the genemap2.txt file.")
+    parser.add_argument("--output_path", "-o", type=str, default=".", help="Path to write the serialized parsed data (default: %(default)s).")
+    args = parser.parse_args()
+    
+    input_file = Path(args.input_file)
+    output_path = Path(args.output_path)
+    
+    if not input_file.exists():
+        raise FileNotFoundError(f"Input file '{input_file}' does not exist.")
+    
+    if not input_file.is_file():
+        raise ValueError(f"Specified input path '{input_file}' is not a valid file.")
+    
+    if input_file.suffix != ".txt":
+        raise ValueError(f"Expected a .txt file but got '{input_file.suffix}' instead.")
+    
+    if not output_path.exists():
+        raise FileNotFoundError(f"Output path '{output_path}' does not exist.")
+    
+    if not output_path.is_dir():
+        raise ValueError(f"Specified output path '{output_path}' is not a directory.")
+    
+    parsed_data = parse_genemap2(input_file)
+    
+    try:
+        with (output_path / "output.pickle").open('wb') as file:
+            pickle.dump(parsed_data, file)
+    except IOError as e:
+        raise IOError(f"Error writing output file: {e}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/parseGeneMap2.py b/parseGeneMap2.py
deleted file mode 100644
index 307b33c..0000000
--- a/parseGeneMap2.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
-#
-# This is a simple script to parse the genemap2.txt file that
-# can be downloaded from https://omim.org/
-#
-# The file can downloaded from https://omim.org/downloads
-# (registration required).
-#
-
-
-# Imports
-import sys
-import re
-
-
-# Read from stdin
-for line in sys.stdin:
-
-    # Skip comments
-    if line.startswith('#'):
-        continue
-
-    # Strip trailing new line
-    line = line.strip('\n')
-
-    # Get the values
-    valueList = line.split('\t')
-
-    # Get the fields
-    chromosome = valueList[0]
-    genomicPositionStart = valueList[1]
-    genomicPositionEnd = valueList[2]
-    cytoLocation = valueList[3]
-    computedCytoLocation = valueList[4]
-    mimNumber = valueList[5]
-    geneSymbols = valueList[6]
-    geneName = valueList[7]
-    approvedGeneSymbol = valueList[8]
-    entrezGeneID = valueList[9]
-    ensemblGeneID = valueList[10]
-    comments = valueList[11]
-    phenotypeString = valueList[12]
-    mouse = valueList[13]
-
-    # Skip empty phenotypes
-    if not phenotypeString:
-        continue
-
-    # Parse the phenotypes
-    for phenotype in phenotypeString.split(';'):
-
-        # Clean the phenotype
-        phenotype = phenotype.strip()
-
-        # Long phenotype
-        matcher = re.match(r'^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$', phenotype)
-        if matcher:
-
-            # Get the fields
-            phenotype = matcher.group(1)
-            phenotypeMimNumber = matcher.group(2)
-            phenotypeMappingKey = matcher.group(3)
-            inheritances = matcher.group(5)
-
-            # Get the inheritances, may or may not be there
-            if inheritances:
-                for inheritance in inheritances.split(','):
-                    inheritance = inheritance.strip()
-
-        # Short phenotype
-        else:
-
-            # Short phenotype
-            matcher = re.match(r'^(.*)\((\d)\)(|, (.*))$', phenotype)
-            if matcher:
-
-                # Get the fields
-                phenotype = matcher.group(1)
-                phenotypeMappingKey = matcher.group(2)
-                inheritances = matcher.group(3)
-
-                # Get the inheritances, may or may not be there
-                if inheritances:
-                    for inheritance in inheritances.split(','):
-                        inheritance = inheritance.strip()
-
-
-
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..0726b54
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "genemap2_parser"
+readme = "README.md"
+description = "This script parses the `genemap2.txt` file from OMIM and extracts gene-related data"
+classifiers = ["Programming Language :: Python :: 3.13"]
+dynamic = ["version", "dependencies"]
+
+[project.scripts]
+parseGeneMap2 = "genemap2_parser.script:main"
+
+[tool.setuptools]
+packages = [
+    "genemap2_parser",
+]
+
+[tool.pytest.ini_options]
+pythonpath = ["."]
\ No newline at end of file

From dd83804932f28b2bcb089df2856848aaf7f23492 Mon Sep 17 00:00:00 2001
From: azimgivron <azimgivron@gmail.com>
Date: Fri, 7 Feb 2025 09:20:56 +0100
Subject: [PATCH 2/2] add contributors

---
 README.md | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 52b2dc3..4465f48 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,37 @@
-# GeneMap2-Parser
+# GeneMap2-Parser ⚡  
 
-## Overview
+## Overview  
 
-This script parses the `genemap2.txt` file from [OMIM](https://omim.org/) and extracts gene-related data, including phenotypes and inheritance information. The parsed output is serialized into a `.pickle` file.
+This script parses the `genemap2.txt` file from [OMIM](https://omim.org/) and extracts gene-related data, including phenotypes and inheritance information. The parsed output is serialized into a `.pickle` file for efficient reuse.  
 
-## Installation
+## Installation  
 
 ```shell
 pip install git+https://github.com/OMIM-org/genemap2-parser.git
 ```
 
-
-## Usage
+## Usage  
 
 ```bash
 parseGeneMap2 -i path/to/genemap2.txt -o path/to/output/
 ```
 
-### Arguments
+### Arguments  
 
-- `-i, --input_file`  (Required) Path to `genemap2.txt`
-- `-o, --output_path` (Optional) Output directory (default: current directory)
+- `-i, --input_file`  (Required) Path to `genemap2.txt`  
+- `-o, --output_path` (Optional) Output directory (default: current directory)  
 
-### Example:
+### Example  
 
 ```bash
 parseGeneMap2 -i genemap2.txt -o output_dir/
 ```
 
-## Output
+## Output 💾  
+
+An `output.pickle` file containing extracted gene and phenotype data is created in the specified directory.  
 
-A `output.pickle` file containing extracted gene and phenotype data is created in the specified directory. To read:
+To read the output:  
 
 ```python
 import pickle
@@ -39,7 +40,11 @@ with open("output_dir/output.pickle", "rb") as f:
 print(data[:5])  # First 5 entries
 ```
 
-## Useful Links
+## Useful Links  
+
+- [OMIM](https://omim.org/)  
+- [OMIM Downloads](https://omim.org/downloads)  
+
+## Contributors  
 
-- [OMIM](https://omim.org/)
-- [OMIM Downloads](https://omim.org/downloads)
+- **Givron Azim** 🚀  
\ No newline at end of file