From 1867e1fb6fd2169e645375fa4d84b45d89a46d5d Mon Sep 17 00:00:00 2001 From: azimgivron Date: Fri, 7 Feb 2025 09:12:57 +0100 Subject: [PATCH 1/2] make a script that outputs a binary directly exploitable --- README.md | 49 +++++++++---- genemap2_parser/__init__.py | 0 genemap2_parser/script.py | 137 ++++++++++++++++++++++++++++++++++++ parseGeneMap2.py | 91 ------------------------ pyproject.toml | 21 ++++++ 5 files changed, 195 insertions(+), 103 deletions(-) create mode 100644 genemap2_parser/__init__.py create mode 100644 genemap2_parser/script.py delete mode 100644 parseGeneMap2.py create mode 100644 pyproject.toml diff --git a/README.md b/README.md index eddbb80..52b2dc3 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,45 @@ -GeneMap2.txt Parser: -==================== +# GeneMap2-Parser -This script demonstrates how to parse the genemap2.txt file that can be downloaded from -[OMIM](https://omim.org/). Note that while it parses out every field in the file it -doesn't output anything. +## Overview -The genemap2.txt file can downloaded from [OMIM Downloads](https://omim.org/downloads) -(registration required). +This script parses the `genemap2.txt` file from [OMIM](https://omim.org/) and extracts gene-related data, including phenotypes and inheritance information. The parsed output is serialized into a `.pickle` file. -Sample Use: ------------ +## Installation -`cat genemap2.txt | ./parseGeneMap2.py` +```shell +pip install git+https://github.com/OMIM-org/genemap2-parser.git +``` -Userful Links: --------------- + +## Usage + +```bash +parseGeneMap2 -i path/to/genemap2.txt -o path/to/output/ +``` + +### Arguments + +- `-i, --input_file` (Required) Path to `genemap2.txt` +- `-o, --output_path` (Optional) Output directory (default: current directory) + +### Example: + +```bash +parseGeneMap2 -i genemap2.txt -o output_dir/ +``` + +## Output + +A `output.pickle` file containing extracted gene and phenotype data is created in the specified directory. 
To read:
+
+```python
+import pickle
+with open("output_dir/output.pickle", "rb") as f:
+    data = pickle.load(f)
+print(data[:5]) # First 5 entries
+```
+
+## Useful Links
 
 - [OMIM](https://omim.org/)
 - [OMIM Downloads](https://omim.org/downloads)
diff --git a/genemap2_parser/__init__.py b/genemap2_parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/genemap2_parser/script.py b/genemap2_parser/script.py
new file mode 100644
index 0000000..d586860
--- /dev/null
+++ b/genemap2_parser/script.py
@@ -0,0 +1,156 @@
+import re
+import argparse
+import multiprocessing
+import pickle
+from pathlib import Path
+from typing import Optional, Dict, List, Any
+
+# Number of tab-separated columns in a genemap2.txt data row.
+EXPECTED_COLUMNS = 14
+
+# Phenotype with its own MIM number: "<name>, <MIM number> (<key>)[, <inheritances>]".
+LONG_PHENOTYPE_RE = re.compile(r'^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$')
+# Phenotype without a MIM number: "<name> (<key>)[, <inheritances>]".
+SHORT_PHENOTYPE_RE = re.compile(r'^(.*)\((\d)\)(|, (.*))$')
+
+
+def process_line(line: str) -> Optional[Dict[str, Any]]:
+    """
+    Processes a single line from the genemap2 file.
+
+    :param line: A single line of text from the file
+    :return: A dictionary containing extracted data, or None for comment lines,
+        blank lines, rows with fewer than 14 columns and rows without phenotypes
+    :raises ValueError: If the line has more than 14 columns
+    """
+    if line.startswith('#') or not line.strip():
+        return None
+
+    # Remove only the line terminator. A bare strip() also removes trailing tabs,
+    # so every row whose last column is empty (no mouse gene) would fall below
+    # 14 fields and be dropped even though it carries phenotypes.
+    value_list = line.rstrip('\r\n').split('\t')
+    if len(value_list) < EXPECTED_COLUMNS:
+        return None
+    if len(value_list) > EXPECTED_COLUMNS:
+        raise ValueError(f"Unexpected number of columns in line: {line}")
+
+    chromosome, genomic_position_start, genomic_position_end, cyto_location, computed_cyto_location, mim_number, \
+        gene_symbols, gene_name, approved_gene_symbol, entrez_gene_id, ensembl_gene_id, comments, phenotype_string, mouse = value_list
+
+    if not phenotype_string:
+        return None
+
+    phenotypes = []
+    for phenotype in phenotype_string.split(';'):
+        phenotype = phenotype.strip()
+
+        matcher = LONG_PHENOTYPE_RE.match(phenotype)
+        if matcher:
+            phenotype_name = matcher.group(1)
+            phenotype_mim_number = matcher.group(2)
+            phenotype_mapping_key = matcher.group(3)
+            inheritance_string = matcher.group(5)
+        else:
+            matcher = SHORT_PHENOTYPE_RE.match(phenotype)
+            if not matcher:
+                # Not a recognisable phenotype entry; skip it.
+                continue
+            # The short pattern also captures the blank before "(<key>)".
+            phenotype_name = matcher.group(1).strip()
+            phenotype_mim_number = None
+            phenotype_mapping_key = matcher.group(2)
+            inheritance_string = matcher.group(4)
+
+        phenotypes.append({
+            'name': phenotype_name,
+            'mim_number': phenotype_mim_number,
+            'mapping_key': phenotype_mapping_key,
+            'inheritance': inheritance_string.split(', ') if inheritance_string else []
+        })
+
+    return {
+        'chromosome': chromosome,
+        'genomic_position_start': genomic_position_start,
+        'genomic_position_end': genomic_position_end,
+        'cyto_location': cyto_location,
+        'computed_cyto_location': computed_cyto_location,
+        'mim_number': mim_number,
+        'gene_symbols': gene_symbols,
+        'gene_name': gene_name,
+        'approved_gene_symbol': approved_gene_symbol,
+        'entrez_gene_id': entrez_gene_id,
+        'ensembl_gene_id': ensembl_gene_id,
+        'comments': comments,
+        'mouse': mouse,
+        'phenotypes': phenotypes
+    }
+
+
+def parse_genemap2(filename: Path) -> List[Dict[str, Any]]:
+    """
+    Parses the genemap2 file and extracts relevant data.
+
+    :param filename: Path to the genemap2.txt file
+    :return: A list of dictionaries containing parsed data
+    :raises FileNotFoundError: If the file does not exist
+    :raises IOError: If the file cannot be read
+    """
+    try:
+        with filename.open('r', encoding='utf-8') as file:
+            lines = file.readlines()
+    except FileNotFoundError as e:
+        raise FileNotFoundError(f"Input file '{filename}' not found.") from e
+    except IOError as e:
+        raise IOError(f"Error reading file '{filename}': {e}") from e
+
+    # Pool.map returns results in input order, so file order is preserved.
+    with multiprocessing.Pool() as pool:
+        parsed_data = pool.map(process_line, lines)
+
+    return [entry for entry in parsed_data if entry]
+
+
+def main():
+    """Parse the command line, run the parser and write output.pickle."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "This is a simple script to parse the genemap2.txt file that "
+            "can be downloaded from https://omim.org/\n\n"
+            "The file can be downloaded from https://omim.org/downloads "
+            "(registration required)."
+        )
+    )
+    parser.add_argument("--input_file", "-i", type=str, required=True, help="Path to the genemap2.txt file.")
+    parser.add_argument("--output_path", "-o", type=str, default=".", help="Path to write the serialized parsed data (default: %(default)s).")
+    args = parser.parse_args()
+
+    input_file = Path(args.input_file)
+    output_path = Path(args.output_path)
+
+    if not input_file.exists():
+        raise FileNotFoundError(f"Input file '{input_file}' does not exist.")
+
+    if not input_file.is_file():
+        raise ValueError(f"Specified input path '{input_file}' is not a valid file.")
+
+    if input_file.suffix != ".txt":
+        raise ValueError(f"Expected a .txt file but got '{input_file.suffix}' instead.")
+
+    if not output_path.exists():
+        raise FileNotFoundError(f"Output path '{output_path}' does not exist.")
+
+    if not output_path.is_dir():
+        raise ValueError(f"Specified output path '{output_path}' is not a directory.")
+
+    parsed_data = parse_genemap2(input_file)
+
+    try:
+        with (output_path / "output.pickle").open('wb') as file:
+            pickle.dump(parsed_data, file)
+    except IOError as e:
+        raise IOError(f"Error writing output file: {e}") from e
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/parseGeneMap2.py b/parseGeneMap2.py
deleted file mode 100644
index 307b33c..0000000
--- a/parseGeneMap2.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
-#
-# This is a simple script to parse the genemap2.txt file that
-# can be downloaded from https://omim.org/
-#
-# The file can downloaded from https://omim.org/downloads
-# (registration required).
-# - - -# Imports -import sys -import re - - -# Read from stdin -for line in sys.stdin: - - # Skip comments - if line.startswith('#'): - continue - - # Strip trailing new line - line = line.strip('\n') - - # Get the values - valueList = line.split('\t') - - # Get the fields - chromosome = valueList[0] - genomicPositionStart = valueList[1] - genomicPositionEnd = valueList[2] - cytoLocation = valueList[3] - computedCytoLocation = valueList[4] - mimNumber = valueList[5] - geneSymbols = valueList[6] - geneName = valueList[7] - approvedGeneSymbol = valueList[8] - entrezGeneID = valueList[9] - ensemblGeneID = valueList[10] - comments = valueList[11] - phenotypeString = valueList[12] - mouse = valueList[13] - - # Skip empty phenotypes - if not phenotypeString: - continue - - # Parse the phenotypes - for phenotype in phenotypeString.split(';'): - - # Clean the phenotype - phenotype = phenotype.strip() - - # Long phenotype - matcher = re.match(r'^(.*),\s(\d{6})\s\((\d)\)(|, (.*))$', phenotype) - if matcher: - - # Get the fields - phenotype = matcher.group(1) - phenotypeMimNumber = matcher.group(2) - phenotypeMappingKey = matcher.group(3) - inheritances = matcher.group(5) - - # Get the inheritances, may or may not be there - if inheritances: - for inheritance in inheritances.split(','): - inheritance = inheritance.strip() - - # Short phenotype - else: - - # Short phenotype - matcher = re.match(r'^(.*)\((\d)\)(|, (.*))$', phenotype) - if matcher: - - # Get the fields - phenotype = matcher.group(1) - phenotypeMappingKey = matcher.group(2) - inheritances = matcher.group(3) - - # Get the inheritances, may or may not be there - if inheritances: - for inheritance in inheritances.split(','): - inheritance = inheritance.strip() - - - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0726b54 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = 
"genemap2_parser" +readme = "README.md" +description = "This script parses the `genemap2.txt` file from OMIM and extracts gene-related data" +classifiers = ["Programming Language :: Python :: 3.13"] +dynamic = ["version", "dependencies"] + +[project.scripts] +parseGeneMap2 = "genemap2_parser.script:main" + +[tool.setuptools] +packages = [ + "genemap2_parser", +] + +[tool.pytest.ini_options] +pythonpath = ["."] \ No newline at end of file From dd83804932f28b2bcb089df2856848aaf7f23492 Mon Sep 17 00:00:00 2001 From: azimgivron Date: Fri, 7 Feb 2025 09:20:56 +0100 Subject: [PATCH 2/2] add contributors --- README.md | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 52b2dc3..4465f48 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,37 @@ -# GeneMap2-Parser +# GeneMap2-Parser ⚡ -## Overview +## Overview -This script parses the `genemap2.txt` file from [OMIM](https://omim.org/) and extracts gene-related data, including phenotypes and inheritance information. The parsed output is serialized into a `.pickle` file. +This script parses the `genemap2.txt` file from [OMIM](https://omim.org/) and extracts gene-related data, including phenotypes and inheritance information. The parsed output is serialized into a `.pickle` file for efficient reuse. 
-## Installation +## Installation ```shell pip install git+https://github.com/OMIM-org/genemap2-parser.git ``` - -## Usage +## Usage ```bash parseGeneMap2 -i path/to/genemap2.txt -o path/to/output/ ``` -### Arguments +### Arguments -- `-i, --input_file` (Required) Path to `genemap2.txt` -- `-o, --output_path` (Optional) Output directory (default: current directory) +- `-i, --input_file` (Required) Path to `genemap2.txt` +- `-o, --output_path` (Optional) Output directory (default: current directory) -### Example: +### Example ```bash parseGeneMap2 -i genemap2.txt -o output_dir/ ``` -## Output +## Output 💾 + +An `output.pickle` file containing extracted gene and phenotype data is created in the specified directory. -A `output.pickle` file containing extracted gene and phenotype data is created in the specified directory. To read: +To read the output: ```python import pickle @@ -39,7 +40,11 @@ with open("output_dir/output.pickle", "rb") as f: print(data[:5]) # First 5 entries ``` -## Useful Links +## Useful Links + +- [OMIM](https://omim.org/) +- [OMIM Downloads](https://omim.org/downloads) + +## Contributors -- [OMIM](https://omim.org/) -- [OMIM Downloads](https://omim.org/downloads) +- **Givron Azim** 🚀 \ No newline at end of file