Commit bda694c

Merge pull request #22 from CMA-Lab/refactor
Tested locally and it seems to check out! Moving to main branch.
2 parents 99019ef + 145f142 commit bda694c

35 files changed

Lines changed: 1994 additions & 1606 deletions

CHANGELOG.md

Lines changed: 58 additions & 0 deletions
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Calendar Versioning](https://calver.org/) with the format `MAJOR.YY.0W[_MINOR][-Modifier]`. The major version increases when the database schema changes. Minor tags are added for multiple releases in the same week, starting from `2` (the `1` is implicit). Modifiers are added for pre-releases (e.g. `beta` or `alpha`).
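The version format above can be checked mechanically. A minimal sketch (the regex and the helper name are ours, not part of Daedalus):

```python
import re

# MAJOR.YY.0W[_MINOR][-Modifier], e.g. "0.23.15-beta" or "1.23.07_2"
CALVER_RE = re.compile(
    r"^(?P<major>\d+)\.(?P<yy>\d{2})\.(?P<week>\d{2})"
    r"(?:_(?P<minor>\d+))?(?:-(?P<modifier>[A-Za-z]+))?$"
)

def parse_calver(version: str) -> dict:
    """Split a CalVer string into its named parts, or raise ValueError."""
    match = CALVER_RE.match(version)
    if match is None:
        raise ValueError(f"Not a valid CalVer string: {version!r}")
    return match.groupdict()
```

For example, `parse_calver("0.23.15-beta")` yields major `0`, year `23`, week `15`, no minor tag, and the `beta` modifier.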

## [0.23.15-beta] - First release

This is the first release of the database. The DB features data from 7 different databases, all joined up for ease of consumption. We include:

- [ENSEMBL](https://www.ensembl.org/index.html) gene IDs and information, making the backbone of the database IDs;
- [HGNC](https://www.genenames.org/) for up-to-date, official gene names and gene grouping;
- [IUPHAR](https://www.guidetopharmacology.org/) for target (in our case, transporters) and ligand (i.e. drugs/internal compounds) interactions, as well as gene grouping, ion channel conductances, and more;
- [COSMIC](https://cancer.sanger.ac.uk/cosmic) for mutational information;
- [SLC tables](http://slc.bioparadigms.org/) for solute carrier information, such as their class and carried solute;
- [TCDB](https://www.tcdb.org/) for transporter classification information.

We apply manual patches to the data where expert information is lacking from the above databases.

The database is released as a `.sqlite` file at each release.

I highlight the latest changes:

### Changes

- [a03ab0b] **Major refactoring of Daedalus**
  - The previous chain of `IF-ELSE` statements used to run or skip some parsers (for debugging purposes) was terrible. Now, a new class handles running them properly.
  - The `parsers.py` file was getting too long for comfort. It was broken up into chunks and ported to multiple files in `./parsers/`.
  - A new `./constants` module holds all of the constants that were strewn about, with the exception of some constants that are very parser-specific.
  - A lot of things were removed from the module's `__init__.py`, since they did not belong there.
  - The argparser was finally actually finished.
  - If the COSMIC username/password combo is not specified, the COSMIC data will not be downloaded (at the user's risk).
  - New CLI parameters `run` and `skip` allow easier selective running of the different parsers, so that we don't accidentally commit breaking changes anymore (a.k.a. `SKIP_ALL = True`).
  - We use `importlib.resources` everywhere now, without having to use wobbly relative paths. This should make us ready to convert to a proper package.
  - The `tests/` folder is now outside of `./daedalus/`. It is probably completely broken now, but it was useless anyway.
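The new runner class itself is not shown in this diff. A minimal sketch of the idea — replacing scattered `IF-ELSE` guards with declarative `run`/`skip` filters — where all names are ours, not the project's:

```python
from typing import Callable, Iterable, Optional

class ParserRunner:
    """Hold named parser callables and run them, honoring run/skip filters."""

    def __init__(self) -> None:
        self._parsers: "dict[str, Callable[[], None]]" = {}

    def register(self, name: str, parser: Callable[[], None]) -> None:
        self._parsers[name] = parser

    def run(
        self,
        run_only: Optional[Iterable[str]] = None,
        skip: Optional[Iterable[str]] = None,
    ) -> "list[str]":
        """Execute the selected parsers; return the names actually run."""
        wanted = set(run_only) if run_only is not None else None
        to_skip = set(skip) if skip is not None else set()
        executed = []
        for name, parser in self._parsers.items():
            # Skip anything not explicitly requested, or explicitly excluded
            if wanted is not None and name not in wanted:
                continue
            if name in to_skip:
                continue
            parser()
            executed.append(name)
        return executed
```

The point of the design is that debugging subsets of parsers becomes a CLI argument instead of a code edit that risks being committed.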
- [8f29fbe] **Many Daedalus logic changes**
  - Changed BioMart's `XML`s to be more efficient. This should reduce download times a bit.
  - Allowed BioMart to download column names too, making manual column names useless. The names that BioMart gives us are simply standardized back to the same format we have used until now.
    - This means that all of the column names around were updated to the new naming.
  - Changed the format for the BioMart data from `CSV` to `TSV`. It seems that the CSV parser does not escape commas in the data. How fun! This makes the TSV option the only feasible one.
  - Moved the `entrez` entry to the top of the BioMart list, so that the retriever has to download little data before crashing (easier debugging!).
  - Moved the logic for saving a pickle of the data to the `ResourceCache` class, from the bad hack in `make_database`.
  - Made the downloads single-threaded instead of multithreaded. Why were they multithreaded in the first place? I have no idea.
  - The parsers that fail are now skipped gracefully, before dumping all failures at once and aborting. This should make large-scale failures easier to debug, since the parsers do not depend on each other to run (they only write to the database; they cannot read from it).
  - Wrote comments here and there.
  - Added delays after the warnings when using `--overwrite` and `--regen-cache`, so that one can `CTRL-C` when mistakes are made.
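The CSV problem is easy to reproduce: if a field contains a comma and the upstream data is not quoted, column boundaries are lost, while a tab separator stays unambiguous. A toy illustration (the record below is made up, not real BioMart output):

```python
# Two-column record: a gene ID and a free-text description containing a comma.
csv_line = "ENSG00000000003.16,tetraspanin 6, a membrane protein"
tsv_line = "ENSG00000000003.16\ttetraspanin 6, a membrane protein"

# Naive comma splitting breaks the two fields into three pieces...
assert len(csv_line.split(",")) == 3
# ...while splitting on tabs keeps the description (commas and all) intact.
assert len(tsv_line.split("\t")) == 2
```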
- [960da8f] **Added a project changelog**
  - We will follow CalVer `MAJOR.YY.0W[_MINOR][-Modifier]` from now on.

src/db_rebuilder/daedalus/__init__.py

Lines changed: 2 additions & 13 deletions
```diff
@@ -1,21 +1,10 @@
 import logging
 from logging import StreamHandler
-from pathlib import Path

 from colorama import Back, Fore, Style

-OUT_ANCHOR: Path = Path("/app/out")
-
-__all__ = ["OUTANCHOR"]
-__version__ = "0.1.0"
-
-DB_PATH = OUT_ANCHOR / f"MTPDB_v{__version__}.sqlite"
-
-if DB_PATH.exists():
-    raise Exception(f"Target DB already exists @{DB_PATH}. Aborting")
-
-
-SCHEMA = "BEGIN;\n{}\nEND;".format(Path("/app/schema.sql").read_text())
+__all__ = ["DB_NAME", "SCHEMA"]
+__version__ = "0.23.15-beta"


 class ColorFormatter(logging.Formatter):
```
src/db_rebuilder/daedalus/constants/__init__.py

Lines changed: 66 additions & 0 deletions (new file)

```python
"""Constants that are used throughout the program"""

# Re-export constants, so they can all be accessed from here
from daedalus import __version__
from daedalus.constants.url_hardpoints import (
    BIOMART,
    BIOMART_XML_REQUESTS,
    COSMIC,
    HUGO,
    IUPHAR_COMPILED,
    IUPHAR_DB,
    SLC_TABLES,
    TCDB,
)

__all__ = [
    "BIOMART",
    "BIOMART_XML_REQUESTS",
    "TCDB",
    "COSMIC",
    "IUPHAR_DB",
    "IUPHAR_COMPILED",
    "HUGO",
    "SLC_TABLES",
    "DESCRIPTION",
    "NAME",
    "EPILOG",
    "DB_NAME",
    "CACHE_NAME",
    "THESAURUS_FILE",
]

## TODO: It could be beneficial to bundle all of these constants into
# just one box and re-export just that.

DESCRIPTION = """
>>> DAEDALUS <<<

This program builds the MTP-Db from information retrieved from online databases.
The rationale is that if the databases update, we also update accordingly.
We also add a pinch of manual curation to fill in the gaps of knowledge from the
online databases.

Some of the parsing steps from the remote databases to the local DB are
heuristic in nature, and therefore might give imperfect information.
Feel free to open issues on GitHub @ https://github.com/CMA-Lab/MTP-DB/issues
if you find any incorrect or missing information.
"""
"""A short description of Daedalus"""

NAME = "Daedalus, the MTP-Db rebuilder"
"""The name of the program, to be shown by Argparser"""

EPILOG = (
    "For more usage information, please refer to https://github.com/CMA-Lab/MTP-DB/"
)
"""Message shown by argparser at the bottom of the usage info"""

DB_NAME = f"MTPDB_v{__version__}.sqlite"
"""Name of the DB file to save as output"""

CACHE_NAME = "MTPDB_datacache.pickle"
"""Name of the cache file to use to stash the downloaded data"""

THESAURUS_FILE = "thesaurus.csv"
"""Name of the local thesaurus file"""
```
src/db_rebuilder/daedalus/constants/url_hardpoints.py

Lines changed: 29 additions & 51 deletions

```diff
@@ -1,80 +1,56 @@
 BIOMART = "http://www.ensembl.org/biomart/martservice"
 """The Url used by Biomart to accept requests"""
+
 BIOMART_XML_REQUESTS = {
-    "IDs+desc": {
-        "query": """<?xml version="1.0" encoding="UTF-8"?>
+    "entrez": """<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE Query>
-<Query virtualSchemaName = "default" formatter = "CSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
+<Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" datasetConfigVersion = "0.6" >

 <Dataset name = "hsapiens_gene_ensembl" interface = "default" >
 <Filter name = "biotype" value = "protein_coding"/>
 <Attribute name = "ensembl_gene_id_version" />
-<Attribute name = "ensembl_transcript_id_version" />
-<Attribute name = "description" />
-<Attribute name = "external_gene_name" />
-<Attribute name = "ensembl_peptide_id_version" />
-<Attribute name = "entrezgene_id" />
-<Attribute name = "pdb" />
-<Attribute name = "refseq_mrna" />
+<Attribute name = "entrezgene_id" />
 </Dataset>
 </Query>""",
-        "colnames": [
-            "ensembl_gene_id_version",
-            "ensembl_transcript_id_version",
-            "description",
-            "external_gene_name",
-            "ensembl_peptide_id_version",
-            "entrezgene_id",
-            "pdb",
-            "refseq_mrna",
-        ],
-    },
-    "hugo_symbols": {
-        "query": """<?xml version="1.0" encoding="UTF-8"?>
+    "IDs": """<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE Query>
-<Query virtualSchemaName = "default" formatter = "CSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
+<Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" datasetConfigVersion = "0.6" >

 <Dataset name = "hsapiens_gene_ensembl" interface = "default" >
 <Filter name = "biotype" value = "protein_coding"/>
-<Attribute name = "hgnc_id" />
-<Attribute name = "hgnc_symbol" />
 <Attribute name = "ensembl_gene_id_version" />
+<Attribute name = "ensembl_transcript_id_version" />
 </Dataset>
 </Query>""",
-        "colnames": ["hgnc_id", "hgnc_symbol", "ensembl_gene_id_version"],
-    },
-    "IDs": {
-        "query": """<?xml version="1.0" encoding="UTF-8"?>
+    "proteins": """<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE Query>
-<Query virtualSchemaName = "default" formatter = "CSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
+<Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" datasetConfigVersion = "0.6" >

 <Dataset name = "hsapiens_gene_ensembl" interface = "default" >
 <Filter name = "biotype" value = "protein_coding"/>
-<Attribute name = "ensembl_gene_id" />
-<Attribute name = "ensembl_transcript_id" />
-<Attribute name = "ensembl_peptide_id" />
-<Attribute name = "version" />
-<Attribute name = "transcript_version" />
-<Attribute name = "peptide_version" />
+<Attribute name = "ensembl_transcript_id_version" />
+<Attribute name = "ensembl_peptide_id_version" />
+<Attribute name = "pdb" />
 <Attribute name = "refseq_mrna" />
-<Attribute name = "refseq_peptide" />
+<Attribute name = "refseq_peptide" />
+</Dataset>
+</Query>""",
+    "gene_names": """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" datasetConfigVersion = "0.6" >
+
+<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
+<Filter name = "biotype" value = "protein_coding"/>
+<Attribute name = "hgnc_id" />
+<Attribute name = "hgnc_symbol" />
+<Attribute name = "description" />
+<Attribute name = "ensembl_gene_id_version" />
 </Dataset>
 </Query>""",
-        "colnames": [
-            "ensembl_gene_id",
-            "ensembl_transcript_id",
-            "ensembl_peptide_id",
-            "version",
-            "transcript_version",
-            "peptide_version",
-            "refseq_mrna",
-            "refseq_peptide",
-        ],
-    },
 }
 """Hardpoints with Biomart data.

-In the form of 'table_name': {'query': xlm_query, 'colnames': [list of colnames]}
+In the form of 'table_name': 'xml_query'
 """

 TCDB = {
@@ -104,6 +80,7 @@

 IUPHAR_DB = "https://www.guidetopharmacology.org/DATA/public_iuphardb_v2022.2.zip"
 """URL to the download of the full IUPHAR database"""
+
 IUPHAR_COMPILED = {
     "targets+families": "https://www.guidetopharmacology.org/DATA/targets_and_families.csv",
     "ligands": "https://www.guidetopharmacology.org/DATA/ligands.csv",
@@ -112,7 +89,7 @@
 """URLs to the compiled IUPHAR data from their downloads page"""

 HUGO = {
-    "nomenclature": "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2021-03-01.txt",
+    "nomenclature": "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-04-01.txt",
     "groups": {
         # I could download json files, but most of the data is flat anyway, so...
         "endpoint": "https://www.genenames.org/cgi-bin/genegroup/download?id={id}&type=branch",
@@ -138,3 +115,4 @@
 """Hugo downloads as found on their download pages"""

 SLC_TABLES = "http://slc.bioparadigms.org/"
+"""URL to the SLC tables that have data regarding solute carriers"""
```

src/db_rebuilder/daedalus/errors.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,6 +5,6 @@ class CacheKeyError(Exception):


 class Abort(Exception):
-    """The program cannot continue, but the error was logged."""
+    """The program cannot continue, but the error was caught, logged, and we can exit gracefully."""

     pass
```
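The changelog describes the pattern behind this docstring change: failing parsers are skipped gracefully, with all failures dumped at once before aborting, since parsers only write to the database and never read from it. A rough sketch of that flow; the runner function and its names are hypothetical, not the project's actual code (the `Abort` class here is redefined only to keep the sketch self-contained):

```python
import logging

log = logging.getLogger(__name__)

class Abort(Exception):
    """The program cannot continue, but the error was caught, logged, and we can exit gracefully."""

def run_all_parsers(parsers: dict) -> None:
    """Run every parser, collecting failures instead of dying on the first.

    Parsers are independent of one another (they only write to the DB),
    so one failure does not invalidate the rest; all errors are logged
    and a single Abort is raised at the end.
    """
    failures: "dict[str, Exception]" = {}
    for name, parser in parsers.items():
        try:
            parser()
        except Exception as exc:  # collected, re-raised collectively below
            log.error("Parser %s failed: %s", name, exc)
            failures[name] = exc
    if failures:
        raise Abort(f"{len(failures)} parser(s) failed: {', '.join(failures)}")
```

This is what makes large-scale failures easier to debug: one run surfaces every broken parser, not just the first.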
File renamed without changes.

src/db_rebuilder/daedalus/manual_data/atp_driven_ABC_data.csv renamed to src/db_rebuilder/daedalus/local_data/atp_driven_ABC_data.csv

File renamed without changes.

src/db_rebuilder/schema.sql renamed to src/db_rebuilder/daedalus/local_data/schema.sql

Lines changed: 7 additions & 7 deletions
```diff
@@ -1,20 +1,20 @@
 CREATE TABLE gene_ids (
-    ensg_version TEXT UNIQUE NOT NULL, -- from biomart > IDs+desc > ensembl_gene_id_version
-    ensg TEXT PRIMARY KEY, -- from biomart > IDs+desc > ensembl_gene_id_version
-    ensg_version_leaf INT NOT NULL -- from biomart > IDs+desc > ensembl_gene_id_version
+    ensg_version TEXT UNIQUE NOT NULL, -- from biomart > IDs+desc > gene_stable_id_version
+    ensg TEXT PRIMARY KEY, -- from biomart > IDs+desc > gene_stable_id_version
+    ensg_version_leaf INT NOT NULL -- from biomart > IDs+desc > gene_stable_id_version
 );

 CREATE TABLE transcript_ids (
-    ensg TEXT NOT NULL, -- from biomart > IDs+desc > ensembl_gene_id_version
-    enst TEXT PRIMARY KEY, -- from biomart > IDs+desc > ensembl_transcript_id_version
+    ensg TEXT NOT NULL, -- from biomart > IDs+desc > gene_stable_id_version
+    enst TEXT PRIMARY KEY, -- from biomart > IDs+desc > transcript_stable_id_version
     enst_version TEXT UNIQUE NOT NULL, -- same as enst
     enst_version_leaf INT NOT NULL, -- same as enst
     is_canonical_isoform INT NOT NULL -- bool
 );

 CREATE TABLE mrna_refseq (
     -- These cannot be unique, as some refseq IDs are missing
-    enst TEXT NOT NULL, -- from biomart > IDs+desc > ensembl_transcript_id_version
+    enst TEXT NOT NULL, -- from biomart > IDs+desc > transcript_stable_id_version
     refseq_transcript_id TEXT -- from biomart > IDs+desc > refseq_mrna
     -- refseq_transcript_id_version INT -- MISSING?? No version for refseq?
     -- refseq_transcrpit_id_version_leaf INT -- See aboveref
@@ -29,7 +29,7 @@ CREATE TABLE protein_ids (
 );

 CREATE TABLE gene_names (
-    ensg TEXT, -- from biomart > IDs+desc > ensembl_gene_id_version
+    ensg TEXT, -- from biomart > IDs+desc > gene_stable_id_version
     hugo_gene_id TEXT, -- from biomart > hugo_symbols > hgnc_id
     hugo_gene_symbol TEXT, -- from biomart > hugo_symbols > hugo_gene symbol
     -- (double check with the description field below)
```
File renamed without changes.
