diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..b04fb15 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,10 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3f0c0ab --- /dev/null +++ b/.gitignore @@ -0,0 +1,171 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# configs/ # commented as new configs can be added as a part of a feature + +/.idea +/data +/logs +/results_buffer +electra_pretrained.ckpt +.isort.cfg +/.vscode diff --git a/chebai_graph/preprocessing/datasets/chebi.py b/chebai_graph/preprocessing/datasets/chebi.py index f84b3a5..60a711f 100644 --- a/chebai_graph/preprocessing/datasets/chebi.py +++ b/chebai_graph/preprocessing/datasets/chebi.py @@ -1,26 +1,26 @@ -from typing import Optional, List, Callable +import importlib +import os +from typing import Callable, List, Optional +import pandas as pd +import torch +import tqdm +from chebai.preprocessing.datasets.base import XYBaseDataModule from chebai.preprocessing.datasets.chebi import ( ChEBIOver50, ChEBIOver100, ChEBIOverXPartial, ) -from chebai.preprocessing.datasets.base import XYBaseDataModule from lightning_utilities.core.rank_zero import rank_zero_info +from torch_geometric.data.data import Data as GeomData -from chebai_graph.preprocessing.reader import GraphReader, GraphPropertyReader +import chebai_graph.preprocessing.properties as graph_properties from chebai_graph.preprocessing.properties import ( AtomProperty, BondProperty, MolecularProperty, ) -import pandas as pd -from torch_geometric.data.data import Data as GeomData -import torch -import chebai_graph.preprocessing.properties as graph_properties -import importlib -import os -import tqdm +from chebai_graph.preprocessing.reader import GraphPropertyReader, GraphReader class ChEBI50GraphData(ChEBIOver50): @@ -84,9 +84,11 @@ def _setup_properties(self): for file in file_names: # processed_dir_main only exists for ChEBI datasets path = os.path.join( - self.processed_dir_main - if hasattr(self, "processed_dir_main") - else self.raw_dir, + ( + self.processed_dir_main + if hasattr(self, "processed_dir_main") + else self.raw_dir + ), file, ) raw_data += list(self._load_dict(path)) @@ -94,8 +96,8 @@ def _setup_properties(self): features = [row["features"] for row in raw_data] # use vectorized 
version of encode function, apply only if value is present - enc_if_not_none = ( - lambda encode, value: [encode(atom_v) for atom_v in value] + enc_if_not_none = lambda encode, value: ( + [encode(atom_v) for atom_v in value] if value is not None and len(value) > 0 else None ) @@ -134,11 +136,14 @@ def get_property_path(self, property: MolecularProperty): f"{property.name}_{property.encoder.name}.pt", ) - def setup(self, **kwargs): - super().setup(keep_reader=True, **kwargs) - self._setup_properties() + def _after_setup(self, **kwargs): + """ + Finalize the setup process after ensuring the processed data is available. - self.reader.on_finish() + This method first runs `_setup_properties()` and then delegates the remaining post-setup work to the parent class's `_after_setup()`. + """ + self._setup_properties() + super()._after_setup(**kwargs) def _merge_props_into_base(self, row): geom_data = row["features"]