From fce6df56e7b511d25a011126fefcdb049d5269f7 Mon Sep 17 00:00:00 2001 From: Satkar Juneja Date: Thu, 19 Mar 2026 13:30:24 +0530 Subject: [PATCH] Doc: Added Documentation for how to add datasets --- .../examples/how_to_add_custom_datasets.ipynb | 400 ++++++++++++++++++ 1 file changed, 400 insertions(+) create mode 100644 website/examples/how_to_add_custom_datasets.ipynb diff --git a/website/examples/how_to_add_custom_datasets.ipynb b/website/examples/how_to_add_custom_datasets.ipynb new file mode 100644 index 00000000..8be78606 --- /dev/null +++ b/website/examples/how_to_add_custom_datasets.ipynb @@ -0,0 +1,400 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3793337f", + "metadata": {}, + "source": [ + "# How to Add Custom Datasets to AtomDB\n", + "This notebook walks through the steps required to integrate a custom dataset into AtomDB." + ] + }, + { + "cell_type": "markdown", + "id": "5b1092f0", + "metadata": {}, + "source": [ + "## 1. Import Required Libraries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b4fd23a2", + "metadata": {}, + "outputs": [], + "source": [ + "import os \n", + "import numpy as np\n", + "import atomdb" + ] + }, + { + "cell_type": "markdown", + "id": "5a9eb2a6", + "metadata": {}, + "source": [ + "## 2. Define Custom Dataset Structure\n", + "\n", + "A custom dataset must live under `atomdb/datasets/<dataset_name>/` and include:\n", + "\n", + "- A `run.py` file exposing a `run(elem, charge, mult, nexc, dataset, datapath)` function that compiles raw data into a `Species` instance.\n", + "- A `raw/` folder containing the raw data files that `run()` will read. This is where AtomDB looks for the source data (e.g. `.slater` files, Gaussian output files, etc.) 
before compiling them into the database cache.\n", + "\n", + "```\n", + "atomdb/datasets/custom_ds/\n", + "├── run.py\n", + "└── raw/\n", + " ├── H_0_2.mydata\n", + " ├── He_0_1.mydata\n", + " └── ...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "2011bc7d", + "metadata": {}, + "source": [ + "## 3. Implement the `run()` Function\n", + "\n", + "The `run()` function is the entry point AtomDB calls when compiling a species. It must read the raw data files from the `raw/` folder, compute the relevant properties, and return an `atomdb.Species` instance.\n", + "\n", + "There are two existing datasets that serve as good reference implementations:\n", + "\n", + "- **Slater** ([`atomdb/datasets/slater/run.py`](https://github.com/theochem/AtomDB/blob/master/atomdb/datasets/slater/run.py)): reads `.slater` files containing Hartree-Fock Slater-type orbital coefficients and exponents. A good template if your raw data describes atomic wavefunctions in terms of Slater-type orbitals.\n", + "\n", + "- **Gaussian** ([`atomdb/datasets/gaussian/run.py`](https://github.com/theochem/AtomDB/blob/master/atomdb/datasets/gaussian/run.py)): reads Gaussian basis set output files. A good template if your raw data comes from Gaussian calculations using a contracted Gaussian basis set.\n", + "\n", + "The key things `run()` must do:\n", + "1. Parse the raw data file for the given element, charge, and multiplicity.\n", + "2. Build a `fields` dictionary with the required `Species` fields (elem, atnum, nelec, nspin, etc.).\n", + "3. Return `atomdb.Species(dataset, fields)`.\n", + "\n", + "Note that the return value is **not a plain dictionary** — it must be an `atomdb.Species` instance constructed from the fields dict." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e22dcc8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "#run.py template\n", + "import atomdb\n", + "import numpy as np\n", + "from atomdb.periodic import Element\n", + "import os\n", + "\n", + "\n", + "def run(elem, charge, mult, nexc, dataset, datapath):\n", + " \"\"\"Compile a species from custom raw data into AtomDB.\"\"\"\n", + " #element info\n", + " elem = atomdb.element_symbol(elem)\n", + " atnum = atomdb.element_number(elem)\n", + " nelec = atnum - charge\n", + " nspin = mult - 1\n", + "\n", + " # Load your raw data file from the raw/ folder\n", + " raw_file = os.path.join(datapath, dataset, \"raw\", f\"{elem}_{charge}_{mult}.mydata\")\n", + " data = np.loadtxt(raw_file)\n", + " rs = data[:, 0] \n", + " dens_tot = data[:, 1] \n", + " \n", + " # Get element-level properties (radii, mass, etc.) for neutral species\n", + " atom = Element(elem)\n", + " cov_radius, vdw_radius, at_radius, polarizability, dispersion = [None] * 5\n", + " if charge == 0:\n", + " cov_radius = atom.cov_radius\n", + " vdw_radius = atom.vdw_radius\n", + " at_radius = atom.at_radius\n", + " polarizability = atom.pold\n", + " dispersion = {\"C6\": atom.c6}\n", + "\n", + " # Build the fields dict keys must match the Species constructor\n", + " fields = dict(\n", + " elem=elem,\n", + " atnum=atnum,\n", + " obasis_name=dataset, #name of your dataset\n", + " nelec=nelec,\n", + " nspin=nspin,\n", + " nexc=nexc,\n", + " atmass=atom.mass,\n", + " cov_radius=cov_radius,\n", + " vdw_radius=vdw_radius,\n", + " at_radius=at_radius,\n", + " polarizability=polarizability,\n", + " dispersion=dispersion,\n", + " energy=None, # fill in from your raw data\n", + " # ... add density, ked, mo arrays etc. 
as needed\n", + " )\n", + "\n", + " # Must return a Species instance, not a plain dict\n", + " return atomdb.Species(dataset, fields)" + ] + }, + { + "cell_type": "markdown", + "id": "6fd5dcac", + "metadata": {}, + "source": [ + "## 4. Compile and Store the Dataset\n", + "\n", + "Once `run.py` and the `raw/` folder are in place, compile the raw data into \n", + "AtomDB's binary cache using `atomdb.compile()`. This calls your `run()` function \n", + "and writes the result to disk.\n", + "```python\n", + "atomdb.compile(\"H\", charge=0, mult=2, nexc=0, dataset=\"custom_ds\", datapath=datapath)\n", + "```\n", + "```bash\n", + "# CLI equivalent\n", + "atomdb compile custom_ds H 0 2\n", + "```\n", + "\n", + "\n", + "**Note on dependencies:** `compile()` imports and runs your dataset's `run.py` directly, \n", + "so any packages your `run.py` depends on must be installed before calling it. \n", + "For example, the built-in Slater dataset requires the `grid` package from the theochem \n", + "group. Make sure you document the dependencies of your custom dataset similarly so \n", + "other contributors know what to install.\n", + "\n", + "For built-in datasets like Slater, pre-compiled data is downloaded automatically \n", + "when you call `atomdb.load()` — you do not need to run `compile()` manually \n", + "unless you are modifying the dataset itself." + ] + }, + { + "cell_type": "markdown", + "id": "0ffcda12", + "metadata": {}, + "source": [ + "## 5. Load and Validate the Custom Dataset\n", + "\n", + "After compiling and dumping, use `atomdb.load()` to retrieve the species from the cache and confirm it was stored correctly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fbf65a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Element: H\n", + "Atomic number: 1\n", + "Num electrons: 1\n", + "Charge: 0\n", + "Multiplicity: 2\n", + "Dataset: slater\n", + "Energy: -0.5\n" + ] + } + ], + "source": [ + "import atomdb\n", + "\n", + "#Replace slater with your custom dataset\n", + "sp = atomdb.load(\"H\", charge=0, mult=2, nexc=0, dataset=\"slater\")\n", + "\n", + "# Inspect basic species properties\n", + "print(\"Element: \", sp.elem)\n", + "print(\"Atomic number: \", sp.atnum)\n", + "print(\"Num electrons: \", sp.nelec)\n", + "print(\"Charge: \", sp.charge)\n", + "print(\"Multiplicity: \", sp.mult)\n", + "print(\"Dataset: \", sp.dataset)\n", + "print(\"Energy: \", sp.energy)" + ] + }, + { + "cell_type": "markdown", + "id": "c046bf8e", + "metadata": {}, + "source": [ + "## 6. Example: Using the Custom Dataset\n", + "\n", + "Let's run through a full example of how to create a dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e6f53a45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset folder: /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata\n", + "Raw folder: /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata/raw\n" + ] + } + ], + "source": [ + "import os\n", + "import atomdb\n", + "\n", + "# This is where we need to put our run.py\n", + "atomdb_path = os.path.dirname(atomdb.__file__)\n", + "datapath = os.path.join(atomdb_path, \"datasets\") \n", + "dataset_path = os.path.join(atomdb_path, \"datasets\", \"mydata\")\n", + "raw_path = os.path.join(dataset_path, \"raw\")\n", + "\n", + "os.makedirs(dataset_path, exist_ok=True)\n", + "os.makedirs(raw_path, exist_ok=True)\n", + "\n", + "print(\"Dataset folder:\", dataset_path)\n", + "print(\"Raw folder: \", raw_path)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 7, + "id": "f3c45640", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw file written: /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata/raw/H_0_2.mydata\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# dummy radial grid and density values for hydrogen\n", + "r = np.linspace(0.001, 10.0, 1000)\n", + "dens = np.exp(-2 * r) / np.pi #H 1s density\n", + "\n", + "raw_file = os.path.join(raw_path, \"H_0_2.mydata\")\n", + "#H_0_2 is just a naming convention H is element symbol 0 is charge and 2 is multiplicity\n", + "np.savetxt(raw_file, np.column_stack([r, dens]))\n", + "print(\"Raw file written:\", raw_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f1187116", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "run.py written: /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata/run.py\n" + ] + } + ], + "source": [ + "#Creating the run.py file\n", + "\n", + "run_py = '''\n", + "import os\n", + "import numpy as np\n", + "import atomdb\n", + "from atomdb.periodic import Element\n", + "\n", + "def run(elem, charge, mult, nexc, dataset, datapath):\n", + " # resolve element info\n", + " elem = atomdb.element_symbol(elem)\n", + " atnum = atomdb.element_number(elem)\n", + " nelec = atnum - charge\n", + " nspin = mult - 1\n", + "\n", + " # load raw data file\n", + " raw_file = os.path.join(datapath, dataset, \"raw\", f\"{elem}_{charge}_{mult}.mydata\")\n", + " data = np.loadtxt(raw_file)\n", + " rs, dens_tot = data[:, 0], data[:, 1]\n", + "\n", + " # build fields and return Species\n", + " fields = dict(\n", + " elem=elem,\n", + " atnum=atnum,\n", + " obasis_name=dataset,\n", + " nelec=nelec,\n", + " nspin=nspin,\n", + " nexc=nexc,\n", + " atmass=Element(elem).mass,\n", + " energy=-0.5,\n", + " rs=rs,\n", + " dens_tot=dens_tot,\n", + " )\n", + " return 
atomdb.Species(dataset, fields)\n", + "'''\n", + "\n", + "run_file = os.path.join(dataset_path, \"run.py\")\n", + "with open(run_file, \"w\") as f:\n", + " f.write(run_py)\n", + "\n", + "# also need an __init__.py so Python treats it as a package\n", + "with open(os.path.join(dataset_path, \"__init__.py\"), \"w\") as f:\n", + " f.write(\"\")\n", + "\n", + "print(\"run.py written:\", run_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1506c78a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Element: H\n", + "Atomic number: 1\n", + "Electrons: 1\n", + "Charge: 0\n", + "Mult: 2\n", + "Energy: -0.5\n" + ] + } + ], + "source": [ + "# Step 1 — compile() runs your run.py AND dumps to disk in one go\n", + "atomdb.compile(\"H\", charge=0, mult=2, nexc=0, dataset=\"mydata\", datapath=datapath)\n", + "\n", + "# Step 2 — load() reads what compile() wrote to disk\n", + "sp = atomdb.load(\"H\", charge=0, mult=2, nexc=0, dataset=\"mydata\", datapath=datapath)\n", + "print(\"Element: \", sp.elem)\n", + "print(\"Atomic number:\", sp.atnum)\n", + "print(\"Electrons: \", sp.nelec)\n", + "print(\"Charge: \", sp.charge)\n", + "print(\"Mult: \", sp.mult)\n", + "print(\"Energy: \", sp.energy)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}