From fce6df56e7b511d25a011126fefcdb049d5269f7 Mon Sep 17 00:00:00 2001
From: Satkar Juneja <satkarjuneja@gmail.com>
Date: Thu, 19 Mar 2026 13:30:24 +0530
Subject: [PATCH] Doc: Added Documentation for how to add datasets

---
 .../examples/how_to_add_custom_datasets.ipynb | 400 ++++++++++++++++++
 1 file changed, 400 insertions(+)
 create mode 100644 website/examples/how_to_add_custom_datasets.ipynb

diff --git a/website/examples/how_to_add_custom_datasets.ipynb b/website/examples/how_to_add_custom_datasets.ipynb
new file mode 100644
index 00000000..8be78606
--- /dev/null
+++ b/website/examples/how_to_add_custom_datasets.ipynb
@@ -0,0 +1,400 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3793337f",
+   "metadata": {},
+   "source": [
+    "# How to Add Custom Datasets to AtomDB\n",
+    "This notebook walks through the steps required to integrate a custom dataset into AtomDB."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b1092f0",
+   "metadata": {},
+   "source": [
+    "## 1. Import Required Libraries\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b4fd23a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os \n",
+    "import numpy as np\n",
+    "import atomdb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5a9eb2a6",
+   "metadata": {},
+   "source": [
+    "## 2. Define Custom Dataset Structure\n",
+    "\n",
+    "A custom dataset must live under `atomdb/datasets/<your_name>/` and include:\n",
+    "\n",
+    "- A `run.py` file exposing a `run(elem, charge, mult, nexc, dataset, datapath)` function that compiles raw data into a `Species` instance.\n",
+    "- A `raw/` folder containing the raw data files that `run()` will read. This is where AtomDB looks for the source data (e.g. `.slater` files, Gaussian output files, etc.) before compiling them into the database cache.\n",
+    "\n",
+    "```\n",
+    "atomdb/datasets/custom_ds/\n",
+    "├── run.py\n",
+    "└── raw/\n",
+    "    ├── H_0_2.mydata\n",
+    "    ├── He_0_1.mydata\n",
+    "    └── ...\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2011bc7d",
+   "metadata": {},
+   "source": [
+    "## 3. Implement the `run()` Function\n",
+    "\n",
+    "The `run()` function is the entry point AtomDB calls when compiling a species. It must read the raw data files from the `raw/` folder, compute the relevant properties, and return an `atomdb.Species` instance.\n",
+    "\n",
+    "There are two existing datasets that serve as good reference implementations:\n",
+    "\n",
+    "- **Slater** ([`atomdb/datasets/slater/run.py`](https://github.com/theochem/AtomDB/blob/master/atomdb/datasets/slater/run.py)): reads `.slater` files containing Hartree-Fock Slater-type orbital coefficients and exponents. A good template if your raw data describes atomic wavefunctions in terms of Slater-type orbitals.\n",
+    "\n",
+    "- **Gaussian** ([`atomdb/datasets/gaussian/run.py`](https://github.com/theochem/AtomDB/blob/master/atomdb/datasets/gaussian/run.py)): reads Gaussian basis set output files. A good template if your raw data comes from Gaussian calculations using a contracted Gaussian basis set.\n",
+    "\n",
+    "The key things `run()` must do:\n",
+    "1. Parse the raw data file for the given element, charge, and multiplicity.\n",
+    "2. Build a `fields` dictionary with the required `Species` fields (elem, atnum, nelec, nspin, etc.).\n",
+    "3. Return `atomdb.Species(dataset, fields)`.\n",
+    "\n",
+    "Note that the return value is **not a plain dictionary** — it must be an `atomdb.Species` instance constructed from the fields dict."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e22dcc8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "#run.py template\n",
+    "import atomdb\n",
+    "import numpy as np\n",
+    "from atomdb.periodic import Element\n",
+    "import os\n",
+    "\n",
+    "\n",
+    "def run(elem, charge, mult, nexc, dataset, datapath):\n",
+    "    \"\"\"Compile a species from custom raw data into AtomDB.\"\"\"\n",
+    "    #element info\n",
+    "    elem = atomdb.element_symbol(elem)\n",
+    "    atnum = atomdb.element_number(elem)\n",
+    "    nelec = atnum - charge\n",
+    "    nspin = mult - 1\n",
+    "\n",
+    "    # Load your raw data file from the raw/ folder\n",
+    "    raw_file = os.path.join(datapath, dataset, \"raw\", f\"{elem}_{charge}_{mult}.mydata\")\n",
+    "    data = np.loadtxt(raw_file)\n",
+    "    rs       = data[:, 0] \n",
+    "    dens_tot = data[:, 1] \n",
+    "    \n",
+    "    # Get element-level properties (radii, mass, etc.) for neutral species\n",
+    "    atom = Element(elem)\n",
+    "    cov_radius, vdw_radius, at_radius, polarizability, dispersion = [None] * 5\n",
+    "    if charge == 0:\n",
+    "        cov_radius = atom.cov_radius\n",
+    "        vdw_radius = atom.vdw_radius\n",
+    "        at_radius = atom.at_radius\n",
+    "        polarizability = atom.pold\n",
+    "        dispersion = {\"C6\": atom.c6}\n",
+    "\n",
+    "    # Build the fields dict; keys must match the Species constructor\n",
+    "    fields = dict(\n",
+    "        elem=elem,\n",
+    "        atnum=atnum,\n",
+    "        obasis_name=dataset, #name of your dataset\n",
+    "        nelec=nelec,\n",
+    "        nspin=nspin,\n",
+    "        nexc=nexc,\n",
+    "        atmass=atom.mass,\n",
+    "        cov_radius=cov_radius,\n",
+    "        vdw_radius=vdw_radius,\n",
+    "        at_radius=at_radius,\n",
+    "        polarizability=polarizability,\n",
+    "        dispersion=dispersion,\n",
+    "        energy=None,       # fill in from your raw data\n",
+    "        # ... add density, ked, mo arrays etc. as needed\n",
+    "    )\n",
+    "\n",
+    "    # Must return a Species instance, not a plain dict\n",
+    "    return atomdb.Species(dataset, fields)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6fd5dcac",
+   "metadata": {},
+   "source": [
+    "## 4. Compile and Store the Dataset\n",
+    "\n",
+    "Once `run.py` and the `raw/` folder are in place, compile the raw data into \n",
+    "AtomDB's binary cache using `atomdb.compile()`. This calls your `run()` function \n",
+    "and writes the result to disk.\n",
+    "```python\n",
+    "atomdb.compile(\"H\", charge=0, mult=2, nexc=0, dataset=\"custom_ds\", datapath=datapath)\n",
+    "```\n",
+    "```bash\n",
+    "# CLI equivalent\n",
+    "atomdb compile custom_ds H 0 2\n",
+    "```\n",
+    "\n",
+    "\n",
+    "**Note on dependencies:** `compile()` imports and runs your dataset's `run.py` directly, \n",
+    "so any packages your `run.py` depends on must be installed before calling it. \n",
+    "For example, the built-in Slater dataset requires the `grid` package from the theochem \n",
+    "group. Make sure you document the dependencies of your custom dataset similarly so \n",
+    "other contributors know what to install.\n",
+    "\n",
+    "For built-in datasets like Slater, pre-compiled data is downloaded automatically \n",
+    "when you call `atomdb.load()` — you do not need to run `compile()` manually \n",
+    "unless you are modifying the dataset itself."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ffcda12",
+   "metadata": {},
+   "source": [
+    "## 5. Load and Validate the Custom Dataset\n",
+    "\n",
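+    "After compiling (which also writes the result to disk), use `atomdb.load()` to retrieve the species from the cache and confirm it was stored correctly. The cell below uses the built-in `slater` dataset as a stand-in; substitute the name of your own dataset."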
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fbf65a6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Element:         H\n",
+      "Atomic number:   1\n",
+      "Num electrons:   1\n",
+      "Charge:          0\n",
+      "Multiplicity:    2\n",
+      "Dataset:         slater\n",
+      "Energy:          -0.5\n"
+     ]
+    }
+   ],
+   "source": [
+    "import atomdb\n",
+    "\n",
+    "# Replace 'slater' with the name of your custom dataset\n",
+    "sp = atomdb.load(\"H\", charge=0, mult=2, nexc=0, dataset=\"slater\")\n",
+    "\n",
+    "# Inspect basic species properties\n",
+    "print(\"Element:        \", sp.elem)\n",
+    "print(\"Atomic number:  \", sp.atnum)\n",
+    "print(\"Num electrons:  \", sp.nelec)\n",
+    "print(\"Charge:         \", sp.charge)\n",
+    "print(\"Multiplicity:   \", sp.mult)\n",
+    "print(\"Dataset:        \", sp.dataset)\n",
+    "print(\"Energy:         \", sp.energy)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c046bf8e",
+   "metadata": {},
+   "source": [
+    "## 6. Example: Using the Custom Dataset\n",
+    "\n",
+    "Let's walk through a complete example of creating a custom dataset from scratch."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e6f53a45",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset folder: /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata\n",
+      "Raw folder:     /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata/raw\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import atomdb\n",
+    "\n",
+    "# This is where we need to put our run.py\n",
+    "atomdb_path = os.path.dirname(atomdb.__file__)\n",
+    "datapath = os.path.join(atomdb_path, \"datasets\") \n",
+    "dataset_path = os.path.join(atomdb_path, \"datasets\", \"mydata\")\n",
+    "raw_path = os.path.join(dataset_path, \"raw\")\n",
+    "\n",
+    "os.makedirs(dataset_path, exist_ok=True)\n",
+    "os.makedirs(raw_path, exist_ok=True)\n",
+    "\n",
+    "print(\"Dataset folder:\", dataset_path)\n",
+    "print(\"Raw folder:    \", raw_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f3c45640",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Raw file written: /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata/raw/H_0_2.mydata\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# dummy radial grid and density values for hydrogen\n",
+    "r = np.linspace(0.001, 10.0, 1000)\n",
+    "dens = np.exp(-2 * r) / np.pi  #H 1s density\n",
+    "\n",
+    "raw_file = os.path.join(raw_path, \"H_0_2.mydata\")\n",
+    "# The file name must follow the <elem>_<charge>_<mult> pattern that run() uses to locate it: H = element symbol, 0 = charge, 2 = multiplicity\n",
+    "np.savetxt(raw_file, np.column_stack([r, dens]))\n",
+    "print(\"Raw file written:\", raw_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "f1187116",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "run.py written: /home/username/.local/lib/python3.13/site-packages/atomdb/datasets/mydata/run.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Creating the run.py file\n",
+    "\n",
+    "run_py = '''\n",
+    "import os\n",
+    "import numpy as np\n",
+    "import atomdb\n",
+    "from atomdb.periodic import Element\n",
+    "\n",
+    "def run(elem, charge, mult, nexc, dataset, datapath):\n",
+    "    # resolve element info\n",
+    "    elem = atomdb.element_symbol(elem)\n",
+    "    atnum = atomdb.element_number(elem)\n",
+    "    nelec = atnum - charge\n",
+    "    nspin = mult - 1\n",
+    "\n",
+    "    # load raw data file\n",
+    "    raw_file = os.path.join(datapath, dataset, \"raw\", f\"{elem}_{charge}_{mult}.mydata\")\n",
+    "    data = np.loadtxt(raw_file)\n",
+    "    rs, dens_tot = data[:, 0], data[:, 1]\n",
+    "\n",
+    "    # build fields and return Species\n",
+    "    fields = dict(\n",
+    "        elem=elem,\n",
+    "        atnum=atnum,\n",
+    "        obasis_name=dataset,\n",
+    "        nelec=nelec,\n",
+    "        nspin=nspin,\n",
+    "        nexc=nexc,\n",
+    "        atmass=Element(elem).mass,\n",
+    "        energy=-0.5,\n",
+    "        rs=rs,\n",
+    "        dens_tot=dens_tot,\n",
+    "    )\n",
+    "    return atomdb.Species(dataset, fields)\n",
+    "'''\n",
+    "\n",
+    "run_file = os.path.join(dataset_path, \"run.py\")\n",
+    "with open(run_file, \"w\") as f:\n",
+    "    f.write(run_py)\n",
+    "\n",
+    "# also need an __init__.py so Python treats it as a package\n",
+    "with open(os.path.join(dataset_path, \"__init__.py\"), \"w\") as f:\n",
+    "    f.write(\"\")\n",
+    "\n",
+    "print(\"run.py written:\", run_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "1506c78a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Element:       H\n",
+      "Atomic number: 1\n",
+      "Electrons:     1\n",
+      "Charge:        0\n",
+      "Mult:          2\n",
+      "Energy:        -0.5\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Step 1 — compile() runs your run.py AND dumps to disk in one go\n",
+    "atomdb.compile(\"H\", charge=0, mult=2, nexc=0, dataset=\"mydata\", datapath=datapath)\n",
+    "\n",
+    "# Step 2 — load() reads what compile() wrote to disk\n",
+    "sp = atomdb.load(\"H\", charge=0, mult=2, nexc=0, dataset=\"mydata\", datapath=datapath)\n",
+    "print(\"Element:      \", sp.elem)\n",
+    "print(\"Atomic number:\", sp.atnum)\n",
+    "print(\"Electrons:    \", sp.nelec)\n",
+    "print(\"Charge:       \", sp.charge)\n",
+    "print(\"Mult:         \", sp.mult)\n",
+    "print(\"Energy:       \", sp.energy)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}