-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprice1.py
More file actions
153 lines (123 loc) · 5.78 KB
/
price1.py
File metadata and controls
153 lines (123 loc) · 5.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import glob
import pandas as pd
from typing import Optional, List
from rdkit import Chem
# ---------------------------------------------------------------------------
# Directories holding the unpacked Mcule catalogue CSV files.
# Point MCULE_CSV_DIRS at wherever the archives were extracted; every *.csv
# found beneath these directories (sub-folders included) is searched.
# ---------------------------------------------------------------------------
MCULE_CSV_DIRS = ["C:/Users/dpsso/Downloads/mcule"]
# Persistent buyables database; this is the default save_path of
# calculate_cost (noted as the fallback read by get_price in
# tree_search_global_greedy.py).
BUYABLES_CSV = "data/buyables.csv"
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _canonical(smiles: str) -> Optional[str]:
    """Canonicalise *smiles* with RDKit.

    Returns the canonical SMILES string, or None when RDKit cannot parse the
    input or raises while processing it.
    """
    try:
        parsed = Chem.MolFromSmiles(smiles)
        canonical = Chem.MolToSmiles(parsed) if parsed is not None else None
    except Exception:
        # Best-effort: any failure is reported as "unparseable".
        canonical = None
    return canonical
def _load_mcule_dataframes() -> pd.DataFrame:
    """
    Build one catalogue frame from every CSV found under MCULE_CSV_DIRS.

    Column names are stripped and lower-cased before use, so the required
    'smiles' and 'best_price' headers are matched case-insensitively. Only
    those two columns are kept; rows without a SMILES value are dropped.
    Files that cannot be read, or that lack a required column, are reported
    on stdout and skipped.

    Returns a frame with 'smiles', 'best_price' and a pre-computed
    'canonical' column, or an empty frame with just 'smiles' and
    'best_price' when nothing usable was found.
    """
    wanted = ["smiles", "best_price"]
    collected = []
    for root in MCULE_CSV_DIRS:
        # "**" together with recursive=True also descends into sub-folders.
        csv_files = glob.glob(os.path.join(root, "**", "*.csv"), recursive=True)
        for csv_path in csv_files:
            try:
                table = pd.read_csv(csv_path)
                # Normalise headers so the lookup below is case-insensitive.
                table.columns = [name.strip().lower() for name in table.columns]
                if all(column in table.columns for column in wanted):
                    collected.append(table[wanted].dropna(subset=["smiles"]))
                else:
                    print(f"[price] Skipping {csv_path}: missing 'smiles' or 'best_price' column")
            except Exception as e:
                print(f"[price] Could not read {csv_path}: {e}")

    if collected:
        catalogue = pd.concat(collected, ignore_index=True)
        # Canonicalise the whole catalogue once here instead of per lookup.
        catalogue["canonical"] = catalogue["smiles"].apply(_canonical)
        return catalogue
    return pd.DataFrame(columns=wanted)
# Loaded lazily on the first _get_db() call, then cached so repeated calls are fast
_MCULE_DB: Optional[pd.DataFrame] = None


def _get_db() -> pd.DataFrame:
    """Return the cached Mcule catalogue, loading it on the first call."""
    global _MCULE_DB
    if _MCULE_DB is not None:
        return _MCULE_DB
    # First use: read the CSV files and remember the result for later calls.
    _MCULE_DB = _load_mcule_dataframes()
    print(f"[price] Loaded {len(_MCULE_DB):,} Mcule entries from CSV files.")
    return _MCULE_DB
# ---------------------------------------------------------------------------
# Public API (drop-in replacement for the original price.py)
# ---------------------------------------------------------------------------
def _lookup_mcule_price(db: pd.DataFrame, smiles: str) -> Optional[float]:
    """Return the lowest catalogue price recorded for *smiles*, or None if unknown."""
    if db.empty:
        # The empty placeholder catalogue has no 'canonical' column to query.
        return None

    canon = _canonical(smiles)
    if canon is not None:
        # Canonical comparison lets equivalent SMILES written differently match.
        matches = db[db["canonical"] == canon]
    else:
        matches = db.iloc[0:0]
    if matches.empty:
        # Raw-string fallback. This is the only way to hit catalogue rows for
        # a query RDKit cannot parse; for parseable queries an identical raw
        # string would already have matched on the canonical column.
        matches = db[db["smiles"] == smiles]
    if matches.empty:
        return None

    # Lowest price across all catalogue files; non-numeric cells become NaN.
    best_price = pd.to_numeric(matches["best_price"], errors="coerce").min()
    return float(best_price) if pd.notna(best_price) else None


def calculate_cost(smiles_list: List[str], save_path: str = BUYABLES_CSV) -> list:
    """
    Look up chemical purchase costs from local Mcule CSV files.

    Each SMILES is matched against the catalogue by canonical SMILES, falling
    back to an exact match on the raw catalogue string. The lowest
    'best_price' among the matches is returned; compounds with no match (or
    no numeric price) are returned as None.

    Priced compounds are also recorded in the buyables CSV at *save_path*:
    a new row is appended for an unseen SMILES, and an existing row is
    updated only when the new price is lower or the stored price is missing
    or non-numeric. The file is rewritten on every call.

    Parameters
    ----------
    smiles_list : list of str
        SMILES strings to price.
    save_path : str
        Path to the persistent buyables CSV (default: 'data/buyables.csv').
        Its parent directory is created when the file does not exist yet.

    Returns
    -------
    list
        Prices (documented by this module as USD/gram) in the same order as
        smiles_list; None where unknown.
    """
    db = _get_db()

    # Load (or create) the persistent buyables database
    if os.path.exists(save_path):
        buyables = pd.read_csv(save_path)
    else:
        parent_dir = os.path.dirname(save_path)
        # dirname is "" for a bare file name, and os.makedirs("") would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        buyables = pd.DataFrame(columns=["smiles", "ppg", "source"])

    usd_per_g_prices = []
    for smiles in smiles_list:
        cost = _lookup_mcule_price(db, smiles)
        usd_per_g_prices.append(cost)
        if cost is None:
            continue

        # Recomputed per SMILES because rows may have been appended above.
        known = buyables["smiles"] == smiles
        if known.any():
            stored = pd.to_numeric(buyables.loc[known, "ppg"], errors="coerce")
            existing = stored.iloc[0]
            # A missing/non-numeric stored price must not block the update.
            if pd.isna(existing) or cost < existing:
                buyables.loc[known, "ppg"] = cost
                buyables.loc[known, "source"] = "mcule_csv"
        else:
            new_row = pd.DataFrame(
                {"smiles": [smiles], "ppg": [cost], "source": ["mcule_csv"]}
            )
            buyables = pd.concat([buyables, new_row], ignore_index=True)

    # Persist any new/updated entries
    buyables.to_csv(save_path, index=False)
    return usd_per_g_prices