Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions mdfactory/analysis/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,97 @@ class SlurmConfig:
constraint: str | None = None
job_name_prefix: str = "mdfactory-analysis"

@classmethod
def from_cluster(
    cls,
    *,
    needs_gpu: bool = False,
    time: str = "2h",
    cpus_per_task: int = 4,
    mem_gb: int = 8,
    qos: str | None = None,
    constraint: str | None = None,
    job_name_prefix: str = "mdfactory-analysis",
) -> "SlurmConfig":
    """Create SlurmConfig from autodiscovered cluster info.

    Uses lazy import of ``mdfactory.performance.cluster`` to avoid
    hard dependency on SLURM commands at import time.

    Parameters
    ----------
    needs_gpu : bool
        Forwarded to ``select_partition`` as the GPU requirement.
    time : str
        Job time limit (default: "2h").
    cpus_per_task : int
        CPUs per task (default: 4). Also used as the minimum CPU
        requirement when selecting a partition.
    mem_gb : int
        Memory per task in GB (default: 8). Also used as the minimum
        memory requirement when selecting a partition.
    qos : str or None
        Quality of service, passed through unchanged. If None, no QOS is
        auto-selected from the cluster; the field stays None so the
        choice is left to SLURM.
    constraint : str or None
        SLURM constraint string.
    job_name_prefix : str
        Prefix for SLURM job names.

    Returns
    -------
    SlurmConfig
        Configured instance with autodiscovered account and partition.

    Raises
    ------
    RuntimeError
        If cluster discovery returns nothing, no suitable partition is
        found, or the cluster reports no default account.
    """
    from mdfactory.performance.cluster import discover_cluster, select_partition

    cluster = discover_cluster()
    if cluster is None:
        raise RuntimeError(
            "SLURM autodiscovery failed: not running on a SLURM cluster "
            "or SLURM commands (sinfo) are not available."
        )

    # Pick a partition that satisfies the requested resources.
    partition = select_partition(
        cluster,
        needs_gpu=needs_gpu,
        min_cpus=cpus_per_task,
        min_mem_gb=mem_gb,
    )
    if partition is None:
        raise RuntimeError(
            "No suitable partition found for requirements: "
            f"needs_gpu={needs_gpu}, min_cpus={cpus_per_task}, min_mem_gb={mem_gb}"
        )

    # An account is mandatory; there is no fallback when none is discovered.
    account = cluster.default_account
    if account is None:
        raise RuntimeError(
            "SLURM autodiscovery failed: no default account found. "
            "Please specify --account explicitly."
        )

    # QOS is deliberately NOT auto-selected from cluster.qos_policies:
    # the caller's value (possibly None) is forwarded as-is.
    return cls(
        account=account,
        partition=partition.name,
        time=time,
        cpus_per_task=cpus_per_task,
        mem_gb=mem_gb,
        qos=qos,
        constraint=constraint,
        job_name_prefix=job_name_prefix,
    )


def normalize_slurm_time(value: str) -> str:
"""Normalize SLURM time strings to accepted formats."""
Expand Down
212 changes: 188 additions & 24 deletions mdfactory/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1187,8 +1187,6 @@ def analysis_run(
if source is None:
raise ValueError("Provide SOURCE as a simulation directory or build summary YAML.")

if slurm and account is None:
raise ValueError("--account is required when using --slurm.")
if analysis_workers is not None and analysis_workers < 1:
raise ValueError("--analysis-workers must be >= 1.")

Expand Down Expand Up @@ -1233,16 +1231,49 @@ def analysis_run(
print(result_df)
return

slurm_cfg = SlurmConfig(
account=account or "",
partition=partition,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)
# Use autodiscovery if account not provided
if account is None:
try:
slurm_cfg = SlurmConfig.from_cluster(
needs_gpu=False,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)
# Override partition if user specified one explicitly
if partition != "cpu": # user changed from default
slurm_cfg = SlurmConfig(
account=slurm_cfg.account,
partition=partition,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)
logger.info(
f"Using autodiscovered SLURM config: account={slurm_cfg.account}, "
f"partition={slurm_cfg.partition}"
)
except RuntimeError as e:
raise ValueError(
f"SLURM autodiscovery failed: {e}\nPlease specify --account explicitly."
) from e
else:
slurm_cfg = SlurmConfig(
account=account,
partition=partition,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)
if log_dir is None:
log_dir = determine_log_dir(sim_paths)
result_df = submit_analyses_slurm(
Expand Down Expand Up @@ -1520,19 +1551,50 @@ def analysis_artifacts_run(
print(summary)
return

# Use autodiscovery if account not provided
if account is None:
raise ValueError("--account is required when using --slurm.")

slurm_cfg = SlurmConfig(
account=account,
partition=partition,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)
try:
slurm_cfg = SlurmConfig.from_cluster(
needs_gpu=False,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)
# Override partition if user specified one explicitly
if partition != "cpu": # user changed from default
slurm_cfg = SlurmConfig(
account=slurm_cfg.account,
partition=partition,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)
logger.info(
f"Using autodiscovered SLURM config: account={slurm_cfg.account}, "
f"partition={slurm_cfg.partition}"
)
except RuntimeError as e:
raise ValueError(
f"SLURM autodiscovery failed: {e}\nPlease specify --account explicitly."
) from e
else:
slurm_cfg = SlurmConfig(
account=account,
partition=partition,
time=time,
cpus_per_task=cpus,
mem_gb=mem_gb,
qos=qos,
constraint=constraint,
job_name_prefix=job_name_prefix,
)

if log_dir is None:
log_dir = determine_log_dir(sim_paths)
result_df = submit_artifacts_slurm(
Expand Down Expand Up @@ -1750,6 +1812,108 @@ def config_edit():
subprocess.run([editor, str(config_path)], check=False)


@config_app.command(name="cluster")
def config_cluster(
    json_output: Annotated[bool, Parameter("--json", help="Output in JSON format.")] = False,
):
    """Print the result of SLURM cluster discovery.

    Calls ``discover_cluster()`` and reports the default account, the
    available accounts, the QOS policies and every partition with its node
    types, either as JSON (``--json``) or as a plain-text summary.

    When discovery returns ``None`` a short explanation (or a JSON error
    object) is printed and the command returns normally.
    """
    import json as json_module

    from mdfactory.performance.cluster import discover_cluster

    info = discover_cluster()

    # --- Nothing discovered: explain and stop -----------------------------
    if info is None and json_output:
        print(json_module.dumps({"error": "SLURM not available", "cluster": None}))
        return
    if info is None:
        print("SLURM cluster not detected.")
        print()
        print("This machine does not appear to be a SLURM cluster node,")
        print("or SLURM commands (sinfo, sacctmgr) are not in PATH.")
        print()
        print("To use SLURM submission, run this command on a cluster login node.")
        return

    # --- Machine-readable output -------------------------------------------
    if json_output:
        payload = {
            "default_account": info.default_account,
            "accounts": info.accounts,
            "qos_policies": info.qos_policies,
            "partitions": [],
        }
        for part in info.partitions:
            node_entries = []
            for node in part.node_types:
                node_entries.append(
                    {
                        "cpus": node.cpus,
                        "memory_mb": node.memory_mb,
                        "gpus": node.gpus,
                        "gpu_type": node.gpu_type,
                        # features is converted to a list before serialization
                        "features": list(node.features),
                    }
                )
            payload["partitions"].append(
                {
                    "name": part.name,
                    "state": part.state,
                    "is_default": part.is_default,
                    "max_time": part.max_time,
                    "default_time": part.default_time,
                    "total_nodes": part.total_nodes,
                    "node_types": node_entries,
                }
            )
        print(json_module.dumps(payload, indent=2))
        return

    # --- Plain-text summary --------------------------------------------------
    print("SLURM Cluster Information")
    print("=" * 50)
    print()

    # Accounts
    print(f"Default Account: {info.default_account or '(none)'}")
    if info.accounts:
        print(f"Available Accounts: {', '.join(info.accounts)}")
    print()

    # QOS policies (section omitted entirely when there are none)
    if info.qos_policies:
        print(f"QOS Policies: {', '.join(info.qos_policies)}")
        print()

    # Partitions
    print("Partitions:")
    print("-" * 50)
    for part in info.partitions:
        default_tag = ""
        if part.is_default:
            default_tag = " (default)"
        # The state is only shown when it is something other than "up".
        state_tag = ""
        if part.state != "up":
            state_tag = f" [{part.state}]"

        print(f"\n {part.name}{default_tag}{state_tag}")
        print(f" Nodes: {part.total_nodes}")
        print(f" Max Time: {part.max_time}")
        if part.default_time != part.max_time:
            print(f" Default Time: {part.default_time}")

        # One line per node type
        for node in part.node_types:
            gpu_part = ""
            if node.gpus > 0:
                gpu_label = node.gpu_type or "GPU"
                gpu_part = f", {node.gpus}x {gpu_label}"
            gigabytes = node.memory_mb // 1024  # whole GB, remainder dropped
            feature_part = ""
            if node.features:
                feature_part = f" [{', '.join(node.features)}]"
            print(f" - {node.cpus} CPUs, {gigabytes} GB{gpu_part}{feature_part}")


def main():
    """Invoke ``app`` with no arguments."""
    app()

Expand Down
Loading