Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
3f48d73
Implement --auto-parallelism
pefontana Jan 23, 2026
653d0eb
parallelism_data.json
pefontana Jan 23, 2026
01a6714
Merge branch 'main' into hardcode-parallelism-data
pefontana Jan 23, 2026
b541cc5
simplify code
pefontana Jan 23, 2026
4a6e1d2
Merge remote-tracking branch 'origin/hardcode-parallelism-data' into …
pefontana Jan 23, 2026
3af705e
clippy
pefontana Jan 23, 2026
9303f99
Merge branch 'main' into hardcode-parallelism-data
pefontana Jan 26, 2026
27f4ed5
add parallelism_data.json to Garnix
pefontana Jan 26, 2026
a8d5330
add hardware type to json
pefontana Jan 26, 2026
f265de9
update .json
pefontana Jan 26, 2026
1d3aafd
Fallback: read from /proc/driver/nvidia
pefontana Jan 26, 2026
50762e5
restore scripts/train-solana-test.sh
pefontana Jan 26, 2026
f6c1c7a
update documentation
pefontana Jan 26, 2026
42fd012
Change micro_batch_size for Meta-Llama-3.1 to 1
pefontana Jan 26, 2026
1695b93
nit
pefontana Jan 27, 2026
5175723
look data-parallelism.json in HF repo
pefontana Jan 27, 2026
8f6d14e
change json format
pefontana Jan 28, 2026
6644c67
nvml_wrapper
pefontana Jan 28, 2026
ec28547
Merge branch 'main' into hardcode-parallelism-data
pefontana Jan 28, 2026
fdf4f0d
use tch for GPU count (respects CUDA_VISIBLE_DEVICES)
pefontana Jan 29, 2026
1a454c5
Merge branch 'main' into hardcode-parallelism-data
pefontana Jan 30, 2026
6114eba
Merge branch 'main' into hardcode-parallelism-data
pefontana Feb 3, 2026
22f7b26
Merge branch 'main' into hardcode-parallelism-data
pefontana Feb 10, 2026
8a4dabd
Merge branch 'main' into hardcode-parallelism-data
pefontana Feb 19, 2026
6ac2e70
Merge branch 'main' into hardcode-parallelism-data
pefontana Feb 20, 2026
4dc974b
Merge branch 'main' into hardcode-parallelism-data
pefontana Feb 23, 2026
6f4ae53
Merge branch 'main' into hardcode-parallelism-data
pefontana Feb 27, 2026
e1f7276
Merge branch 'main' into hardcode-parallelism-data
pefontana Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions architectures/centralized/client/src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ pub async fn build_app(
.await?;

let state_options: RunInitConfig<ClientId, ClientId> = RunInitConfig {
parallelism_auto: p.parallelism_auto,
data_parallelism: p.data_parallelism,
tensor_parallelism: p.tensor_parallelism,
micro_batch_size: p.micro_batch_size,
Expand Down
1 change: 1 addition & 0 deletions architectures/decentralized/solana-client/src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ pub async fn build_app(

let state_options: RunInitConfig<psyche_solana_coordinator::ClientId, NetworkIdentity> =
RunInitConfig {
parallelism_auto: p.parallelism_auto,
data_parallelism: p.data_parallelism,
tensor_parallelism: p.tensor_parallelism,
micro_batch_size: p.micro_batch_size,
Expand Down
3 changes: 2 additions & 1 deletion nix/lib.nix
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ let
|| (builtins.match ".*tests/fixtures/.*$" path != null)
|| (builtins.match ".*.config/.*$" path != null)
|| (builtins.match ".*local-dev-keypair.json$" path != null)
|| (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null);
|| (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null)
|| (builtins.match ".*shared/client/src/parallelism_data\\.json$" path != null);

src = lib.cleanSourceWith {
src = ../.;
Expand Down
28 changes: 27 additions & 1 deletion psyche-book/src/enduser/create-run.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,37 @@ run-manager create-run \

At this point, your run has been successfully created.

### Adding parallelism configuration (required for --parallelism-auto)

If you want clients to use `PARALLELISM_AUTO=true` for automatic configuration, you must add a `parallelism_data.json` file to your model's HuggingFace repository.

```json
{
"H100": {
"1": { "dp": 1, "tp": 1, "micro_batch_size": 4 },
"8": { "dp": 4, "tp": 2, "micro_batch_size": 4 }
},
"H200": {
"8": { "dp": 8, "tp": 1, "micro_batch_size": 8 }
}
}
```

Format: `gpu_type` → `num_gpus` → config

- **gpu_type**: GPU model name (e.g., "H100", "H200")
- **num_gpus**: Number of GPUs available (e.g., "1", "8")
- **dp**: Data parallelism
- **tp**: Tensor parallelism
- **micro_batch_size**: Micro batch size per GPU

The config is shared via P2P when clients join a run.

### Initializing configuration

Initially, the run will not have any configuration defined and will remain paused, so no clients can join yet.

To set the run configuration, youll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md).
To set the run configuration, you'll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md).

```bash
run-manager update-config \
Expand Down
10 changes: 10 additions & 0 deletions psyche-book/src/enduser/join-run.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,19 +117,29 @@ though you might need to.

**`NVIDIA_DRIVER_CAPABILITIES`** - An environment variable that the NVIDIA Container Toolkit uses to determine which compute capabilities should be provided to your container. It is recommended to set it to 'all', e.g. `NVIDIA_DRIVER_CAPABILITIES=all`.

**`PARALLELISM_AUTO`** - Set to `true` to automatically detect optimal parallelism settings based on the model and your GPU hardware.

- When enabled, the client will look up the best `DATA_PARALLELISM`, `TENSOR_PARALLELISM`, and `MICRO_BATCH_SIZE` values from a `parallelism_data.json` configuration table stored in the model's HuggingFace repository
- The model's repository must contain this file, and your GPU hardware and GPU count combination must be present in the table
- This is the recommended option for most users
- If set, manual parallelism settings below will be ignored

**`DATA_PARALLELISM`** - Number of GPUs to distribute training data across.

- If you have multiple GPUs, you can set this to 2, 4, etc. to speed up training
- If you have 1 GPU, set this to `1`
- Ignored if `PARALLELISM_AUTO=true`

**`TENSOR_PARALLELISM`** - Number of GPUs to distribute the model across, this lets you train a model you can't fit on one single GPU.

- If you have 1 GPU, set this to `1`
- If your have `n` GPUs you can distribute the model across all of them by setting it to `n`.
- Ignored if `PARALLELISM_AUTO=true`

**`MICRO_BATCH_SIZE`** - Number of samples processed per GPU per training step

- Set as high as your GPU memory allows
- Ignored if `PARALLELISM_AUTO=true`

**`AUTHORIZER`** - The Solana address that authorized your wallet to join this run

Expand Down
1 change: 1 addition & 0 deletions shared/client/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ clap.workspace = true
sysinfo = "0.32.0"
iroh.workspace = true
iroh-blobs.workspace = true
nvml-wrapper = "0.11.0"

[features]
parallelism = ["psyche-modeling/parallelism"]
Expand Down
4 changes: 4 additions & 0 deletions shared/client/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ pub struct TrainArgs {
#[clap(long, env, value_parser = parse_trim_quotes)]
pub run_id: String,

/// Auto-detect parallelism settings from lookup table based on model and GPU count
#[clap(long, env)]
pub parallelism_auto: bool,

#[clap(long, default_value_t = 1, env)]
pub data_parallelism: usize,

Expand Down
1 change: 1 addition & 0 deletions shared/client/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod cli;
mod client;
mod fetch_data;
pub mod parallelism_lookup;
mod protocol;
mod state;
mod tui;
Expand Down
89 changes: 89 additions & 0 deletions shared/client/src/parallelism_lookup.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
use anyhow::Result;
use hf_hub::{Repo, RepoType};
use nvml_wrapper::Nvml;
use serde::Deserialize;
use std::collections::HashMap;
use tracing::info;

/// Name of the lookup-table file expected at the root of the model's
/// HuggingFace repository.
const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json";

/// One parallelism configuration entry for a (gpu_type, num_gpus) pair.
#[derive(Debug, Clone, Copy, Deserialize)]
pub struct ParallelismConfig {
    /// Data parallelism degree.
    pub dp: usize,
    /// Tensor parallelism degree.
    pub tp: usize,
    /// Samples processed per GPU per training step.
    pub micro_batch_size: usize,
}

// Table format: gpu_type -> num_gpus -> config
// The inner key is a string (e.g. "8") because JSON object keys are strings.
type Table = HashMap<String, HashMap<String, ParallelismConfig>>;

/// Get the GPU product name of the first *visible* GPU via NVML.
///
/// NVML enumerates all physical GPUs and does not honor
/// `CUDA_VISIBLE_DEVICES`, while the GPU count used by the caller comes
/// from tch, which does honor it. To keep the two consistent, when
/// `CUDA_VISIBLE_DEVICES` starts with a plain integer index, that index
/// is used to select the NVML device; otherwise (unset, or UUID-form
/// entries) we fall back to physical index 0.
///
/// # Errors
/// Returns an error if NVML cannot be initialized, the device index is
/// out of range, or the device name cannot be read.
fn get_gpu_type_from_nvml() -> Result<String> {
    let nvml = Nvml::init()?;
    // `CUDA_VISIBLE_DEVICES` may be a comma-separated list of indices or
    // GPU UUIDs; only the integer-index form is mapped here.
    let index = std::env::var("CUDA_VISIBLE_DEVICES")
        .ok()
        .and_then(|v| {
            v.split(',')
                .next()
                .and_then(|s| s.trim().parse::<u32>().ok())
        })
        .unwrap_or(0);
    let device = nvml.device_by_index(index)?;
    Ok(device.name()?)
}

/// Collapse a raw NVML product name (e.g. "NVIDIA H100 80GB HBM3") onto
/// the short family key used in the lookup table ("H100", "H200").
/// Unknown hardware is passed through unchanged.
fn normalize_gpu_name(raw_name: &str) -> String {
    let needle = raw_name.to_uppercase();
    // Known family markers, matched case-insensitively as substrings.
    for family in ["H200", "H100"] {
        if needle.contains(family) {
            return family.to_string();
        }
    }
    raw_name.to_string()
}

/// Try to fetch `parallelism_data.json` from the model's HuggingFace repo.
///
/// Best-effort: any failure along the way (API construction, download,
/// local read) yields `None` rather than an error. An `HF_TOKEN`
/// environment variable, when present, is used for authentication.
fn load_json_from_model_repo(model_repo_id: &str) -> Option<String> {
    let repo_handle = hf_hub::api::sync::ApiBuilder::new()
        .with_token(std::env::var("HF_TOKEN").ok())
        .build()
        .ok()?
        .repo(Repo::new(model_repo_id.to_string(), RepoType::Model));

    let local_path = repo_handle.get(REMOTE_CONFIG_FILENAME).ok()?;
    std::fs::read_to_string(local_path).ok()
}

/// Find the config entry for a GPU type and GPU count in a parsed table.
/// Returns `None` when either level of the lookup is missing.
fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option<ParallelismConfig> {
    // The inner map is keyed by the stringified GPU count (JSON keys are strings).
    let by_count = table.get(gpu_type)?;
    by_count.get(num_gpus.to_string().as_str()).copied()
}

/// Lookup parallelism config from the model's HuggingFace repo.
///
/// Detects the local GPU count (via tch, so `CUDA_VISIBLE_DEVICES` is
/// respected) and the GPU product name (via NVML), downloads
/// `parallelism_data.json` from `model_repo_id`, and returns the matching
/// entry.
///
/// # Errors
/// Fails if no GPU is visible, NVML detection fails, the repo has no
/// `parallelism_data.json`, the JSON does not parse, or the table has no
/// entry for this (gpu_type, gpu_count) combination.
pub fn lookup(model_repo_id: &str) -> Result<ParallelismConfig> {
    let device_count = tch::Cuda::device_count() as usize;
    if device_count == 0 {
        anyhow::bail!("No GPUs found!");
    }

    // Use NVML for GPU type detection
    // NOTE(review): verify that NVML device selection agrees with the
    // CUDA-visible device set used for the count above on machines with
    // mixed GPU models.
    let gpu_type = normalize_gpu_name(&get_gpu_type_from_nvml()?);
    info!("Detected {} x {} GPU(s)", device_count, gpu_type);

    // The table must be published alongside the model; there is no
    // built-in fallback here.
    let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| {
        anyhow::anyhow!(
            "No parallelism_data.json found in model repo '{}'. \
            Add this file to use --parallelism-auto",
            model_repo_id
        )
    })?;

    let table: Table = serde_json::from_str(&raw_json)
        .map_err(|e| anyhow::anyhow!("Failed to parse parallelism_data.json: {}", e))?;

    info!(
        "Using parallelism config from model repo '{}'",
        model_repo_id
    );

    lookup_in_table(&table, &gpu_type, device_count)
        .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", device_count, gpu_type))
}
Loading
Loading