Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 37 additions & 16 deletions examples/local/local-container-mnist.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,13 @@
"source": [
"def train_mnist():\n",
" import os\n",
" import shutil\n",
" import time\n",
" import torch\n",
" import torch.nn.functional as F\n",
" from torch import nn, optim\n",
" import torch.distributed as dist\n",
" from pathlib import Path\n",
" from torch.utils.data import DataLoader, DistributedSampler\n",
" from torchvision import datasets, transforms\n",
"\n",
Expand Down Expand Up @@ -108,24 +111,42 @@
" model = nn.parallel.DistributedDataParallel(model)\n",
" optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n",
"\n",
" transform = transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.1307,), (0.3081,)),\n",
" ])\n",
"\n",
" def load_fashion_mnist_with_retry(train: bool, download: bool):\n",
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we see that FashionMNIST is constantly failing and we require retries?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we see that FashionMNIST is constantly failing and we require retries?

not constantly, this was seen in the failing 1.35.0 CI run artifacts

and I kept the retry only in local-container-mnist as a targeted stabilization for that failure mode

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But we use FashionMNIST dataset in other examples too, do we need to introduce retries to all of them?

Copy link
Copy Markdown
Author

@neeraj542 neeraj542 Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But we use FashionMNIST dataset in other examples too, do we need to introduce retries to all of them?

I agree — I have not found strong evidence of repeated FashionMNIST failures yet, so I’ll remove the retries here

" retries = 3\n",
" delay_seconds = 5\n",
" last_error = None\n",
"\n",
" for attempt in range(1, retries + 1):\n",
" try:\n",
" return datasets.FashionMNIST(\n",
" \"./data\",\n",
" train=train,\n",
" download=download,\n",
" transform=transform,\n",
" )\n",
" except RuntimeError as e:\n",
" last_error = e\n",
" if \"File not found or corrupted\" in str(e):\n",
" # Cleanup potentially partial downloads before retrying.\n",
" raw_dir = Path(\"./data\") / \"FashionMNIST\" / \"raw\"\n",
" if raw_dir.exists():\n",
" shutil.rmtree(raw_dir, ignore_errors=True)\n",
" if attempt < retries:\n",
" print(f\"FashionMNIST download failed (attempt {attempt}/{retries}): {e}\")\n",
" time.sleep(delay_seconds)\n",
" else:\n",
" raise last_error\n",
"\n",
" # Download FashionMNIST dataset only on local_rank=0 process.\n",
" if local_rank == 0:\n",
" dataset = datasets.FashionMNIST(\n",
" \"./data\",\n",
" train=True,\n",
" download=True,\n",
" transform=transforms.Compose([\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.1307,), (0.3081,)),\n",
" ]),\n",
" )\n",
" load_fashion_mnist_with_retry(train=True, download=True)\n",
" dist.barrier()\n",
" dataset = datasets.FashionMNIST(\n",
" \"./data\",\n",
" train=True,\n",
" download=False,\n",
" transform=transforms.Compose([transforms.ToTensor()]),\n",
" )\n",
" dataset = load_fashion_mnist_with_retry(train=True, download=False)\n",
"\n",
"    # Shard the dataset across workers.\n",
" train_loader = DataLoader(\n",
Expand Down Expand Up @@ -348,7 +369,7 @@
"metadata": {},
"outputs": [],
"source": [
"client.wait_for_job_status(name=job_name, timeout=20)"
"client.wait_for_job_status(name=job_name, timeout=300)"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion examples/local/local-training-mnist.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@
"metadata": {},
"outputs": [],
"source": [
"client.wait_for_job_status(name=job_name, timeout=20)"
"client.wait_for_job_status(name=job_name, timeout=300)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@
"metadata": {},
"outputs": [],
"source": [
"client.wait_for_job_status(name=job_name, timeout=20)"
"client.wait_for_job_status(name=job_name, timeout=300)"
]
}
],
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/image-classification/mnist.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,7 @@
"metadata": {},
"outputs": [],
"source": [
"client.wait_for_job_status(name=job_name, timeout=20)"
"client.wait_for_job_status(name=job_name, timeout=300)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@
"metadata": {},
"outputs": [],
"source": [
"TrainerClient().wait_for_job_status(name=job_id, timeout=20)"
"TrainerClient().wait_for_job_status(name=job_id, timeout=300)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@
"metadata": {},
"outputs": [],
"source": [
"client.wait_for_job_status(name=k8s_job_name, timeout=20)"
"client.wait_for_job_status(name=k8s_job_name, timeout=300)"
]
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@
"metadata": {},
"outputs": [],
"source": [
"client.wait_for_job_status(name=job_name, timeout=20)"
"client.wait_for_job_status(name=job_name, timeout=300)"
]
},
{
Expand Down
7 changes: 4 additions & 3 deletions hack/e2e-run-notebook.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,10 @@ print_results() {
# Only run kubectl commands if we're testing Kubernetes notebooks
if command -v kubectl &> /dev/null && kubectl cluster-info &> /dev/null; then
# Always show TrainJob status
kubectl describe trainjob
kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
kubectl wait trainjob --for=condition=Complete --all --timeout 3s
kubectl describe trainjob || true
kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer || exit 1
# CI clusters can be a bit slower; give TrainJobs more time to complete.
kubectl wait trainjob --for=condition=Complete --all --timeout 60s || exit 1

# Only check pod logs if pods exist (not for local backends)
if kubectl get pods -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node --no-headers 2>/dev/null | grep -q .; then
Expand Down
Loading