From b2ee72e78dc554d6a9cad8e975b199e719f6ec44 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 15 May 2025 18:11:36 +0200 Subject: [PATCH 1/2] [Examples] Renamed some example groups for better extensibility - [x] Renamed `Distributed training` to `Clusters` - [x] Renamed `Deployment` to `Inference` - [x] Renamed `Monitoring` to `Metrics`. - [x] Added redirects --- docs/assets/stylesheets/extra.css | 3 +- docs/blog/posts/amd-on-runpod.md | 2 +- ...d-kubernetes-2024-recap-and-whats-ahead.md | 2 +- docs/blog/posts/dstack-metrics.md | 2 +- docs/blog/posts/metrics-ui.md | 4 +- docs/blog/posts/mpi.md | 2 +- docs/blog/posts/prometheus.md | 6 +-- docs/blog/posts/tpu-on-gcp.md | 4 +- docs/changelog/index.md | 1 - docs/docs/concepts/gateways.md | 2 +- docs/docs/concepts/services.md | 6 +-- .../nim/index.md => docs/guides/clusters.md} | 0 .../docs/guides/{monitoring.md => metrics.md} | 0 docs/examples.md | 22 ++++----- .../sglang => clusters/a3high}/index.md | 0 .../tgi => clusters/a3mega}/index.md | 0 .../trtllm => clusters/nccl-tests}/index.md | 0 .../vllm => clusters/rccl-tests}/index.md | 0 .../nim}/index.md | 0 .../sglang}/index.md | 0 .../nccl-tests => inference/tgi}/index.md | 0 .../rccl-tests => inference/trtllm}/index.md | 0 docs/examples/inference/vllm/index.md | 0 docs/overrides/main.html | 4 +- examples/accelerators/amd/README.md | 12 ++--- examples/accelerators/intel/README.md | 6 +-- examples/accelerators/tpu/README.md | 8 ++-- .../a3high}/README.md | 2 +- .../a3high}/fleet.dstack.yml | 0 .../a3high}/nccl-tests.dstack.yml | 0 .../a3mega}/README.md | 2 +- .../a3mega}/fleet.dstack.yml | 0 .../a3mega}/nccl-tests.dstack.yml | 0 .../nccl-tests/.dstack.yml | 0 .../nccl-tests/README.md | 0 .../rccl-tests/.dstack.yml | 0 .../rccl-tests/README.md | 0 examples/deployment/infinity/.dstack.yml | 13 ------ examples/deployment/infinity/README.md | 11 ----- examples/deployment/lorax/.dstack.yml | 14 ------ examples/deployment/lorax/README.md | 14 ------ 
examples/deployment/ollama/.dstack.yml | 19 -------- examples/deployment/ollama/README.md | 11 ----- .../{deployment => inference}/nim/.dstack.yml | 0 .../{deployment => inference}/nim/README.md | 6 +-- .../sglang/README.md | 4 +- .../{deployment => inference}/tgi/.dstack.yml | 0 .../{deployment => inference}/tgi/README.md | 8 ++-- .../tgi/amd/.dstack.yml | 0 .../tgi/tpu/.dstack.yml | 0 .../trtllm/README.md | 20 ++++---- .../trtllm/build-image.dstack.yml | 0 .../trtllm/build-model.dstack.yml | 0 .../trtllm/convert-model.dstack.yml | 0 .../trtllm/serve-distill.dstack.yml | 0 .../trtllm/serve-r1.dstack.yml | 2 +- .../vllm/.dstack.yml | 0 .../{deployment => inference}/vllm/README.md | 10 ++-- .../vllm/amd/.dstack.yml | 0 .../vllm/amd/build-vllm.dstack.yml | 0 .../vllm/tpu/.dstack.yml | 0 mkdocs.yml | 46 +++++++++++-------- scripts/docs/gen_cli_reference.py | 5 +- scripts/docs/gen_schema_reference.py | 4 +- 64 files changed, 105 insertions(+), 172 deletions(-) delete mode 100644 docs/changelog/index.md rename docs/{examples/deployment/nim/index.md => docs/guides/clusters.md} (100%) rename docs/docs/guides/{monitoring.md => metrics.md} (100%) rename docs/examples/{deployment/sglang => clusters/a3high}/index.md (100%) rename docs/examples/{deployment/tgi => clusters/a3mega}/index.md (100%) rename docs/examples/{deployment/trtllm => clusters/nccl-tests}/index.md (100%) rename docs/examples/{deployment/vllm => clusters/rccl-tests}/index.md (100%) rename docs/examples/{distributed-training/a3high-clusters => inference/nim}/index.md (100%) rename docs/examples/{distributed-training/a3mega-clusters => inference/sglang}/index.md (100%) rename docs/examples/{distributed-training/nccl-tests => inference/tgi}/index.md (100%) rename docs/examples/{distributed-training/rccl-tests => inference/trtllm}/index.md (100%) create mode 100644 docs/examples/inference/vllm/index.md rename examples/{distributed-training/a3high-clusters => clusters/a3high}/README.md (99%) rename 
examples/{distributed-training/a3high-clusters => clusters/a3high}/fleet.dstack.yml (100%) rename examples/{distributed-training/a3high-clusters => clusters/a3high}/nccl-tests.dstack.yml (100%) rename examples/{distributed-training/a3mega-clusters => clusters/a3mega}/README.md (99%) rename examples/{distributed-training/a3mega-clusters => clusters/a3mega}/fleet.dstack.yml (100%) rename examples/{distributed-training/a3mega-clusters => clusters/a3mega}/nccl-tests.dstack.yml (100%) rename examples/{distributed-training => clusters}/nccl-tests/.dstack.yml (100%) rename examples/{distributed-training => clusters}/nccl-tests/README.md (100%) rename examples/{distributed-training => clusters}/rccl-tests/.dstack.yml (100%) rename examples/{distributed-training => clusters}/rccl-tests/README.md (100%) delete mode 100644 examples/deployment/infinity/.dstack.yml delete mode 100644 examples/deployment/infinity/README.md delete mode 100644 examples/deployment/lorax/.dstack.yml delete mode 100644 examples/deployment/lorax/README.md delete mode 100644 examples/deployment/ollama/.dstack.yml delete mode 100644 examples/deployment/ollama/README.md rename examples/{deployment => inference}/nim/.dstack.yml (100%) rename examples/{deployment => inference}/nim/README.md (91%) rename examples/{deployment => inference}/sglang/README.md (96%) rename examples/{deployment => inference}/tgi/.dstack.yml (100%) rename examples/{deployment => inference}/tgi/README.md (88%) rename examples/{deployment => inference}/tgi/amd/.dstack.yml (100%) rename examples/{deployment => inference}/tgi/tpu/.dstack.yml (100%) rename examples/{deployment => inference}/trtllm/README.md (94%) rename examples/{deployment => inference}/trtllm/build-image.dstack.yml (100%) rename examples/{deployment => inference}/trtllm/build-model.dstack.yml (100%) rename examples/{deployment => inference}/trtllm/convert-model.dstack.yml (100%) rename examples/{deployment => inference}/trtllm/serve-distill.dstack.yml (100%) rename 
examples/{deployment => inference}/trtllm/serve-r1.dstack.yml (90%) rename examples/{deployment => inference}/vllm/.dstack.yml (100%) rename examples/{deployment => inference}/vllm/README.md (88%) rename examples/{deployment => inference}/vllm/amd/.dstack.yml (100%) rename examples/{deployment => inference}/vllm/amd/build-vllm.dstack.yml (100%) rename examples/{deployment => inference}/vllm/tpu/.dstack.yml (100%) diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index 4d97c82325..4952a41c58 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -790,6 +790,7 @@ body { display: flex; } + /* Comment to switch to sections in sidebar */ .md-sidebar__inner > .md-nav--primary > .md-nav__list:not(.md-post__meta) > .md-nav__item > .md-nav > .md-nav__list > .md-nav__item:not(.md-nav__item--section) { display: none; } @@ -861,7 +862,7 @@ body { } .md-sidebar--primary .md-nav__link, .md-sidebar--post .md-nav__link { - padding: 5px 15px 5px; + padding: 5px 15px 4px; margin-top: 0; } diff --git a/docs/blog/posts/amd-on-runpod.md b/docs/blog/posts/amd-on-runpod.md index aab962c9da..eb4668e506 100644 --- a/docs/blog/posts/amd-on-runpod.md +++ b/docs/blog/posts/amd-on-runpod.md @@ -42,7 +42,7 @@ you can now specify an AMD GPU under `resources`. Below are a few examples. Here's an example of a [service](../../docs/concepts/services.md) that deploys Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}. -
+
```yaml type: service diff --git a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md index be7868e285..79e026a0b4 100644 --- a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md +++ b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md @@ -110,7 +110,7 @@ efficient manner. ### NVIDIA -NVIDIA remains the top accelerator supported by `dstack`. Recently, we introduced a [NIM example](../../examples/deployment/nim/index.md) +NVIDIA remains the top accelerator supported by `dstack`. Recently, we introduced a [NIM example](../../examples/inference/nim/index.md) for model deployment, and we continue to enhance support for the rest of NVIDIA's ecosystem. ### AMD diff --git a/docs/blog/posts/dstack-metrics.md b/docs/blog/posts/dstack-metrics.md index 459ef6d3e5..07d80ab0ab 100644 --- a/docs/blog/posts/dstack-metrics.md +++ b/docs/blog/posts/dstack-metrics.md @@ -7,7 +7,7 @@ image: https://dstack.ai/static-assets/static-assets/images/dstack-stats-v2.png categories: - AMD - NVIDIA - - Monitoring + - Metrics --- # Monitoring essential GPU metrics via CLI diff --git a/docs/blog/posts/metrics-ui.md b/docs/blog/posts/metrics-ui.md index ba0cfde6b6..032115e5a6 100644 --- a/docs/blog/posts/metrics-ui.md +++ b/docs/blog/posts/metrics-ui.md @@ -5,7 +5,7 @@ description: "TBA" slug: metrics-ui image: https://dstack.ai/static-assets/static-assets/images/dstack-metrics-ui-v3-min.png categories: - - Monitoring + - Metrics - AMD - NVIDIA --- @@ -55,6 +55,6 @@ For persistent storage and long-term access to metrics, we still recommend setti metrics from `dstack`. !!! info "What's next?" - 1. See [Monitoring](../../docs/guides/monitoring.md) + 1. See [Metrics](../../docs/guides/metrics.md) 2. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) 3. 
Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} diff --git a/docs/blog/posts/mpi.md b/docs/blog/posts/mpi.md index 4516c4297e..70b3ed1650 100644 --- a/docs/blog/posts/mpi.md +++ b/docs/blog/posts/mpi.md @@ -101,5 +101,5 @@ as well as use MPI for other tasks. !!! info "What's next?" 1. Learn more about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 2. Check the [NCCL tests](../../examples/distributed-training/nccl-tests/index.md) example + 2. Check the [NCCL tests](../../examples/clusters/nccl-tests/index.md) example 2. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} diff --git a/docs/blog/posts/prometheus.md b/docs/blog/posts/prometheus.md index 23e644bcfd..fbaee63c1e 100644 --- a/docs/blog/posts/prometheus.md +++ b/docs/blog/posts/prometheus.md @@ -5,7 +5,7 @@ description: "TBA" slug: prometheus image: https://dstack.ai/static-assets/static-assets/images/dstack-prometheus-v3.png categories: - - Monitoring + - Metrics - NVIDIA --- @@ -46,7 +46,7 @@ Overall, `dstack` collects three groups of metrics: | **Runs** | Run metrics include run counters for each user in each project. | | **Jobs** | A run consists of one or more jobs, each mapped to a container. Job metrics offer insights into execution time, cost, GPU model, NVIDIA DCGM telemetry, and more. | -For a full list of available metrics and labels, check out [Monitoring](../../docs/guides/monitoring.md). +For a full list of available metrics and labels, check out [Metrics](../../docs/guides/metrics.md). ??? 
info "NVIDIA" NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends, @@ -60,7 +60,7 @@ For a full list of available metrics and labels, check out [Monitoring](../../do only accessible through the UI and the [`dstack metrics`](dstack-metrics.md) CLI. !!! info "What's next?" - 1. See [Metrics](../../docs/guides/monitoring.md) + 1. See [Metrics](../../docs/guides/metrics.md) 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) diff --git a/docs/blog/posts/tpu-on-gcp.md b/docs/blog/posts/tpu-on-gcp.md index 8cdf1a051a..24534c7e76 100644 --- a/docs/blog/posts/tpu-on-gcp.md +++ b/docs/blog/posts/tpu-on-gcp.md @@ -50,7 +50,7 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm- === "Optimum TPU" -
+
```yaml type: service @@ -83,7 +83,7 @@ and [vLLM :material-arrow-top-right-thin:{ .external }](https://github.com/vllm- the official Docker image can be used instead of `dstackai/optimum-tpu:llama31`. === "vLLM" -
+
```yaml type: service diff --git a/docs/changelog/index.md b/docs/changelog/index.md deleted file mode 100644 index 05761ac57f..0000000000 --- a/docs/changelog/index.md +++ /dev/null @@ -1 +0,0 @@ -# Blog diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 815eca4ae1..500d0d86cf 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -71,7 +71,7 @@ To delete a gateway, pass the gateway configuration to [`dstack delete`](../refe
```shell -$ dstack delete -f examples/deployment/gateway.dstack.yml +$ dstack delete -f examples/inference/gateway.dstack.yml ```
diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 1f7b515b84..70e0b2d5bd 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -558,6 +558,6 @@ If one replica of a multi-replica service fails with retry enabled, 1. Read about [dev environments](dev-environments.md), [tasks](tasks.md), and [repos](repos.md) 2. Learn how to manage [fleets](fleets.md) 3. See how to set up [gateways](gateways.md) - 4. Check the [TGI :material-arrow-top-right-thin:{ .external }](../../examples/deployment/tgi/index.md){:target="_blank"}, - [vLLM :material-arrow-top-right-thin:{ .external }](../../examples/deployment/vllm/index.md){:target="_blank"}, and - [NIM :material-arrow-top-right-thin:{ .external }](../../examples/deployment/nim/index.md){:target="_blank"} examples + 4. Check the [TGI :material-arrow-top-right-thin:{ .external }](../../examples/inference/tgi/index.md){:target="_blank"}, + [vLLM :material-arrow-top-right-thin:{ .external }](../../examples/inference/vllm/index.md){:target="_blank"}, and + [NIM :material-arrow-top-right-thin:{ .external }](../../examples/inference/nim/index.md){:target="_blank"} examples diff --git a/docs/examples/deployment/nim/index.md b/docs/docs/guides/clusters.md similarity index 100% rename from docs/examples/deployment/nim/index.md rename to docs/docs/guides/clusters.md diff --git a/docs/docs/guides/monitoring.md b/docs/docs/guides/metrics.md similarity index 100% rename from docs/docs/guides/monitoring.md rename to docs/docs/guides/metrics.md diff --git a/docs/examples.md b/docs/examples.md index 24881a7c09..128640b1ef 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -38,10 +38,10 @@ hide:
-## Distributed training +## Clusters -## Deployment +## Inference
-

SGLang @@ -95,7 +95,7 @@ hide: Deploy DeepSeek distilled models with SGLang

-

vLLM @@ -104,7 +104,7 @@ hide: Deploy Llama 3.1 with vLLM

-

TGI @@ -113,7 +113,7 @@ hide: Deploy Llama 4 with TGI

-

NIM @@ -122,7 +122,7 @@ hide: Deploy a DeepSeek distilled model with NIM

-

TensorRT-LLM diff --git a/docs/examples/deployment/sglang/index.md b/docs/examples/clusters/a3high/index.md similarity index 100% rename from docs/examples/deployment/sglang/index.md rename to docs/examples/clusters/a3high/index.md diff --git a/docs/examples/deployment/tgi/index.md b/docs/examples/clusters/a3mega/index.md similarity index 100% rename from docs/examples/deployment/tgi/index.md rename to docs/examples/clusters/a3mega/index.md diff --git a/docs/examples/deployment/trtllm/index.md b/docs/examples/clusters/nccl-tests/index.md similarity index 100% rename from docs/examples/deployment/trtllm/index.md rename to docs/examples/clusters/nccl-tests/index.md diff --git a/docs/examples/deployment/vllm/index.md b/docs/examples/clusters/rccl-tests/index.md similarity index 100% rename from docs/examples/deployment/vllm/index.md rename to docs/examples/clusters/rccl-tests/index.md diff --git a/docs/examples/distributed-training/a3high-clusters/index.md b/docs/examples/inference/nim/index.md similarity index 100% rename from docs/examples/distributed-training/a3high-clusters/index.md rename to docs/examples/inference/nim/index.md diff --git a/docs/examples/distributed-training/a3mega-clusters/index.md b/docs/examples/inference/sglang/index.md similarity index 100% rename from docs/examples/distributed-training/a3mega-clusters/index.md rename to docs/examples/inference/sglang/index.md diff --git a/docs/examples/distributed-training/nccl-tests/index.md b/docs/examples/inference/tgi/index.md similarity index 100% rename from docs/examples/distributed-training/nccl-tests/index.md rename to docs/examples/inference/tgi/index.md diff --git a/docs/examples/distributed-training/rccl-tests/index.md b/docs/examples/inference/trtllm/index.md similarity index 100% rename from docs/examples/distributed-training/rccl-tests/index.md rename to docs/examples/inference/trtllm/index.md diff --git a/docs/examples/inference/vllm/index.md b/docs/examples/inference/vllm/index.md new file 
mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 3ab9f7deff..51adf057be 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -117,9 +117,9 @@