From e86369b1609f2da0e68b28d8abfd1ba2836eac65 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Sun, 7 Jun 2026 20:27:44 +0000
Subject: [PATCH] fix: select integrated GPUs and allow PARAKEET_DEVICE to name
 a device

Backend device selection only accepted GGML_BACKEND_DEVICE_TYPE_GPU, so
integrated GPUs (Ryzen APUs and similar, reported as
GGML_BACKEND_DEVICE_TYPE_IGPU) were skipped and the engine fell back to
CPU on those machines.

The auto-pick now matches both discrete and integrated GPU devices.
PARAKEET_DEVICE also gains a third form: besides "cpu" (force CPU) and
being unset (auto-pick the first GPU/IGPU), it can now name a specific
registry device such as "CUDA0" or "Vulkan1" (case-insensitive). An
unknown name logs and falls back to CPU instead of failing. use_sched is
now derived from the chosen device type so any non-CPU device still
offloads unsupported ops to CPU.

Adds a regression test covering the env-var fallback paths (cpu, unknown
name, case-insensitive CPU), which run on a CPU-only build, and documents
the new behavior in the README.

Closes #17

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                     |  2 +-
 src/backend.cpp               | 61 +++++++++++++++++++++++++++--------
 tests/CMakeLists.txt          |  1 +
 tests/test_backend_device.cpp | 53 ++++++++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 15 deletions(-)
 create mode 100644 tests/test_backend_device.cpp
diff --git a/README.md b/README.md
index c21bf27..b0d17fc 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,7 @@ To build for a GPU backend, forward its flag, e.g. Apple Metal:
 cmake -B build -DPARAKEET_GGML_METAL=ON && cmake --build build -j
 ```
 
-The CLI auto-selects the first GPU device the ggml registry reports, so no runtime flag is needed (set `PARAKEET_DEVICE=cpu` to force CPU). Ops the chosen backend has no kernel for run on the CPU automatically, so a model always runs even when one op lacks a GPU kernel. On an Apple M4, Metal is up to about 5x faster than CPU on the larger models; see [Apple Metal](benchmarks/BENCHMARK.md#apple-metal-m4).
+The CLI auto-selects the first GPU device the ggml registry reports (including integrated GPUs such as Ryzen APUs), so no runtime flag is needed. Use `PARAKEET_DEVICE` to override: set it to `cpu` to force CPU, or to a specific device name like `CUDA0` or `Vulkan1` (case-insensitive) to pick that device; a name that matches no registered device logs a message and falls back to CPU. Ops the chosen backend has no kernel for run on the CPU automatically, so a model always runs even when one op lacks a GPU kernel. On an Apple M4, Metal is up to about 5x faster than CPU on the larger models; see [Apple Metal](benchmarks/BENCHMARK.md#apple-metal-m4).
 
 ---
 
diff --git a/src/backend.cpp b/src/backend.cpp
index 88e2c8b..40055f9 100644
--- a/src/backend.cpp
+++ b/src/backend.cpp
@@ -9,6 +9,7 @@
 #include "ggml-cpu.h"
 
 #include <cassert>
+#include <cctype>
 #include <cstdlib>
 #include <cstring>
 #include <string>
@@ -68,27 +69,59 @@ struct Backend::Impl {
 static thread_local Backend* t_active = nullptr;
 
 Backend::Backend(int n_threads) : impl_(new Impl()) {
-    // Optional override: PARAKEET_DEVICE=cpu forces the CPU backend (used to take
-    // a CPU baseline on a GPU box without rebuilding).
+    // Optional override via PARAKEET_DEVICE:
+    //   - "cpu"            forces the CPU backend (CPU baseline on a GPU box).
+    //   - a device name    selects that specific registry device by name, e.g.
+    //                      "CUDA0", "Vulkan1", "Metal" (case-insensitive).
+    //   - unset            auto-pick the first GPU / integrated-GPU device.
     const char* force = std::getenv("PARAKEET_DEVICE");
-    const bool force_cpu = force && std::string(force) == "cpu";
+    const std::string want = force ? force : "";
+
+    // Case-insensitive equality: used for the "cpu" override and to match
+    // PARAKEET_DEVICE against registry device names ("CUDA0", "Vulkan0", ...).
+    auto iequals = [](const std::string& a, const std::string& b) {
+        if (a.size() != b.size()) return false;
+        for (size_t i = 0; i < a.size(); ++i)
+            if (std::tolower((unsigned char)a[i]) != std::tolower((unsigned char)b[i]))
+                return false;
+        return true;
+    };
+    const bool force_cpu = iequals(want, "cpu");  // any casing: "cpu", "CPU", "Cpu"
 
     if (!force_cpu) {
-        // Pick the first GPU device the registry reports. Whatever backend was
-        // compiled in (CUDA/Metal/Vulkan/HIP/SYCL) registers itself here, so this
-        // single path covers them all with no backend-specific includes.
+        // Walk the registry. Whatever backend was compiled in
+        // (CUDA/Metal/Vulkan/HIP/SYCL) registers itself here, so this single path
+        // covers them all with no backend-specific includes. Integrated GPUs
+        // (e.g. Ryzen APUs) report GGML_BACKEND_DEVICE_TYPE_IGPU and are eligible
+        // too. When PARAKEET_DEVICE names a device, match by name; otherwise pick
+        // the first GPU/IGPU device.
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-                impl_->backend = ggml_backend_dev_init(dev, nullptr);
-                if (impl_->backend) {
-                    device_name_ = ggml_backend_dev_name(dev);
-                    impl_->use_sched = true;   // GPU device: route compute through ggml_backend_sched
-                    PK_LOG("pk::Backend using GPU device: %s", device_name_.c_str());
-                    break;
-                }
+            const auto type = ggml_backend_dev_type(dev);
+            const char* name = ggml_backend_dev_name(dev);
+
+            bool selected;
+            if (!want.empty()) {
+                selected = name && iequals(want, name);  // explicit name match
+            } else {
+                selected = type == GGML_BACKEND_DEVICE_TYPE_GPU ||
+                           type == GGML_BACKEND_DEVICE_TYPE_IGPU;
+            }
+            if (!selected) continue;
+
+            impl_->backend = ggml_backend_dev_init(dev, nullptr);
+            if (impl_->backend) {
+                device_name_ = name ? name : "";
+                // Route compute through ggml_backend_sched for any non-CPU device
+                // so unsupported ops can fall back to CPU.
+                impl_->use_sched = type != GGML_BACKEND_DEVICE_TYPE_CPU;
+                PK_LOG("pk::Backend using device: %s", device_name_.c_str());
+                break;
             }
         }
+        if (!want.empty() && !impl_->backend)
+            PK_LOG("pk::Backend: PARAKEET_DEVICE=%s not found; falling back to CPU",
+                   want.c_str());
     }
     if (!impl_->backend) {              // CPU fallback (or CPU-only build)
         impl_->backend = ggml_backend_cpu_init();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9d71015..e1b7c93 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -7,6 +7,7 @@ function(pk_add_test name)
 endfunction()
 
 pk_add_test(test_smoke)
+pk_add_test(test_backend_device)
 pk_add_test(test_audio_io)
 pk_add_test(test_model_loader)
 pk_add_test(test_fft)
diff --git a/tests/test_backend_device.cpp b/tests/test_backend_device.cpp
new file mode 100644
index 0000000..266e147
--- /dev/null
+++ b/tests/test_backend_device.cpp
@@ -0,0 +1,53 @@
+// Device selection via PARAKEET_DEVICE (issue #17).
+//
+// Runs on any build, including the CPU-only one used in CI: it exercises the
+// env-var parsing and fallback paths that don't need a GPU to be present.
+//   - PARAKEET_DEVICE=cpu        -> CPU backend.
+//   - PARAKEET_DEVICE=<unknown>  -> no such device, falls back to CPU.
+//   - PARAKEET_DEVICE=CPU        -> upper-case spelling, also forces CPU.
+// The unset/auto-pick case is deliberately not exercised: on a GPU build it may
+// select a GPU/integrated-GPU device, so there is no fixed name to assert.
+#include "backend.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+static int failures = 0;
+
+static void expect_cpu(const char* label, const std::string& name) {
+    if (name == "cpu") {
+        std::printf("ok [%s]: device = %s\n", label, name.c_str());
+        return;
+    }
+    // Mismatch: report on stderr and count it toward main()'s exit status.
+    std::fprintf(stderr, "FAIL [%s]: expected device 'cpu', got '%s'\n", label, name.c_str());
+    ++failures;
+}
+
+int main() {
+    // Every case sets PARAKEET_DEVICE, constructs a backend, and expects the
+    // CPU device to be reported.
+    struct Case {
+        const char* value;  // value assigned to PARAKEET_DEVICE
+        const char* label;  // tag printed by expect_cpu
+    };
+    const Case cases[] = {
+        // Forcing CPU must always yield the CPU backend.
+        {"cpu", "PARAKEET_DEVICE=cpu"},
+        // An unknown device name must not crash; it falls back to CPU.
+        {"definitely-not-a-real-device-9000", "PARAKEET_DEVICE=<unknown>"},
+        // Upper-case "CPU" is also honored as a CPU force.
+        {"CPU", "PARAKEET_DEVICE=CPU"},
+    };
+
+    for (const Case& c : cases) {
+        setenv("PARAKEET_DEVICE", c.value, 1);
+        pk::Backend b(1);  // destroyed at the end of each iteration
+        expect_cpu(c.label, b.device_name());
+    }
+
+    unsetenv("PARAKEET_DEVICE");
+    return failures == 0 ? 0 : 1;
+}