From e86369b1609f2da0e68b28d8abfd1ba2836eac65 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 7 Jun 2026 20:27:44 +0000 Subject: [PATCH] fix: select integrated GPUs and allow PARAKEET_DEVICE to name a device Backend device selection only accepted GGML_BACKEND_DEVICE_TYPE_GPU, so integrated GPUs (Ryzen APUs and similar, reported as GGML_BACKEND_DEVICE_TYPE_IGPU) were skipped and the engine fell back to CPU on those machines. The auto-pick now matches both discrete and integrated GPU devices. PARAKEET_DEVICE also gains a third form: besides "cpu" (force CPU) and being unset (auto-pick the first GPU/IGPU), it can now name a specific registry device such as "CUDA0" or "Vulkan1" (case-insensitive). An unknown name logs and falls back to CPU instead of failing. use_sched is now derived from the chosen device type so any non-CPU device still offloads unsupported ops to CPU. Adds a regression test covering the env-var fallback paths (cpu, unknown name, case-insensitive CPU), which run on a CPU-only build, and documents the new behavior in the README. Closes #17 Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 2 +- src/backend.cpp | 61 +++++++++++++++++++++++++++-------- tests/CMakeLists.txt | 1 + tests/test_backend_device.cpp | 53 ++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 15 deletions(-) create mode 100644 tests/test_backend_device.cpp diff --git a/README.md b/README.md index c21bf27..b0d17fc 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ To build for a GPU backend, forward its flag, e.g. Apple Metal: cmake -B build -DPARAKEET_GGML_METAL=ON && cmake --build build -j ``` -The CLI auto-selects the first GPU device the ggml registry reports, so no runtime flag is needed (set `PARAKEET_DEVICE=cpu` to force CPU). Ops the chosen backend has no kernel for run on the CPU automatically, so a model always runs even when one op lacks a GPU kernel. 
On an Apple M4, Metal is up to about 5x faster than CPU on the larger models; see [Apple Metal](benchmarks/BENCHMARK.md#apple-metal-m4). +The CLI auto-selects the first GPU device the ggml registry reports (including integrated GPUs such as Ryzen APUs), so no runtime flag is needed. Use `PARAKEET_DEVICE` to override: set it to `cpu` to force CPU, or to a specific device name like `CUDA0` or `Vulkan1` (case-insensitive) to pick that device. Ops the chosen backend has no kernel for run on the CPU automatically, so a model always runs even when one op lacks a GPU kernel. On an Apple M4, Metal is up to about 5x faster than CPU on the larger models; see [Apple Metal](benchmarks/BENCHMARK.md#apple-metal-m4). --- diff --git a/src/backend.cpp b/src/backend.cpp index 88e2c8b..40055f9 100644 --- a/src/backend.cpp +++ b/src/backend.cpp @@ -9,6 +9,7 @@ #include "ggml-cpu.h" #include +#include #include #include #include @@ -68,27 +69,59 @@ struct Backend::Impl { static thread_local Backend* t_active = nullptr; Backend::Backend(int n_threads) : impl_(new Impl()) { - // Optional override: PARAKEET_DEVICE=cpu forces the CPU backend (used to take - // a CPU baseline on a GPU box without rebuilding). + // Optional override via PARAKEET_DEVICE: + // - "cpu" or "CPU": forces the CPU backend (CPU baseline on a GPU box). + // - a device name: selects that specific registry device by name, e.g. + // "CUDA0", "Vulkan1", "Metal" (case-insensitive). + // - unset: auto-picks the first GPU / integrated-GPU device. const char* force = std::getenv("PARAKEET_DEVICE"); - const bool force_cpu = force && std::string(force) == "cpu"; + const std::string want = force ? force : ""; + const bool force_cpu = want == "cpu" || want == "CPU"; + + // Case-insensitive equality, used to match PARAKEET_DEVICE against the + // registry's device names (whose casing varies, e.g. "CUDA0"/"Vulkan0"). 
+ auto iequals = [](const std::string& a, const std::string& b) { + if (a.size() != b.size()) return false; + for (size_t i = 0; i < a.size(); ++i) + if (std::tolower((unsigned char)a[i]) != std::tolower((unsigned char)b[i])) + return false; + return true; + }; if (!force_cpu) { - // Pick the first GPU device the registry reports. Whatever backend was - // compiled in (CUDA/Metal/Vulkan/HIP/SYCL) registers itself here, so this - // single path covers them all with no backend-specific includes. + // Walk the registry. Whatever backend was compiled in + // (CUDA/Metal/Vulkan/HIP/SYCL) registers itself here, so this single path + // covers them all with no backend-specific includes. Integrated GPUs + // (e.g. Ryzen APUs) report GGML_BACKEND_DEVICE_TYPE_IGPU and are eligible + // too. When PARAKEET_DEVICE names a device, match by name; otherwise pick + // the first GPU/IGPU device. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { - impl_->backend = ggml_backend_dev_init(dev, nullptr); - if (impl_->backend) { - device_name_ = ggml_backend_dev_name(dev); - impl_->use_sched = true; // GPU device: route compute through ggml_backend_sched - PK_LOG("pk::Backend using GPU device: %s", device_name_.c_str()); - break; - } + const auto type = ggml_backend_dev_type(dev); + const char* name = ggml_backend_dev_name(dev); + + bool selected; + if (!want.empty()) { + selected = name && iequals(want, name); // explicit name match + } else { + selected = type == GGML_BACKEND_DEVICE_TYPE_GPU || + type == GGML_BACKEND_DEVICE_TYPE_IGPU; + } + if (!selected) continue; + + impl_->backend = ggml_backend_dev_init(dev, nullptr); + if (impl_->backend) { + device_name_ = name ? name : ""; + // Route compute through ggml_backend_sched for any non-CPU device + // so unsupported ops can fall back to CPU. 
+ impl_->use_sched = type != GGML_BACKEND_DEVICE_TYPE_CPU; + PK_LOG("pk::Backend using device: %s", device_name_.c_str()); + break; } } + if (!want.empty() && !impl_->backend) + PK_LOG("pk::Backend: PARAKEET_DEVICE=%s not found; falling back to CPU", + want.c_str()); } if (!impl_->backend) { // CPU fallback (or CPU-only build) impl_->backend = ggml_backend_cpu_init(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9d71015..e1b7c93 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,6 +7,7 @@ function(pk_add_test name) endfunction() pk_add_test(test_smoke) +pk_add_test(test_backend_device) pk_add_test(test_audio_io) pk_add_test(test_model_loader) pk_add_test(test_fft) diff --git a/tests/test_backend_device.cpp b/tests/test_backend_device.cpp new file mode 100644 index 0000000..266e147 --- /dev/null +++ b/tests/test_backend_device.cpp @@ -0,0 +1,53 @@ +// Device selection via PARAKEET_DEVICE (issue #17). +// +// Runs on any build, including the CPU-only one used in CI: it exercises the +// env-var parsing and fallback paths that don't need a GPU to be present. +// - PARAKEET_DEVICE=cpu -> CPU backend. +// - PARAKEET_DEVICE=<unknown name> -> no such device, falls back to CPU. +// - unset -> CPU on a CPU-only build (auto-pick). +// On a GPU build the unset/auto case may select a GPU/integrated-GPU device, so +// the unset case is not exercised (or asserted) by this test. +#include "backend.hpp" + +#include +#include +#include +#include + +static int failures = 0; + +static void expect_cpu(const char* label, const std::string& name) { + if (name != "cpu") { + std::fprintf(stderr, "FAIL [%s]: expected device 'cpu', got '%s'\n", + label, name.c_str()); + ++failures; + } else { + std::printf("ok [%s]: device = %s\n", label, name.c_str()); + } +} + +int main() { + // Forcing CPU must always yield the CPU backend. 
+ setenv("PARAKEET_DEVICE", "cpu", 1); + { + pk::Backend b(1); + expect_cpu("PARAKEET_DEVICE=cpu", b.device_name()); + } + + // An unknown device name must not crash; it falls back to CPU. + setenv("PARAKEET_DEVICE", "definitely-not-a-real-device-9000", 1); + { + pk::Backend b(1); + expect_cpu("PARAKEET_DEVICE=", b.device_name()); + } + + // Upper-case "CPU" is also honored as a CPU force. + setenv("PARAKEET_DEVICE", "CPU", 1); + { + pk::Backend b(1); + expect_cpu("PARAKEET_DEVICE=CPU", b.device_name()); + } + + unsetenv("PARAKEET_DEVICE"); + return failures == 0 ? 0 : 1; +}