From 21dde6cbdc30b7e7710f71108188516c5e690504 Mon Sep 17 00:00:00 2001
From: Don Hardman <don.hardman@muvon.io>
Date: Wed, 13 May 2026 11:08:04 +0300
Subject: [PATCH 1/2] feat(embeddings): add parallel processing and thread
 count control

- Add rayon dependency for parallel execution support
- Update TextModel and model traits to include threads parameter
- Implement scoped rayon thread pools for candle-based models
- Update ONNX pipelined inference to respect worker limits
- Modify FFI interface and C++ wrapper to accept thread count
- Increment library version to 4 for breaking FFI changes
- Propagate thread settings through KNN and text-to-vector pipelines
- Update Jina, local models, and test suites for new signatures
- Allow zero threads to signify use of all available CPUs

BREAKING CHANGE: TextModel::predict and MakeVectEmbeddingsFn signatures
changed to include thread count parameter.
---
 embeddings/Cargo.lock                        |  1 +
 embeddings/Cargo.toml                        |  1 +
 embeddings/manticoresearch_text_embeddings.h |  5 +-
 embeddings/src/ffi.rs                        |  4 +-
 embeddings/src/model/ffi_test.rs             |  2 +-
 embeddings/src/model/jina.rs                 |  8 +-
 embeddings/src/model/jina_test.rs            |  4 +-
 embeddings/src/model/local.rs                | 85 ++++++++++++++++----
 embeddings/src/model/local_test.rs           | 36 ++++-----
 embeddings/src/model/mod.rs                  | 16 ++--
 embeddings/src/model/openai.rs               |  8 +-
 embeddings/src/model/openai_test.rs          |  6 +-
 embeddings/src/model/text_model_wrapper.rs   |  5 +-
 embeddings/src/model/voyage.rs               |  8 +-
 embeddings/src/model/voyage_test.rs          |  6 +-
 knn/embeddings.cpp                           |  7 +-
 knn/knn.h                                    |  2 +-
 17 files changed, 141 insertions(+), 63 deletions(-)

diff --git a/embeddings/Cargo.lock b/embeddings/Cargo.lock
index e99f2e10..bf55a38b 100644
--- a/embeddings/Cargo.lock
+++ b/embeddings/Cargo.lock
@@ -1899,6 +1899,7 @@ dependencies = [
  "ndarray 0.16.1",
  "ort",
  "rand 0.8.5",
+ "rayon",
  "reqwest",
  "serde",
  "serde_json",
diff --git a/embeddings/Cargo.toml b/embeddings/Cargo.toml
index 6619a85b..c0b9abca 100644
--- a/embeddings/Cargo.toml
+++ b/embeddings/Cargo.toml
@@ -18,6 +18,7 @@ candle-core = "0.9.2"
 candle-nn = "0.9.2"
 candle-transformers = "0.9.2"
 ort = { version = "2.0.0-rc.9", default-features = false, features = ["std"] }
+rayon = "1.11"
 
 [features]
 default = []
diff --git a/embeddings/manticoresearch_text_embeddings.h b/embeddings/manticoresearch_text_embeddings.h
index c073e445..3a8d5ef3 100644
--- a/embeddings/manticoresearch_text_embeddings.h
+++ b/embeddings/manticoresearch_text_embeddings.h
@@ -47,7 +47,10 @@ struct StringItem {
   uintptr_t len;
 };
 
-using MakeVectEmbeddingsFn = FloatVecResult(*)(const TextModelWrapper*, const StringItem*, uintptr_t);
+using MakeVectEmbeddingsFn = FloatVecResult(*)(const TextModelWrapper*,
+                                               const StringItem*,
+                                               uintptr_t,
+                                               int32_t);
 
 using FreeVecResultFn = void(*)(FloatVecResult);
 
diff --git a/embeddings/src/ffi.rs b/embeddings/src/ffi.rs
index a104f948..0681b6ac 100644
--- a/embeddings/src/ffi.rs
+++ b/embeddings/src/ffi.rs
@@ -19,7 +19,7 @@ type LoadModelFn = extern "C" fn(
 type FreeModelResultFn = extern "C" fn(TextModelResult);
 
 type MakeVectEmbeddingsFn =
-    extern "C" fn(&TextModelWrapper, *const StringItem, usize) -> FloatVecResult;
+    extern "C" fn(&TextModelWrapper, *const StringItem, usize, i32) -> FloatVecResult;
 
 type FreeVecResultFn = extern "C" fn(FloatVecResult);
 
@@ -62,7 +62,7 @@ pub struct EmbedLib {
 const VERSION_STR: &[u8] = concat!(env!("EMBEDDINGS_VERSION_STR"), "\0").as_bytes();
 
 const LIB: EmbedLib = EmbedLib {
-    version: 3usize,
+    version: 4usize,
     version_str: VERSION_STR.as_ptr() as *const c_char,
     load_model: TextModelWrapper::load_model,
     free_model_result: TextModelWrapper::free_model_result,
diff --git a/embeddings/src/model/ffi_test.rs b/embeddings/src/model/ffi_test.rs
index 4833c525..322012fa 100644
--- a/embeddings/src/model/ffi_test.rs
+++ b/embeddings/src/model/ffi_test.rs
@@ -76,7 +76,7 @@ mod tests {
                         )
                     };
                     let vec_result =
-                        TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1);
+                        TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, 0);
                     assert!(vec_result.error.is_null());
                     assert_eq!(vec_result.len, 1);
                     TextModelWrapper::free_vec_result(vec_result);
diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs
index fa029193..5dd6bdfe 100644
--- a/embeddings/src/model/jina.rs
+++ b/embeddings/src/model/jina.rs
@@ -67,7 +67,11 @@ impl JinaModel {
 }
 
 impl TextModel for JinaModel {
-    fn predict(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn std::error::Error>> {
+    fn predict(
+        &self,
+        texts: &[&str],
+        _threads: usize,
+    ) -> Result<Vec<Vec<f32>>, Box<dyn std::error::Error>> {
         let url = self
             .api_url
             .as_deref()
@@ -308,7 +312,7 @@ impl TextModel for JinaModel {
     fn validate_api_key(&self) -> Result<(), Box<dyn std::error::Error>> {
         // Make a minimal test request with a single character to validate the API key
         // This is cheaper than a full embedding request but still validates the key
-        self.predict(&["test"])?;
+        self.predict(&["test"], 0)?;
         Ok(())
     }
 }
diff --git a/embeddings/src/model/jina_test.rs b/embeddings/src/model/jina_test.rs
index 3c6a2896..bef1aff1 100644
--- a/embeddings/src/model/jina_test.rs
+++ b/embeddings/src/model/jina_test.rs
@@ -119,7 +119,7 @@ mod tests {
 
         // Test with real API key
         let texts = vec!["test"];
-        let result = model.predict(&texts);
+        let result = model.predict(&texts, 0);
         match result {
             Ok(embeddings) => {
                 // Should have one embedding for one text
@@ -157,7 +157,7 @@ mod tests {
         let model = JinaModel::new("jina/jina-embeddings-v3", &api_key, None, None).unwrap();
 
         let empty_texts: Vec<&str> = vec![];
-        let result = model.predict(&empty_texts);
+        let result = model.predict(&empty_texts, 0);
         // Empty input should succeed with empty result
         match result {
             Ok(embeddings) => {
diff --git a/embeddings/src/model/local.rs b/embeddings/src/model/local.rs
index 5f552c53..8947ba69 100644
--- a/embeddings/src/model/local.rs
+++ b/embeddings/src/model/local.rs
@@ -55,6 +55,38 @@ fn intra_threads() -> usize {
         .unwrap_or(DEFAULT_INTRA_THREADS)
 }
 
+/// Run `f` inside a scoped rayon thread pool of the requested size.
+///
+/// `threads == 0` means "no cap": just run `f` on the existing (global) rayon pool,
+/// which by default uses every available CPU. `threads > 0` builds a fresh pool
+/// of that size (clamped to available CPUs) and installs it for the duration of `f`,
+/// so candle's intra-op rayon work and tokenizers' parallelism both respect the limit.
+///
+/// Errors are stringified across the `pool.install` boundary because `Box<dyn Error>`
+/// is not `Send`; the original error message is preserved.
+fn with_thread_limit<F>(threads: usize, f: F) -> Result<Vec<Vec<f32>>, Box<dyn Error>>
+where
+    F: FnOnce() -> Result<Vec<Vec<f32>>, Box<dyn Error>> + Send,
+{
+    if threads == 0 {
+        return f();
+    }
+
+    let n = threads.min(available_cpus()).max(1);
+    let pool = match rayon::ThreadPoolBuilder::new().num_threads(n).build() {
+        Ok(p) => p,
+        // If the pool can't be built (extremely unlikely), fall back to the global pool
+        // rather than failing the whole inference.
+        Err(_) => return f(),
+    };
+
+    // pool.install requires `R: Send`, but Box<dyn Error> isn't Send. Stringify the
+    // error inside the closure and re-wrap it on the way out — keeps the message,
+    // satisfies the Send bound, and avoids forcing every model's error into Send+Sync.
+    pool.install(|| f().map_err(|e| e.to_string()))
+        .map_err(|s| -> Box<dyn Error> { s.into() })
+}
+
 /// Thread-safe session wrapper with platform-specific strategy:
 /// - Linux/macOS: UnsafeCell for concurrent Run() (ORT C API is thread-safe)
 /// - Windows: Mutex for serialized Run() (Windows ORT has threading issues)
@@ -967,7 +999,13 @@ impl OnnxEmbeddingModel {
     /// - Large input (> batch_size): splits into num_cpus concurrent single-doc workers.
     ///   Each worker tokenizes + infers 1 doc at a time through SessionWrapper,
     ///   mimicking the concurrent caller pattern that gives best throughput.
-    fn predict_pipelined(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
+    ///
+    /// `threads` caps the worker count. 0 means "use all available CPUs".
+    fn predict_pipelined(
+        &self,
+        texts: &[&str],
+        threads: usize,
+    ) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
         let bs = batch_size();
         let max_input = self.max_input_len;
         let session = &self.session;
@@ -984,8 +1022,14 @@ impl OnnxEmbeddingModel {
 
         // Adaptive parallelism: scale workers with input size.
         // Each worker needs at least batch_size docs to justify thread overhead.
-        // Cap at available CPUs — more workers than cores adds contention.
-        let num_workers = (texts.len() / bs).min(available_cpus()).max(1);
+        // Worker cap is the caller-supplied `threads` limit when > 0,
+        // otherwise fall back to all available CPUs.
+        let thread_cap = if threads > 0 {
+            threads.min(available_cpus())
+        } else {
+            available_cpus()
+        };
+        let num_workers = (texts.len() / bs).min(thread_cap).max(1);
         let docs_per_worker = texts.len().div_ceil(num_workers);
 
         let mut ordered_results: Vec<Vec<Vec<f32>>> = Vec::with_capacity(num_workers);
@@ -1146,19 +1190,15 @@ impl LocalModel {
     }
 }
 
-impl TextModel for LocalModel {
-    fn predict(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
-        // BERT and ONNX: batched path (batch_size up to batch_size() per forward pass)
-        match self {
-            LocalModel::Bert(m) => {
-                return Self::predict_batched(&m.tokenizer, m.max_input_len, texts, |chunks| {
-                    m.predict_chunks(chunks)
-                });
-            }
-            LocalModel::Onnx(m) => {
-                return m.predict_pipelined(texts);
-            }
-            _ => {} // fall through to sequential path
+impl LocalModel {
+    /// Inner predict body for non-ONNX local models (BERT / T5 / Causal / Quantized).
+    /// Pulled out of the trait impl so the caller can wrap it in a scoped rayon pool.
+    fn predict_local(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
+        // BERT: batched path (batch_size up to batch_size() per forward pass)
+        if let LocalModel::Bert(m) = self {
+            return Self::predict_batched(&m.tokenizer, m.max_input_len, texts, |chunks| {
+                m.predict_chunks(chunks)
+            });
         }
 
         // Sequential path for T5, Causal, Quantized (these use KV caches / mutexes)
@@ -1288,6 +1328,19 @@ impl TextModel for LocalModel {
 
         Ok(all_results)
     }
+}
+
+impl TextModel for LocalModel {
+    fn predict(&self, texts: &[&str], threads: usize) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
+        // ONNX manages its own worker count internally — no rayon pool involved.
+        if let LocalModel::Onnx(m) = self {
+            return m.predict_pipelined(texts, threads);
+        }
+
+        // BERT / T5 / Causal / Quantized go through candle, which uses rayon for
+        // intra-op parallelism. Scope the rayon pool so threads > 0 caps the worker count.
+        with_thread_limit(threads, || self.predict_local(texts))
+    }
 
     fn get_hidden_size(&self) -> usize {
         match self {
diff --git a/embeddings/src/model/local_test.rs b/embeddings/src/model/local_test.rs
index 4627cefc..6bd9dad1 100644
--- a/embeddings/src/model/local_test.rs
+++ b/embeddings/src/model/local_test.rs
@@ -417,7 +417,7 @@ mod tests {
 
         for sentence in &test_sentences {
             let local_model = LocalModel::new(model_id, cache_path.clone(), false, None).unwrap();
-            let embedding = local_model.predict(&[sentence]).unwrap();
+            let embedding = local_model.predict(&[sentence], 0).unwrap();
             check_embedding_properties(&embedding[0], local_model.get_hidden_size());
         }
     }
@@ -429,8 +429,8 @@ mod tests {
         let local_model = LocalModel::new(model_id, cache_path, false, None).unwrap();
 
         let sentence = &["This is a test sentence."];
-        let embedding1 = local_model.predict(sentence).unwrap();
-        let embedding2 = local_model.predict(sentence).unwrap();
+        let embedding1 = local_model.predict(sentence, 0).unwrap();
+        let embedding2 = local_model.predict(sentence, 0).unwrap();
 
         for (e1, e2) in embedding1[0].iter().zip(embedding2[0].iter()) {
             assert_abs_diff_eq!(e1, e2, epsilon = 1e-6);
@@ -466,7 +466,7 @@ mod tests {
 
         let test_text = &["This is a test sentence for Qwen embedding model."];
         let embeddings = local_model
-            .predict(test_text)
+            .predict(test_text, 0)
             .expect("Qwen model should generate embeddings");
 
         check_embedding_properties(&embeddings[0], local_model.get_hidden_size());
@@ -482,7 +482,7 @@ mod tests {
             .expect("Llama model should load");
 
         let test_text = &["This is a test sentence for Llama embedding model."];
-        let embeddings = local_model.predict(test_text).unwrap();
+        let embeddings = local_model.predict(test_text, 0).unwrap();
 
         check_embedding_properties(&embeddings[0], local_model.get_hidden_size());
     }
@@ -496,7 +496,7 @@ mod tests {
         let local_model = LocalModel::new(model_id, cache_path.clone(), false, None)
             .expect("Mistral model should load");
         let test_text = &["This is a test sentence for Mistral embedding model."];
-        let embeddings = local_model.predict(test_text).unwrap();
+        let embeddings = local_model.predict(test_text, 0).unwrap();
         check_embedding_properties(&embeddings[0], local_model.get_hidden_size());
     }
 
@@ -510,7 +510,7 @@ mod tests {
             .expect("Gemma model should load");
 
         let test_text = &["This is a test sentence for Gemma embedding model."];
-        let embeddings = local_model.predict(test_text).unwrap();
+        let embeddings = local_model.predict(test_text, 0).unwrap();
         check_embedding_properties(&embeddings[0], local_model.get_hidden_size());
     }
 
@@ -536,7 +536,7 @@ mod tests {
             "Third sentence for batch processing verification.",
         ];
 
-        let embeddings = local_model.predict(test_texts).unwrap();
+        let embeddings = local_model.predict(test_texts, 0).unwrap();
 
         assert_eq!(embeddings.len(), test_texts.len());
 
@@ -660,7 +660,7 @@ mod tests {
         // Test embedding generation
         let test_text = &["This is a test sentence for FRIDA embedding model."];
         let embeddings = local_model
-            .predict(test_text)
+            .predict(test_text, 0)
             .expect("FRIDA model should generate embeddings");
 
         assert_eq!(embeddings.len(), 1, "Should return one embedding");
@@ -697,7 +697,7 @@ mod tests {
         // Test embedding generation
         let test_text = &["This is a test sentence for Google embeddinggemma model."];
         let embeddings = local_model
-            .predict(test_text)
+            .predict(test_text, 0)
             .expect("Google embeddinggemma should generate embeddings");
 
         assert_eq!(embeddings.len(), 1, "Should return one embedding");
@@ -739,7 +739,7 @@ mod tests {
 
         let test_text = &["This is a test sentence for ONNX embedding model."];
         let embeddings = local_model
-            .predict(test_text)
+            .predict(test_text, 0)
             .expect("ONNX model should generate embeddings");
 
         assert_eq!(embeddings.len(), 1);
@@ -761,8 +761,8 @@ mod tests {
         };
 
         let sentence = &["This is a test sentence."];
-        let embedding1 = local_model.predict(sentence).unwrap();
-        let embedding2 = local_model.predict(sentence).unwrap();
+        let embedding1 = local_model.predict(sentence, 0).unwrap();
+        let embedding2 = local_model.predict(sentence, 0).unwrap();
 
         for (e1, e2) in embedding1[0].iter().zip(embedding2[0].iter()) {
             assert_abs_diff_eq!(e1, e2, epsilon = 1e-6);
@@ -789,7 +789,7 @@ mod tests {
             "Third sentence for batch processing.",
         ];
 
-        let embeddings = local_model.predict(test_texts).unwrap();
+        let embeddings = local_model.predict(test_texts, 0).unwrap();
         assert_eq!(embeddings.len(), test_texts.len());
 
         for embedding in &embeddings {
@@ -850,20 +850,20 @@ mod tests {
         };
 
         // Warmup
-        let _ = st.predict(sentences).unwrap();
-        let _ = onnx.predict(sentences).unwrap();
+        let _ = st.predict(sentences, 0).unwrap();
+        let _ = onnx.predict(sentences, 0).unwrap();
 
         // Safetensors timing
         let start = Instant::now();
         for _ in 0..iterations {
-            let _ = st.predict(sentences).unwrap();
+            let _ = st.predict(sentences, 0).unwrap();
         }
         let st_ms = start.elapsed().as_millis() as f64 / iterations as f64;
 
         // ONNX timing
         let start = Instant::now();
         for _ in 0..iterations {
-            let _ = onnx.predict(sentences).unwrap();
+            let _ = onnx.predict(sentences, 0).unwrap();
         }
         let onnx_ms = start.elapsed().as_millis() as f64 / iterations as f64;
 
diff --git a/embeddings/src/model/mod.rs b/embeddings/src/model/mod.rs
index d675cfba..c8769043 100644
--- a/embeddings/src/model/mod.rs
+++ b/embeddings/src/model/mod.rs
@@ -23,7 +23,11 @@ use std::error::Error;
 use std::path::PathBuf;
 
 pub trait TextModel {
-    fn predict(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn Error>>;
+    /// Generate embeddings for the given texts.
+    ///
+    /// `threads` caps the number of CPU threads used during generation.
+    /// 0 means "use all available CPUs" (default).
+    fn predict(&self, texts: &[&str], threads: usize) -> Result<Vec<Vec<f32>>, Box<dyn Error>>;
     fn get_hidden_size(&self) -> usize;
     fn get_max_input_len(&self) -> usize;
     /// Validates the API key by making a minimal test request to the API.
@@ -57,12 +61,12 @@ pub enum Model {
 }
 
 impl TextModel for Model {
-    fn predict(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
+    fn predict(&self, texts: &[&str], threads: usize) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
         match self {
-            Model::OpenAI(m) => m.predict(texts),
-            Model::Voyage(m) => m.predict(texts),
-            Model::Jina(m) => m.predict(texts),
-            Model::Local(m) => m.predict(texts),
+            Model::OpenAI(m) => m.predict(texts, threads),
+            Model::Voyage(m) => m.predict(texts, threads),
+            Model::Jina(m) => m.predict(texts, threads),
+            Model::Local(m) => m.predict(texts, threads),
         }
     }
 
diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs
index b880c116..c012c288 100644
--- a/embeddings/src/model/openai.rs
+++ b/embeddings/src/model/openai.rs
@@ -57,7 +57,11 @@ impl OpenAIModel {
 }
 
 impl TextModel for OpenAIModel {
-    fn predict(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn std::error::Error>> {
+    fn predict(
+        &self,
+        texts: &[&str],
+        _threads: usize,
+    ) -> Result<Vec<Vec<f32>>, Box<dyn std::error::Error>> {
         let url = self
             .api_url
             .as_deref()
@@ -182,7 +186,7 @@ impl TextModel for OpenAIModel {
     fn validate_api_key(&self) -> Result<(), Box<dyn std::error::Error>> {
         // Make a minimal test request with a single character to validate the API key
         // This is cheaper than a full embedding request but still validates the key
-        self.predict(&["test"])?;
+        self.predict(&["test"], 0)?;
         Ok(())
     }
 }
diff --git a/embeddings/src/model/openai_test.rs b/embeddings/src/model/openai_test.rs
index 8a24f8e0..16da9141 100644
--- a/embeddings/src/model/openai_test.rs
+++ b/embeddings/src/model/openai_test.rs
@@ -191,7 +191,7 @@ mod tests {
 
         // Test with real API key
         let texts = vec!["test"];
-        let result = model.predict(&texts);
+        let result = model.predict(&texts, 0);
 
         match result {
             Ok(embeddings) => {
@@ -231,7 +231,7 @@ mod tests {
             OpenAIModel::new("openai/text-embedding-ada-002", &api_key, None, None).unwrap();
 
         let empty_texts: Vec<&str> = vec![];
-        let result = model.predict(&empty_texts);
+        let result = model.predict(&empty_texts, 0);
 
         // Empty input should succeed with empty result
         match result {
@@ -263,7 +263,7 @@ mod tests {
         // Test that the client is initialized
         // We can't directly test the client, but we can verify it exists by using it
         let texts = vec!["test"];
-        let _result = model.predict(&texts); // This will fail but proves client exists
+        let _result = model.predict(&texts, 0); // This will fail but proves client exists
     }
 
     #[test]
diff --git a/embeddings/src/model/text_model_wrapper.rs b/embeddings/src/model/text_model_wrapper.rs
index bcb7dc5d..e0ca99a4 100644
--- a/embeddings/src/model/text_model_wrapper.rs
+++ b/embeddings/src/model/text_model_wrapper.rs
@@ -127,6 +127,7 @@ impl TextModelWrapper {
         &self,
         texts: *const StringItem,
         count: usize,
+        threads: i32, // 0 = use all available CPUs, >0 = cap thread count
     ) -> FloatVecResult {
         let string_slice = unsafe { std::slice::from_raw_parts(texts, count) };
 
@@ -140,9 +141,11 @@ impl TextModelWrapper {
             })
             .collect();
 
+        let threads = if threads > 0 { threads as usize } else { 0 };
+
         let mut float_vec_list: Vec<FloatVec> = Vec::new();
         let model = self.as_model();
-        let embeddings_list = model.predict(&string_refs);
+        let embeddings_list = model.predict(&string_refs, threads);
         let c_error = match embeddings_list {
             Ok(embeddings_list) => {
                 for embeddings in embeddings_list.iter() {
diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs
index 1374dd4e..3d3d8744 100644
--- a/embeddings/src/model/voyage.rs
+++ b/embeddings/src/model/voyage.rs
@@ -58,7 +58,11 @@ impl VoyageModel {
 }
 
 impl TextModel for VoyageModel {
-    fn predict(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>, Box<dyn std::error::Error>> {
+    fn predict(
+        &self,
+        texts: &[&str],
+        _threads: usize,
+    ) -> Result<Vec<Vec<f32>>, Box<dyn std::error::Error>> {
         let url = self
             .api_url
             .as_deref()
@@ -221,7 +225,7 @@ impl TextModel for VoyageModel {
     fn validate_api_key(&self) -> Result<(), Box<dyn std::error::Error>> {
         // Make a minimal test request with a single character to validate the API key
         // This is cheaper than a full embedding request but still validates the key
-        self.predict(&["test"])?;
+        self.predict(&["test"], 0)?;
         Ok(())
     }
 }
diff --git a/embeddings/src/model/voyage_test.rs b/embeddings/src/model/voyage_test.rs
index 1a9a59a2..80c775cf 100644
--- a/embeddings/src/model/voyage_test.rs
+++ b/embeddings/src/model/voyage_test.rs
@@ -207,7 +207,7 @@ mod tests {
 
         // Test with real API key
         let texts = vec!["test"];
-        let result = model.predict(&texts);
+        let result = model.predict(&texts, 0);
         match result {
             Ok(embeddings) => {
                 // Should have one embedding for one text
@@ -245,7 +245,7 @@ mod tests {
         let model = VoyageModel::new("voyage/voyage-3-large", &api_key, None, None).unwrap();
 
         let empty_texts: Vec<&str> = vec![];
-        let result = model.predict(&empty_texts);
+        let result = model.predict(&empty_texts, 0);
         // Empty input should succeed with empty result
         match result {
             Ok(embeddings) => {
@@ -276,7 +276,7 @@ mod tests {
         // Test that the client is initialized
         // We can't directly test the client, but we can verify it exists by using it
         let texts = vec!["test"];
-        let _result = model.predict(&texts); // This will fail but proves client exists
+        let _result = model.predict(&texts, 0); // This will fail but proves client exists
     }
 
     #[test]
diff --git a/knn/embeddings.cpp b/knn/embeddings.cpp
index 2326e97a..caf30380 100644
--- a/knn/embeddings.cpp
+++ b/knn/embeddings.cpp
@@ -195,7 +195,7 @@ class TextToEmbeddings_c : public TextToEmbeddings_i
 			TextToEmbeddings_c ( const ModelSettings_t & tSettings ) : m_tSettings ( tSettings ) {}
 
 	bool	Initialize ( std::shared_ptr<LoadedLib_c> pLib, std::string & sError );
-	bool	Convert ( const std::vector<std::string_view> & dTexts, std::vector<std::vector<float>> & dEmbeddings, std::string & sError ) const override;
+	bool	Convert ( const std::vector<std::string_view> & dTexts, std::vector<std::vector<float>> & dEmbeddings, std::string & sError, int iThreads = 0 ) const override;
 	int		GetDims() const override;
 
 private:
@@ -255,7 +255,7 @@ bool TextToEmbeddings_c::Initialize ( std::shared_ptr<LoadedLib_c> pLib, std::st
 }
 
 
-bool TextToEmbeddings_c::Convert ( const std::vector<std::string_view> & dTexts, std::vector<std::vector<float>> & dEmbeddings, std::string & sError ) const
+bool TextToEmbeddings_c::Convert ( const std::vector<std::string_view> & dTexts, std::vector<std::vector<float>> & dEmbeddings, std::string & sError, int iThreads ) const
 {
 	std::vector<StringItem> dStringItems;
 	for ( const auto & i : dTexts )
@@ -264,7 +264,8 @@ bool TextToEmbeddings_c::Convert ( const std::vector<std::string_view> & dTexts,
 	auto * pFuncs = m_pLib->GetLibFuncs();
 	assert(pFuncs);
 
-	FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size() );
+	// iThreads: 0 = use all available CPUs (default), >0 = cap worker count in the embeddings lib
+	FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size(), iThreads );
 	if ( tVecResult.m_szError )
 	{
 		sError = tVecResult.m_szError;
diff --git a/knn/knn.h b/knn/knn.h
index 5eb92737..eb9bf369 100644
--- a/knn/knn.h
+++ b/knn/knn.h
@@ -138,7 +138,7 @@ class TextToEmbeddings_i
 public:
 	virtual			~TextToEmbeddings_i() = default;
 
-	virtual	bool	Convert ( const std::vector<std::string_view> & dTexts, std::vector<std::vector<float>> & dEmbeddings, std::string & sError ) const = 0;
+	virtual	bool	Convert ( const std::vector<std::string_view> & dTexts, std::vector<std::vector<float>> & dEmbeddings, std::string & sError, int iThreads = 0 ) const = 0;
 	virtual int		GetDims() const = 0;
 };
 

From 236abfadf97e69953bd0927653b5fba22c9e8289 Mon Sep 17 00:00:00 2001
From: Don Hardman <don.hardman@muvon.io>
Date: Wed, 13 May 2026 17:08:24 +0300
Subject: [PATCH 2/2] fix(knn): bump supported embeddings library version to 4

---
 knn/embeddings.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/knn/embeddings.cpp b/knn/embeddings.cpp
index caf30380..9b3a1f48 100644
--- a/knn/embeddings.cpp
+++ b/knn/embeddings.cpp
@@ -312,7 +312,7 @@ knn::EmbeddingsLib_i * LoadEmbeddingsLib ( const std::string & sLibPath, std::st
 	if ( !pLib->Load(sError) )
 		return nullptr;
 
-	const int SUPPORTED_EMBEDDINGS_LIB_VER = 3;
+	const int SUPPORTED_EMBEDDINGS_LIB_VER = 4;
 	if ( pLib->GetVersion()!=SUPPORTED_EMBEDDINGS_LIB_VER )
 	{
 		sError = util::FormatStr ( "Unsupported embeddings library version %d (expected %d)", pLib->GetVersion(), SUPPORTED_EMBEDDINGS_LIB_VER );