From 21dde6cbdc30b7e7710f71108188516c5e690504 Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Wed, 13 May 2026 11:08:04 +0300 Subject: [PATCH 1/2] feat(embeddings): add parallel processing and thread count control - Add rayon dependency for parallel execution support - Update TextModel and model traits to include threads parameter - Implement scoped rayon thread pools for candle-based models - Update ONNX pipelined inference to respect worker limits - Modify FFI interface and C++ wrapper to accept thread count - Increment library version to 4 for breaking FFI changes - Propagate thread settings through KNN and text-to-vector pipelines - Update Jina, local models, and test suites for new signatures - Allow zero threads to signify use of all available CPUs BREAKING CHANGE: TextModel::predict and MakeVectEmbeddingsFn signatures changed to include thread count parameter. --- embeddings/Cargo.lock | 1 + embeddings/Cargo.toml | 1 + embeddings/manticoresearch_text_embeddings.h | 5 +- embeddings/src/ffi.rs | 4 +- embeddings/src/model/ffi_test.rs | 2 +- embeddings/src/model/jina.rs | 8 +- embeddings/src/model/jina_test.rs | 4 +- embeddings/src/model/local.rs | 85 ++++++++++++++++---- embeddings/src/model/local_test.rs | 36 ++++----- embeddings/src/model/mod.rs | 16 ++-- embeddings/src/model/openai.rs | 8 +- embeddings/src/model/openai_test.rs | 6 +- embeddings/src/model/text_model_wrapper.rs | 5 +- embeddings/src/model/voyage.rs | 8 +- embeddings/src/model/voyage_test.rs | 6 +- knn/embeddings.cpp | 7 +- knn/knn.h | 2 +- 17 files changed, 141 insertions(+), 63 deletions(-) diff --git a/embeddings/Cargo.lock b/embeddings/Cargo.lock index e99f2e10..bf55a38b 100644 --- a/embeddings/Cargo.lock +++ b/embeddings/Cargo.lock @@ -1899,6 +1899,7 @@ dependencies = [ "ndarray 0.16.1", "ort", "rand 0.8.5", + "rayon", "reqwest", "serde", "serde_json", diff --git a/embeddings/Cargo.toml b/embeddings/Cargo.toml index 6619a85b..c0b9abca 100644 --- a/embeddings/Cargo.toml +++ b/embeddings/Cargo.toml @@ -18,6 +18,7 @@ candle-core = "0.9.2" candle-nn = "0.9.2" candle-transformers = "0.9.2" ort = { version = "2.0.0-rc.9", default-features = false, features = ["std"] } +rayon = "1.11" [features] default = [] diff --git a/embeddings/manticoresearch_text_embeddings.h b/embeddings/manticoresearch_text_embeddings.h index c073e445..3a8d5ef3 100644 --- a/embeddings/manticoresearch_text_embeddings.h +++ b/embeddings/manticoresearch_text_embeddings.h @@ -47,7 +47,10 @@ struct StringItem { uintptr_t len; }; -using MakeVectEmbeddingsFn = FloatVecResult(*)(const TextModelWrapper*, const StringItem*, uintptr_t); +using MakeVectEmbeddingsFn = FloatVecResult(*)(const TextModelWrapper*, + const StringItem*, + uintptr_t, + int32_t); using FreeVecResultFn = void(*)(FloatVecResult); diff --git a/embeddings/src/ffi.rs b/embeddings/src/ffi.rs index a104f948..0681b6ac 100644 --- a/embeddings/src/ffi.rs +++ b/embeddings/src/ffi.rs @@ -19,7 +19,7 @@ type LoadModelFn = extern "C" fn( type FreeModelResultFn = extern "C" fn(TextModelResult); type MakeVectEmbeddingsFn = - extern "C" fn(&TextModelWrapper, *const StringItem, usize) -> FloatVecResult; + extern "C" fn(&TextModelWrapper, *const StringItem, usize, i32) -> FloatVecResult; type FreeVecResultFn = extern "C" fn(FloatVecResult); @@ -62,7 +62,7 @@ pub struct EmbedLib { const VERSION_STR: &[u8] = concat!(env!("EMBEDDINGS_VERSION_STR"), "\0").as_bytes(); const LIB: EmbedLib = EmbedLib { - version: 3usize, + version: 4usize, version_str: VERSION_STR.as_ptr() as *const c_char, load_model: TextModelWrapper::load_model, free_model_result: TextModelWrapper::free_model_result, diff --git a/embeddings/src/model/ffi_test.rs b/embeddings/src/model/ffi_test.rs index 4833c525..322012fa 100644 --- a/embeddings/src/model/ffi_test.rs +++ b/embeddings/src/model/ffi_test.rs @@ -76,7 +76,7 @@ mod tests { ) }; let vec_result = - TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1); + TextModelWrapper::make_vect_embeddings(&wrapper, items.as_ptr(), 1, 0); assert!(vec_result.error.is_null()); assert_eq!(vec_result.len, 1); TextModelWrapper::free_vec_result(vec_result); diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs index fa029193..5dd6bdfe 100644 --- a/embeddings/src/model/jina.rs +++ b/embeddings/src/model/jina.rs @@ -67,7 +67,11 @@ impl JinaModel { } impl TextModel for JinaModel { - fn predict(&self, texts: &[&str]) -> Result>, Box> { + fn predict( + &self, + texts: &[&str], + _threads: usize, + ) -> Result>, Box> { let url = self .api_url .as_deref() @@ -308,7 +312,7 @@ impl TextModel for JinaModel { fn validate_api_key(&self) -> Result<(), Box> { // Make a minimal test request with a single character to validate the API key // This is cheaper than a full embedding request but still validates the key - self.predict(&["test"])?; + self.predict(&["test"], 0)?; Ok(()) } } diff --git a/embeddings/src/model/jina_test.rs b/embeddings/src/model/jina_test.rs index 3c6a2896..bef1aff1 100644 --- a/embeddings/src/model/jina_test.rs +++ b/embeddings/src/model/jina_test.rs @@ -119,7 +119,7 @@ mod tests { // Test with real API key let texts = vec!["test"]; - let result = model.predict(&texts); + let result = model.predict(&texts, 0); match result { Ok(embeddings) => { // Should have one embedding for one text @@ -157,7 +157,7 @@ mod tests { let model = JinaModel::new("jina/jina-embeddings-v3", &api_key, None, None).unwrap(); let empty_texts: Vec<&str> = vec![]; - let result = model.predict(&empty_texts); + let result = model.predict(&empty_texts, 0); // Empty input should succeed with empty result match result { Ok(embeddings) => { diff --git a/embeddings/src/model/local.rs b/embeddings/src/model/local.rs index 5f552c53..8947ba69 100644 --- a/embeddings/src/model/local.rs +++ b/embeddings/src/model/local.rs @@ -55,6 +55,38 @@ fn intra_threads() -> usize { .unwrap_or(DEFAULT_INTRA_THREADS) } +/// Run `f` inside a scoped rayon thread pool of the requested size. +/// +/// `threads == 0` means "no cap": just run `f` on the existing (global) rayon pool, +/// which by default uses every available CPU. `threads > 0` builds a fresh pool +/// of that size (clamped to available CPUs) and installs it for the duration of `f`, +/// so candle's intra-op rayon work and tokenizers' parallelism both respect the limit. +/// +/// Errors are stringified across the `pool.install` boundary because `Box` +/// is not `Send`; the original error message is preserved. +fn with_thread_limit(threads: usize, f: F) -> Result>, Box> +where + F: FnOnce() -> Result>, Box> + Send, +{ + if threads == 0 { + return f(); + } + + let n = threads.min(available_cpus()).max(1); + let pool = match rayon::ThreadPoolBuilder::new().num_threads(n).build() { + Ok(p) => p, + // If the pool can't be built (extremely unlikely), fall back to the global pool + // rather than failing the whole inference. + Err(_) => return f(), + }; + + // pool.install requires `R: Send`, but Box isn't Send. Stringify the + // error inside the closure and re-wrap it on the way out — keeps the message, + // satisfies the Send bound, and avoids forcing every model's error into Send+Sync. + pool.install(|| f().map_err(|e| e.to_string())) + .map_err(|s| -> Box { s.into() }) +} + /// Thread-safe session wrapper with platform-specific strategy: /// - Linux/macOS: UnsafeCell for concurrent Run() (ORT C API is thread-safe) /// - Windows: Mutex for serialized Run() (Windows ORT has threading issues) @@ -967,7 +999,13 @@ impl OnnxEmbeddingModel { /// - Large input (> batch_size): splits into num_cpus concurrent single-doc workers. /// Each worker tokenizes + infers 1 doc at a time through SessionWrapper, /// mimicking the concurrent caller pattern that gives best throughput. - fn predict_pipelined(&self, texts: &[&str]) -> Result>, Box> { + /// + /// `threads` caps the worker count. 0 means "use all available CPUs". + fn predict_pipelined( + &self, + texts: &[&str], + threads: usize, + ) -> Result>, Box> { let bs = batch_size(); let max_input = self.max_input_len; let session = &self.session; @@ -984,8 +1022,14 @@ impl OnnxEmbeddingModel { // Adaptive parallelism: scale workers with input size. // Each worker needs at least batch_size docs to justify thread overhead. - // Cap at available CPUs — more workers than cores adds contention. - let num_workers = (texts.len() / bs).min(available_cpus()).max(1); + // Worker cap is the caller-supplied `threads` limit when > 0, + // otherwise fall back to all available CPUs. + let thread_cap = if threads > 0 { + threads.min(available_cpus()) + } else { + available_cpus() + }; + let num_workers = (texts.len() / bs).min(thread_cap).max(1); let docs_per_worker = texts.len().div_ceil(num_workers); let mut ordered_results: Vec>> = Vec::with_capacity(num_workers); @@ -1146,19 +1190,15 @@ impl LocalModel { } } -impl TextModel for LocalModel { - fn predict(&self, texts: &[&str]) -> Result>, Box> { - // BERT and ONNX: batched path (batch_size up to batch_size() per forward pass) - match self { - LocalModel::Bert(m) => { - return Self::predict_batched(&m.tokenizer, m.max_input_len, texts, |chunks| { - m.predict_chunks(chunks) - }); - } - LocalModel::Onnx(m) => { - return m.predict_pipelined(texts); - } - _ => {} // fall through to sequential path +impl LocalModel { + /// Inner predict body for non-ONNX local models (BERT / T5 / Causal / Quantized). + /// Pulled out of the trait impl so the caller can wrap it in a scoped rayon pool. + fn predict_local(&self, texts: &[&str]) -> Result>, Box> { + // BERT: batched path (batch_size up to batch_size() per forward pass) + if let LocalModel::Bert(m) = self { + return Self::predict_batched(&m.tokenizer, m.max_input_len, texts, |chunks| { + m.predict_chunks(chunks) + }); } // Sequential path for T5, Causal, Quantized (these use KV caches / mutexes) @@ -1288,6 +1328,19 @@ impl TextModel for LocalModel { Ok(all_results) } +} + +impl TextModel for LocalModel { + fn predict(&self, texts: &[&str], threads: usize) -> Result>, Box> { + // ONNX manages its own worker count internally — no rayon pool involved. + if let LocalModel::Onnx(m) = self { + return m.predict_pipelined(texts, threads); + } + + // BERT / T5 / Causal / Quantized go through candle, which uses rayon for + // intra-op parallelism. Scope the rayon pool so threads > 0 caps the worker count. + with_thread_limit(threads, || self.predict_local(texts)) + } fn get_hidden_size(&self) -> usize { match self { diff --git a/embeddings/src/model/local_test.rs b/embeddings/src/model/local_test.rs index 4627cefc..6bd9dad1 100644 --- a/embeddings/src/model/local_test.rs +++ b/embeddings/src/model/local_test.rs @@ -417,7 +417,7 @@ mod tests { for sentence in &test_sentences { let local_model = LocalModel::new(model_id, cache_path.clone(), false, None).unwrap(); - let embedding = local_model.predict(&[sentence]).unwrap(); + let embedding = local_model.predict(&[sentence], 0).unwrap(); check_embedding_properties(&embedding[0], local_model.get_hidden_size()); } } @@ -429,8 +429,8 @@ mod tests { let local_model = LocalModel::new(model_id, cache_path, false, None).unwrap(); let sentence = &["This is a test sentence."]; - let embedding1 = local_model.predict(sentence).unwrap(); - let embedding2 = local_model.predict(sentence).unwrap(); + let embedding1 = local_model.predict(sentence, 0).unwrap(); + let embedding2 = local_model.predict(sentence, 0).unwrap(); for (e1, e2) in embedding1[0].iter().zip(embedding2[0].iter()) { assert_abs_diff_eq!(e1, e2, epsilon = 1e-6); @@ -466,7 +466,7 @@ mod tests { let test_text = &["This is a test sentence for Qwen embedding model."]; let embeddings = local_model - .predict(test_text) + .predict(test_text, 0) .expect("Qwen model should generate embeddings"); check_embedding_properties(&embeddings[0], local_model.get_hidden_size()); @@ -482,7 +482,7 @@ mod tests { .expect("Llama model should load"); let test_text = &["This is a test sentence for Llama embedding model."]; - let embeddings = local_model.predict(test_text).unwrap(); + let embeddings = local_model.predict(test_text, 0).unwrap(); check_embedding_properties(&embeddings[0], local_model.get_hidden_size()); } @@ -496,7 +496,7 @@ mod tests { let local_model = LocalModel::new(model_id, cache_path.clone(), false, None) .expect("Mistral model should load"); let test_text = &["This is a test sentence for Mistral embedding model."]; - let embeddings = local_model.predict(test_text).unwrap(); + let embeddings = local_model.predict(test_text, 0).unwrap(); check_embedding_properties(&embeddings[0], local_model.get_hidden_size()); } @@ -510,7 +510,7 @@ mod tests { .expect("Gemma model should load"); let test_text = &["This is a test sentence for Gemma embedding model."]; - let embeddings = local_model.predict(test_text).unwrap(); + let embeddings = local_model.predict(test_text, 0).unwrap(); check_embedding_properties(&embeddings[0], local_model.get_hidden_size()); } @@ -536,7 +536,7 @@ mod tests { "Third sentence for batch processing verification.", ]; - let embeddings = local_model.predict(test_texts).unwrap(); + let embeddings = local_model.predict(test_texts, 0).unwrap(); assert_eq!(embeddings.len(), test_texts.len()); @@ -660,7 +660,7 @@ mod tests { // Test embedding generation let test_text = &["This is a test sentence for FRIDA embedding model."]; let embeddings = local_model - .predict(test_text) + .predict(test_text, 0) .expect("FRIDA model should generate embeddings"); assert_eq!(embeddings.len(), 1, "Should return one embedding"); @@ -697,7 +697,7 @@ mod tests { // Test embedding generation let test_text = &["This is a test sentence for Google embeddinggemma model."]; let embeddings = local_model - .predict(test_text) + .predict(test_text, 0) .expect("Google embeddinggemma should generate embeddings"); assert_eq!(embeddings.len(), 1, "Should return one embedding"); @@ -739,7 +739,7 @@ mod tests { let test_text = &["This is a test sentence for ONNX embedding model."]; let embeddings = local_model - .predict(test_text) + .predict(test_text, 0) .expect("ONNX model should generate embeddings"); assert_eq!(embeddings.len(), 1); @@ -761,8 +761,8 @@ mod tests { }; let sentence = &["This is a test sentence."]; - let embedding1 = local_model.predict(sentence).unwrap(); - let embedding2 = local_model.predict(sentence).unwrap(); + let embedding1 = local_model.predict(sentence, 0).unwrap(); + let embedding2 = local_model.predict(sentence, 0).unwrap(); for (e1, e2) in embedding1[0].iter().zip(embedding2[0].iter()) { assert_abs_diff_eq!(e1, e2, epsilon = 1e-6); @@ -789,7 +789,7 @@ mod tests { "Third sentence for batch processing.", ]; - let embeddings = local_model.predict(test_texts).unwrap(); + let embeddings = local_model.predict(test_texts, 0).unwrap(); assert_eq!(embeddings.len(), test_texts.len()); for embedding in &embeddings { @@ -850,20 +850,20 @@ mod tests { }; // Warmup - let _ = st.predict(sentences).unwrap(); - let _ = onnx.predict(sentences).unwrap(); + let _ = st.predict(sentences, 0).unwrap(); + let _ = onnx.predict(sentences, 0).unwrap(); // Safetensors timing let start = Instant::now(); for _ in 0..iterations { - let _ = st.predict(sentences).unwrap(); + let _ = st.predict(sentences, 0).unwrap(); } let st_ms = start.elapsed().as_millis() as f64 / iterations as f64; // ONNX timing let start = Instant::now(); for _ in 0..iterations { - let _ = onnx.predict(sentences).unwrap(); + let _ = onnx.predict(sentences, 0).unwrap(); } let onnx_ms = start.elapsed().as_millis() as f64 / iterations as f64; diff --git a/embeddings/src/model/mod.rs b/embeddings/src/model/mod.rs index d675cfba..c8769043 100644 --- a/embeddings/src/model/mod.rs +++ b/embeddings/src/model/mod.rs @@ -23,7 +23,11 @@ use std::error::Error; use std::path::PathBuf; pub trait TextModel { - fn predict(&self, texts: &[&str]) -> Result>, Box>; + /// Generate embeddings for the given texts. + /// + /// `threads` caps the number of CPU threads used during generation. + /// 0 means "use all available CPUs" (default). + fn predict(&self, texts: &[&str], threads: usize) -> Result>, Box>; fn get_hidden_size(&self) -> usize; fn get_max_input_len(&self) -> usize; /// Validates the API key by making a minimal test request to the API. @@ -57,12 +61,12 @@ pub enum Model { } impl TextModel for Model { - fn predict(&self, texts: &[&str]) -> Result>, Box> { + fn predict(&self, texts: &[&str], threads: usize) -> Result>, Box> { match self { - Model::OpenAI(m) => m.predict(texts), - Model::Voyage(m) => m.predict(texts), - Model::Jina(m) => m.predict(texts), - Model::Local(m) => m.predict(texts), + Model::OpenAI(m) => m.predict(texts, threads), + Model::Voyage(m) => m.predict(texts, threads), + Model::Jina(m) => m.predict(texts, threads), + Model::Local(m) => m.predict(texts, threads), } } diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs index b880c116..c012c288 100644 --- a/embeddings/src/model/openai.rs +++ b/embeddings/src/model/openai.rs @@ -57,7 +57,11 @@ impl OpenAIModel { } impl TextModel for OpenAIModel { - fn predict(&self, texts: &[&str]) -> Result>, Box> { + fn predict( + &self, + texts: &[&str], + _threads: usize, + ) -> Result>, Box> { let url = self .api_url .as_deref() @@ -182,7 +186,7 @@ impl TextModel for OpenAIModel { fn validate_api_key(&self) -> Result<(), Box> { // Make a minimal test request with a single character to validate the API key // This is cheaper than a full embedding request but still validates the key - self.predict(&["test"])?; + self.predict(&["test"], 0)?; Ok(()) } } diff --git a/embeddings/src/model/openai_test.rs b/embeddings/src/model/openai_test.rs index 8a24f8e0..16da9141 100644 --- a/embeddings/src/model/openai_test.rs +++ b/embeddings/src/model/openai_test.rs @@ -191,7 +191,7 @@ mod tests { // Test with real API key let texts = vec!["test"]; - let result = model.predict(&texts); + let result = model.predict(&texts, 0); match result { Ok(embeddings) => { @@ -231,7 +231,7 @@ mod tests { OpenAIModel::new("openai/text-embedding-ada-002", &api_key, None, None).unwrap(); let empty_texts: Vec<&str> = vec![]; - let result = model.predict(&empty_texts); + let result = model.predict(&empty_texts, 0); // Empty input should succeed with empty result match result { @@ -263,7 +263,7 @@ mod tests { // Test that the client is initialized // We can't directly test the client, but we can verify it exists by using it let texts = vec!["test"]; - let _result = model.predict(&texts); // This will fail but proves client exists + let _result = model.predict(&texts, 0); // This will fail but proves client exists } #[test] diff --git a/embeddings/src/model/text_model_wrapper.rs b/embeddings/src/model/text_model_wrapper.rs index bcb7dc5d..e0ca99a4 100644 --- a/embeddings/src/model/text_model_wrapper.rs +++ b/embeddings/src/model/text_model_wrapper.rs @@ -127,6 +127,7 @@ impl TextModelWrapper { &self, texts: *const StringItem, count: usize, + threads: i32, // 0 = use all available CPUs, >0 = cap thread count ) -> FloatVecResult { let string_slice = unsafe { std::slice::from_raw_parts(texts, count) }; @@ -140,9 +141,11 @@ impl TextModelWrapper { }) .collect(); + let threads = if threads > 0 { threads as usize } else { 0 }; + let mut float_vec_list: Vec = Vec::new(); let model = self.as_model(); - let embeddings_list = model.predict(&string_refs); + let embeddings_list = model.predict(&string_refs, threads); let c_error = match embeddings_list { Ok(embeddings_list) => { for embeddings in embeddings_list.iter() { diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs index 1374dd4e..3d3d8744 100644 --- a/embeddings/src/model/voyage.rs +++ b/embeddings/src/model/voyage.rs @@ -58,7 +58,11 @@ impl VoyageModel { } impl TextModel for VoyageModel { - fn predict(&self, texts: &[&str]) -> Result>, Box> { + fn predict( + &self, + texts: &[&str], + _threads: usize, + ) -> Result>, Box> { let url = self .api_url .as_deref() @@ -221,7 +225,7 @@ impl TextModel for VoyageModel { fn validate_api_key(&self) -> Result<(), Box> { // Make a minimal test request with a single character to validate the API key // This is cheaper than a full embedding request but still validates the key - self.predict(&["test"])?; + self.predict(&["test"], 0)?; Ok(()) } } diff --git a/embeddings/src/model/voyage_test.rs b/embeddings/src/model/voyage_test.rs index 1a9a59a2..80c775cf 100644 --- a/embeddings/src/model/voyage_test.rs +++ b/embeddings/src/model/voyage_test.rs @@ -207,7 +207,7 @@ mod tests { // Test with real API key let texts = vec!["test"]; - let result = model.predict(&texts); + let result = model.predict(&texts, 0); match result { Ok(embeddings) => { // Should have one embedding for one text @@ -245,7 +245,7 @@ mod tests { let model = VoyageModel::new("voyage/voyage-3-large", &api_key, None, None).unwrap(); let empty_texts: Vec<&str> = vec![]; - let result = model.predict(&empty_texts); + let result = model.predict(&empty_texts, 0); // Empty input should succeed with empty result match result { Ok(embeddings) => { @@ -276,7 +276,7 @@ mod tests { // Test that the client is initialized // We can't directly test the client, but we can verify it exists by using it let texts = vec!["test"]; - let _result = model.predict(&texts); // This will fail but proves client exists + let _result = model.predict(&texts, 0); // This will fail but proves client exists } #[test] diff --git a/knn/embeddings.cpp b/knn/embeddings.cpp index 2326e97a..caf30380 100644 --- a/knn/embeddings.cpp +++ b/knn/embeddings.cpp @@ -195,7 +195,7 @@ class TextToEmbeddings_c : public TextToEmbeddings_i TextToEmbeddings_c ( const ModelSettings_t & tSettings ) : m_tSettings ( tSettings ) {} bool Initialize ( std::shared_ptr pLib, std::string & sError ); - bool Convert ( const std::vector & dTexts, std::vector> & dEmbeddings, std::string & sError ) const override; + bool Convert ( const std::vector & dTexts, std::vector> & dEmbeddings, std::string & sError, int iThreads = 0 ) const override; int GetDims() const override; private: @@ -255,7 +255,7 @@ bool TextToEmbeddings_c::Initialize ( std::shared_ptr pLib, std::st } -bool TextToEmbeddings_c::Convert ( const std::vector & dTexts, std::vector> & dEmbeddings, std::string & sError ) const +bool TextToEmbeddings_c::Convert ( const std::vector & dTexts, std::vector> & dEmbeddings, std::string & sError, int iThreads ) const { std::vector dStringItems; for ( const auto & i : dTexts ) @@ -264,7 +264,8 @@ bool TextToEmbeddings_c::Convert ( const std::vector & dTexts, auto * pFuncs = m_pLib->GetLibFuncs(); assert(pFuncs); - FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size() ); + // iThreads: 0 = use all available CPUs (default), >0 = cap worker count in the embeddings lib + FloatVecResult tVecResult = pFuncs->make_vect_embeddings ( &m_pModel, dStringItems.data(), dStringItems.size(), iThreads ); if ( tVecResult.m_szError ) { sError = tVecResult.m_szError; diff --git a/knn/knn.h b/knn/knn.h index 5eb92737..eb9bf369 100644 --- a/knn/knn.h +++ b/knn/knn.h @@ -138,7 +138,7 @@ class TextToEmbeddings_i public: virtual ~TextToEmbeddings_i() = default; - virtual bool Convert ( const std::vector & dTexts, std::vector> & dEmbeddings, std::string & sError ) const = 0; + virtual bool Convert ( const std::vector & dTexts, std::vector> & dEmbeddings, std::string & sError, int iThreads = 0 ) const = 0; virtual int GetDims() const = 0; }; From 236abfadf97e69953bd0927653b5fba22c9e8289 Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Wed, 13 May 2026 17:08:24 +0300 Subject: [PATCH 2/2] fix(knn): bump supported embeddings library version to 4 --- knn/embeddings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/knn/embeddings.cpp b/knn/embeddings.cpp index caf30380..9b3a1f48 100644 --- a/knn/embeddings.cpp +++ b/knn/embeddings.cpp @@ -312,7 +312,7 @@ knn::EmbeddingsLib_i * LoadEmbeddingsLib ( const std::string & sLibPath, std::st if ( !pLib->Load(sError) ) return nullptr; - const int SUPPORTED_EMBEDDINGS_LIB_VER = 3; + const int SUPPORTED_EMBEDDINGS_LIB_VER = 4; if ( pLib->GetVersion()!=SUPPORTED_EMBEDDINGS_LIB_VER ) { sError = util::FormatStr ( "Unsupported embeddings library version %d (expected %d)", pLib->GetVersion(), SUPPORTED_EMBEDDINGS_LIB_VER );