CloudLLM-ai · Copilot · Sep 30, 2025 · Sep 30, 2025 · Sep 30, 2025 · Sep 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,8 @@ Cargo.lock
 *.pdb
 
 .idea/
-run_example
+run_example
+
+# Temporary test binaries
+verify_pooling
+/tmp/*.rs
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,4 +17,6 @@ openai-rust2 = { version = "1.6.0" }
 
 async-trait = "0.1.88"
 log = "0.4.27"
-env_logger = "0.11.8"
+env_logger = "0.11.8"
+once_cell = "1.21"
+reqwest = { version = "0.12", features = ["json", "stream"] }
diff --git a/changelog.txt b/changelog.txt
@@ -1,3 +1,12 @@
+0.2.13 (Upcoming)
+ - Implemented HTTP client connection pooling for all providers:
+   - Created http_pool module that maintains singleton reqwest::Client instances per base URL
+   - Configured clients with persistent connections (90s idle timeout, 10 max idle per host)
+   - Added TCP keepalive (60s) to prevent connection drops
+   - Minimizes DNS lookups, TLS handshakes, and TCP connection overhead
+   - All clients (OpenAI, Gemini, Claude, Grok) automatically benefit from connection pooling
+   - Updated documentation to reflect connection pooling behavior
+
 0.2.12 SEP/21/2025
  - Added Claude client implementation at src/cloudllm/clients/claude.rs:
    - ClaudeClient struct follows the same delegate pattern as GrokClient, using OpenAIClient internally

diff --git a/src/cloudllm/clients/claude.rs b/src/cloudllm/clients/claude.rs
@@ -1,15 +1,20 @@
 use crate::client_wrapper::TokenUsage;
-use crate::clients::claude::Model::ClaudeSonnet4;
 use crate::clients::openai::OpenAIClient;
-use crate::{ClientWrapper, LLMSession, Message, Role};
+use crate::{ClientWrapper, Message};
 use async_trait::async_trait;
-use log::{error, info};
 use openai_rust2 as openai_rust;
-use openai_rust2::chat::SearchMode;
-use std::env;
 use std::error::Error;
 use std::sync::Mutex;
-use tokio::runtime::Runtime;
+
+#[cfg(test)]
+use {
+    std::env,
+    tokio::runtime::Runtime,
+    crate::LLMSession,
+    crate::Role,
+    crate::clients::claude::Model::ClaudeSonnet4,
+    log::{error, info},
+};
 
 pub struct ClaudeClient {
     delegate_client: OpenAIClient,

diff --git a/src/cloudllm/clients/gemini.rs b/src/cloudllm/clients/gemini.rs
@@ -1,15 +1,20 @@
 use crate::client_wrapper::TokenUsage;
 use crate::clients::common::send_and_track;
-use crate::clients::openai::OpenAIClient;
-use crate::{ClientWrapper, LLMSession, Message, Role};
+use crate::clients::http_pool::get_http_client;
+use crate::{ClientWrapper, Message, Role};
 use async_trait::async_trait;
-use log::{error, info};
+use log::error;
 use openai_rust::chat;
 use openai_rust2 as openai_rust;
-use std::env;
-use std::error::Error;
 use std::sync::Mutex;
-use tokio::runtime::Runtime;
+
+#[cfg(test)]
+use {
+    std::env,
+    tokio::runtime::Runtime,
+    crate::LLMSession,
+    log::info,
+};
 
 pub struct GeminiClient {
     client: openai_rust::Client,
@@ -148,10 +153,14 @@ pub fn model_to_string(model: Model) -> String {
 
 impl GeminiClient {
     pub fn new_with_model_string(secret_key: &str, model_name: &str) -> Self {
+        let base_url = "https://generativelanguage.googleapis.com/v1beta/";
+        let http_client = get_http_client(base_url);
+
         GeminiClient {
-            client: openai_rust::Client::new_with_base_url(
+            client: openai_rust::Client::new_with_client_and_base_url(
                 secret_key,
-                "https://generativelanguage.googleapis.com/v1beta/",
+                http_client,
+                base_url,
             ),
             model: model_name.to_string(),
             token_usage: Mutex::new(None),
@@ -165,8 +174,10 @@ impl GeminiClient {
     /// This function is used to create a GeminiClient with a custom base URL
     /// The default base URL is "<https://generativelanguage.googleapis.com/v1beta/>"
     pub fn new_with_base_url(secret_key: &str, model_name: &str, base_url: &str) -> Self {
+        let http_client = get_http_client(base_url);
+
         GeminiClient {
-            client: openai_rust::Client::new_with_base_url(secret_key, base_url),
+            client: openai_rust::Client::new_with_client_and_base_url(secret_key, http_client, base_url),
             model: model_name.to_string(),
             token_usage: Mutex::new(None),
         }

diff --git a/src/cloudllm/clients/grok.rs b/src/cloudllm/clients/grok.rs
@@ -1,15 +1,21 @@
 use crate::client_wrapper::TokenUsage;
-use crate::clients::grok::Model::Grok4_0709;
 use crate::clients::openai::OpenAIClient;
-use crate::{ClientWrapper, LLMSession, Message, Role};
+use crate::{ClientWrapper, Message};
 use async_trait::async_trait;
-use log::{error, info};
 use openai_rust2 as openai_rust;
-use openai_rust2::chat::SearchMode;
-use std::env;
 use std::error::Error;
 use std::sync::Mutex;
-use tokio::runtime::Runtime;
+
+#[cfg(test)]
+use {
+    std::env,
+    tokio::runtime::Runtime,
+    crate::LLMSession,
+    crate::Role,
+    crate::clients::grok::Model::Grok4_0709,
+    openai_rust2::chat::SearchMode,
+    log::{error, info},
+};
 
 pub struct GrokClient {
     delegate_client: OpenAIClient,

diff --git a/src/cloudllm/clients/http_pool.rs b/src/cloudllm/clients/http_pool.rs
@@ -0,0 +1,57 @@
+//! HTTP Client Pool for maintaining persistent connections per base URL.
+//!
+//! This module provides a singleton pool of reqwest::Client instances, one per base URL.
+//! This ensures that:
+//! - HTTP connections are reused across multiple requests (connection pooling)
+//! - DNS lookups are minimized
+//! - TLS handshakes are avoided by reusing already-established connections
+//! - TCP connections are kept alive to avoid reconnection overhead
+//!
+//! The reqwest::Client is configured with optimal settings for persistent connections:
+//! - `pool_idle_timeout`: Keeps idle connections alive for 90 seconds
+//! - `pool_max_idle_per_host`: Allows up to 10 idle connections per host
+//! - `tcp_keepalive`: Enables TCP keepalive (`SO_KEEPALIVE`) with a 60-second duration so dead idle connections are detected
+
+use once_cell::sync::Lazy;
+use reqwest;
+use std::collections::HashMap;
+use std::sync::Mutex;
+use std::time::Duration;
+
+/// Global HTTP client pool keyed by base URL string, lazily initialized on first access.
+static HTTP_CLIENT_POOL: Lazy<Mutex<HashMap<String, reqwest::Client>>> =
+    Lazy::new(|| Mutex::new(HashMap::new()));
+
+/// Get or create a shared HTTP client for the given base URL.
+///
+/// This function maintains a singleton pool of reqwest::Client instances.
+/// Each base URL gets its own client to ensure proper connection pooling.
+///
+/// # Arguments
+/// * `base_url` - Pool key for the client; matched as an exact string, so `https://host` and `https://host/` get separate clients
+///
+/// # Returns
+/// A clone of the pooled `reqwest::Client` for this base URL; clones are handles to the same underlying connection pool
+pub fn get_http_client(base_url: &str) -> reqwest::Client {
+    // Recover a poisoned lock instead of panicking: the map is only mutated by a
+    // single `insert`, so it stays consistent even if another caller panicked.
+    let mut pool = HTTP_CLIENT_POOL.lock().unwrap_or_else(|e| e.into_inner());
+
+    if let Some(client) = pool.get(base_url) {
+        return client.clone();
+    }
+
+    // First request for this base URL: build a client tuned for connection reuse.
+    let client = reqwest::ClientBuilder::new()
+        .pool_idle_timeout(Some(Duration::from_secs(90))) // keep idle connections for 90s
+        .pool_max_idle_per_host(10) // retain up to 10 idle connections per host
+        .tcp_keepalive(Some(Duration::from_secs(60))) // SO_KEEPALIVE with a 60s duration
+        // Overall deadline for a whole request, from connecting to the end of the body.
+        .timeout(Duration::from_secs(300))
+        .build()
+        .expect("Failed to build HTTP client");
+
+    // Cache under the exact `base_url` string; the stored and returned handles share a pool.
+    pool.insert(base_url.to_string(), client.clone());
+    client
+}
diff --git a/src/cloudllm/clients/mod.rs b/src/cloudllm/clients/mod.rs
@@ -1,5 +1,6 @@
 // src/clients/mod.rs
 pub mod common;
+pub mod http_pool;
 
 pub mod claude;
 pub mod gemini;

diff --git a/src/cloudllm/clients/openai.rs b/src/cloudllm/clients/openai.rs
@@ -7,6 +7,8 @@
 //! - **send_message(...)**: unchanged signature; returns a `Message` as before.
 //! - **Automatic Usage Capture**: stores the latest `TokenUsage` (input_tokens, output_tokens, total_tokens) internally.
 //! - **Inspect Usage**: call `get_last_usage()` after `send_message()` to retrieve actual usage stats.
+//! - **Connection Pooling**: Automatically uses persistent HTTP connections. Multiple `OpenAIClient` instances
+//!   with the same base URL share a connection pool, minimizing DNS lookups, TLS handshakes, and TCP overhead.
 //!
 //! # Example
 //!
@@ -41,21 +43,27 @@
 //!
 //! Make sure `OPENAI_API_KEY` is set and pick a valid model name (e.g. `"gpt-4.1-nano"`).
 
-use std::env;
 use std::error::Error;
 
 use async_trait::async_trait;
-use log::{error, info};
+use log::error;
 use openai_rust::chat;
 use openai_rust2 as openai_rust;
 
 use crate::client_wrapper::TokenUsage;
 use crate::clients::common::send_and_track;
+use crate::clients::http_pool::get_http_client;
 use crate::cloudllm::client_wrapper::{ClientWrapper, Message, Role};
 use std::sync::Mutex;
-use tokio::runtime::Runtime;
-use crate::clients::openai::Model::GPT5Nano;
-use crate::LLMSession;
+
+#[cfg(test)]
+use {
+    std::env,
+    tokio::runtime::Runtime,
+    crate::LLMSession,
+    crate::clients::openai::Model::GPT5Nano,
+    log::info,
+};
 
 pub enum Model {
     GPT5,            // Higher Reasoning, Medium speed, Text+Image input, Text output; input $1.25/1M tokens, cached input $0.125/1M tokens, output $10/1M tokens
@@ -119,16 +127,22 @@ impl OpenAIClient {
     }
 
     pub fn new_with_model_string(secret_key: &str, model_name: &str) -> Self {
+        // Default OpenAI host; used here only as the pool key for the shared HTTP client
+        let base_url = "https://api.openai.com";
+        let http_client = get_http_client(base_url);
+
         OpenAIClient {
-            client: openai_rust::Client::new(secret_key),
+            client: openai_rust::Client::new_with_client(secret_key, http_client),
             model: model_name.to_string(),
             token_usage: Mutex::new(None),
         }
     }
 
     pub fn new_with_base_url(secret_key: &str, model_name: &str, base_url: &str) -> Self {
+        let http_client = get_http_client(base_url);
+
         OpenAIClient {
-            client: openai_rust::Client::new_with_base_url(secret_key, base_url),
+            client: openai_rust::Client::new_with_client_and_base_url(secret_key, http_client, base_url),
             model: model_name.to_string(),
             token_usage: Mutex::new(None),
         }

diff --git a/src/lib.rs b/src/lib.rs
@@ -14,6 +14,14 @@
 //!   via client wrappers. For example, `OpenAIClient` serves as a client for OpenAI's ChatGPT, abstracting the interaction
 //!   specifics and presenting a unified interface.
 //!
+//! - **Connection Pooling**: All HTTP clients automatically use persistent connection pooling to minimize latency.
+//!   Each base URL (e.g., api.openai.com, api.anthropic.com) maintains its own connection pool with:
+//!   - Reused HTTP connections to avoid TCP handshake overhead
+//!   - Minimized DNS lookups through connection reuse
+//!   - Persistent TLS sessions to skip expensive handshakes
+//!   - TCP keepalive to prevent connection timeouts
+//!   This design ensures optimal performance in co-located and distributed deployments.
+//!
 //! ## The Road Ahead: LLM-VM Architecture
 //!
 //! The library is poised to evolve into a more sophisticated toolset with the introduction of the "LLM-VM" architecture.