diff --git a/OpenAI_Open_Source_Pro_Mode.ipynb b/OpenAI_Open_Source_Pro_Mode_Groq.ipynb
similarity index 100%
rename from OpenAI_Open_Source_Pro_Mode.ipynb
rename to OpenAI_Open_Source_Pro_Mode_Groq.ipynb
diff --git a/OpenAI_Open_Source_Pro_Mode_Ollama_Local.ipynb b/OpenAI_Open_Source_Pro_Mode_Ollama_Local.ipynb
new file mode 100644
index 0000000..fe4a172
--- /dev/null
+++ b/OpenAI_Open_Source_Pro_Mode_Ollama_Local.ipynb
@@ -0,0 +1,421 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "view-in-github"
+ },
+ "source": [
+    ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3LLlSvTtU0Y3"
+ },
+ "source": [
+ "Made by Matt Shumer ([@mattshumer_](https://x.com/mattshumer_) on X)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "cellView": "form",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dzWS8LaqUw3N",
+ "outputId": "d4031e82-2131-4208-8f62-3923baca7cc4"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: ollama in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (0.4.5)\n",
+ "Requirement already satisfied: httpx<0.28.0,>=0.27.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from ollama) (0.27.2)\n",
+ "Requirement already satisfied: pydantic<3.0.0,>=2.9.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from ollama) (2.11.7)\n",
+ "Requirement already satisfied: anyio in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (4.9.0)\n",
+ "Requirement already satisfied: certifi in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (2024.8.30)\n",
+ "Requirement already satisfied: httpcore==1.* in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (1.0.5)\n",
+ "Requirement already satisfied: idna in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (3.8)\n",
+ "Requirement already satisfied: sniffio in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (1.3.1)\n",
+ "Requirement already satisfied: h11<0.15,>=0.13 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama) (0.14.0)\n",
+ "Requirement already satisfied: annotated-types>=0.6.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (0.7.0)\n",
+ "Requirement already satisfied: pydantic-core==2.33.2 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (2.33.2)\n",
+ "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (4.12.2)\n",
+ "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (0.4.1)\n",
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "# @title Run this cell to set up Pro Mode\n",
+ "!pip3 install ollama\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "from typing import List, Dict, Any\n",
+ "import time, os\n",
+ "import concurrent.futures as cf\n",
+ "import ollama\n",
+ "\n",
+ "MODEL = \"gpt-oss:120b\"\n",
+    "MAX_COMPLETION_TOKENS = 30000  # generation cap, passed to Ollama as the num_predict option\n",
+ "\n",
+ "\n",
+    "def _one_completion(client: ollama.Client, question: str, temperature: float) -> str:\n",
+    "    \"\"\"\n",
+    "    Send a question to the Ollama chat endpoint and return the response text.\n",
+    "    Retries up to three times with exponential backoff on failure.\n",
+    "    \"\"\"\n",
+ " delay = 0.5\n",
+ " for attempt in range(3):\n",
+ " try:\n",
+ " response = client.chat(\n",
+ " model=MODEL,\n",
+ " messages=[\n",
+ " {'role': 'user', 'content': question},\n",
+ " ],\n",
+    "            options={'temperature': temperature, 'num_predict': MAX_COMPLETION_TOKENS}\n",
+ " )\n",
+ " return response['message']['content']\n",
+    "        except Exception:\n",
+ " if attempt == 2:\n",
+ " raise\n",
+ " time.sleep(delay)\n",
+ " delay *= 2\n",
+ "\n",
+ "\n",
+ "def _build_synthesis_messages(candidates: List[str]) -> List[Dict[str, str]]:\n",
+ " numbered = \"\\n\\n\".join(\n",
+    "        f\"<candidate {i+1}>\\n{txt}\\n</candidate {i+1}>\" for i, txt in enumerate(candidates)\n",
+ " )\n",
+ " system = (\n",
+ " \"You are an expert editor. Synthesize ONE best answer from the candidate \"\n",
+ " \"answers provided, merging strengths, correcting errors, and removing repetition. \"\n",
+ " \"Do not mention the candidates or the synthesis process. Be decisive and clear.\"\n",
+ " )\n",
+ " user = (\n",
+    "        f\"You are given {len(candidates)} candidate answers, each delimited by <candidate i> tags.\\n\\n\"\n",
+ " f\"{numbered}\\n\\nReturn the single best final answer.\"\n",
+ " )\n",
+ " return [{\"role\": \"system\", \"content\": system},\n",
+ " {\"role\": \"user\", \"content\": user}]\n",
+ "\n",
+    "def pro_mode(client: ollama.Client, prompt: str, n_runs: int) -> Dict[str, Any]:\n",
+    "    \"\"\"\n",
+    "    Fan out n_runs parallel generations at T=0.9 and synthesize a final answer at T=0.2.\n",
+    "    Returns: {\"final\": str, \"candidates\": List[str]}\n",
+    "    \"\"\"\n",
+ " assert n_runs >= 1, \"n_runs must be >= 1\"\n",
+ "\n",
+ " # Parallel candidate generations (threaded; Colab-friendly)\n",
+ " max_workers = min(n_runs, 16)\n",
+ " candidates: List[str] = [None] * n_runs # preserve order\n",
+ " with cf.ThreadPoolExecutor(max_workers=max_workers) as ex:\n",
+ " fut_to_idx = {\n",
+ " ex.submit(_one_completion, client, prompt, 0.9): i\n",
+ " for i in range(n_runs)\n",
+ " }\n",
+ " for fut in cf.as_completed(fut_to_idx):\n",
+ " i = fut_to_idx[fut]\n",
+ " candidates[i] = fut.result()\n",
+ "\n",
+ " # Synthesis pass\n",
+ " messages = _build_synthesis_messages(candidates)\n",
+ " final_resp = client.chat(\n",
+ " model=MODEL,\n",
+ " messages=messages,\n",
+    "        options={'temperature': 0.2, 'num_predict': MAX_COMPLETION_TOKENS}\n",
+ " )\n",
+ " final = final_resp['message']['content']\n",
+ "\n",
+ " return {\"final\": final, \"candidates\": candidates}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "OoBF5UbSVksz"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "=== FINAL ===\n",
+ " **Self‑Play in Reinforcement Learning**\n",
+ "\n",
+ "Self‑play is a training paradigm in which an RL agent learns by repeatedly playing against a copy of itself (or against a mixture of its past copies). Because the opponent’s strength automatically tracks the learner’s strength, the task provides a **self‑generated curriculum**: the game is never too easy and never hopelessly hard. The only requirement is a simulator of the environment; no external data or hand‑crafted opponent is needed.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 1. Core Loop\n",
+ "\n",
+ "| Step | What happens | Why it matters |\n",
+ "|------|--------------|----------------|\n",
+ "| **1️⃣ Initialise** | Randomly initialise a policy (and optionally a value) network θ. | Gives the agent a starting point. |\n",
+ "| **2️⃣ Create opponent** | Keep a *lagged* copy θ̄ (the “target” network) that will act as the opponent. | Prevents the opponent from being identical at every move, which would give no learning signal. |\n",
+ "| **3️⃣ Self‑play** | Play many episodes where the current policy θ and the opponent θ̄ alternate turns, recording every (state, action, reward, done) tuple. | Generates the data on which the agent will improve. |\n",
+ "| **4️⃣ Compute targets** | From the trajectories compute returns (or advantages) and, if using a value head, a baseline. | Provides the supervised signal for policy‑gradient or value‑based updates. |\n",
+ "| **5️⃣ Update θ** | Perform an RL update (PPO, A2C, REINFORCE + baseline, or AlphaZero‑style loss) on the collected data. | Improves the policy so that actions leading to higher returns become more likely. |\n",
+ "| **6️⃣ Refresh opponent** | Every *K* training steps copy the updated θ into θ̄ (or perform a soft update θ̄←τθ+(1‑τ)θ̄). | Keeps the opponent slightly behind the learner, preserving a moving‑target curriculum. |\n",
+ "| **7️⃣ Evaluate** | Periodically test θ against a simple baseline (random, minimax, or the previous version) and log win‑rates. | Gives an external sanity check and a metric of progress. |\n",
+ "\n",
+ "The loop repeats until performance plateaus or a desired level is reached.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 2. Concrete Example – Connect‑4 with PPO\n",
+ "\n",
+ "Below is a compact, runnable‑style pseudocode that demonstrates the full pipeline for the classic two‑player board game **Connect‑4** (7 × 6 grid, win by aligning four discs). The same structure works for any turn‑based, zero‑sum game.\n",
+ "\n",
+ "```python\n",
+ "# -------------------------------------------------\n",
+ "# 1. Initialise networks\n",
+ "# -------------------------------------------------\n",
+ "policy_net = PolicyCNN() # outputs logits for 7 columns\n",
+ "optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-3)\n",
+ "\n",
+ "# -------------------------------------------------\n",
+ "# 2. Lagged opponent (target network)\n",
+ "# -------------------------------------------------\n",
+ "target_net = copy.deepcopy(policy_net) # frozen copy\n",
+ "\n",
+ "# -------------------------------------------------\n",
+ "# 3. Hyper‑parameters\n",
+ "# -------------------------------------------------\n",
+ "NUM_SELFPLAY_GAMES = 1000 # games per training epoch\n",
+ "UPDATE_EVERY = 200 # refresh opponent every 200 epochs\n",
+ "BATCH_SIZE = 64\n",
+ "CLIP_EPS = 0.2 # PPO clipping\n",
+ "GAMMA = 0.99\n",
+ "\n",
+ "# -------------------------------------------------\n",
+ "# 4. Experience buffer\n",
+ "# -------------------------------------------------\n",
+ "replay = [] # list of (state, action, reward, done)\n",
+ "\n",
+ "def play_one_game(policy, opponent):\n",
+ " \"\"\"Play a full Connect‑4 game, store (s,a,r,done) from the learner’s view.\"\"\"\n",
+ " env = Connect4Env()\n",
+ " traj = []\n",
+ " player = 1 # 1 = policy, -1 = opponent\n",
+ "\n",
+ " while not env.done:\n",
+ " # Choose action with the appropriate network\n",
+ " net = policy if player == 1 else opponent\n",
+ " logits = net(env.board_tensor())\n",
+ " dist = torch.distributions.Categorical(logits=logits)\n",
+ " action = dist.sample().item()\n",
+ "\n",
+ " # Step the environment (reward is from the *current* player’s perspective)\n",
+ " next_state, reward, done, _ = env.step(action, player)\n",
+ "\n",
+ " traj.append((env.board_tensor(), action, reward, done))\n",
+ " player *= -1 # swap turn\n",
+ "\n",
+ " # Convert opponent‑reward to learner‑reward and store\n",
+ " for i, (s,a,r,d) in enumerate(traj):\n",
+ " if i % 2 == 1: # opponent’s move\n",
+ " r = -r\n",
+ " replay.append((s, a, r, d))\n",
+ "\n",
+ " return traj\n",
+ "\n",
+ "# -------------------------------------------------\n",
+ "# 5. Main training loop\n",
+ "# -------------------------------------------------\n",
+ "for epoch in range(1, 10001):\n",
+ " # ---- Self‑play -------------------------------------------------\n",
+ " for _ in range(NUM_SELFPLAY_GAMES):\n",
+ " play_one_game(policy_net, target_net)\n",
+ "\n",
+ " # ---- PPO update ------------------------------------------------\n",
+ " # Sample a random minibatch\n",
+ " batch = random.sample(replay, BATCH_SIZE)\n",
+ " states, actions, rewards, dones = zip(*batch)\n",
+ "\n",
+ " states = torch.stack(states) # (B, C, H, W)\n",
+ " actions = torch.tensor(actions)\n",
+ " rewards = torch.tensor(rewards, dtype=torch.float)\n",
+ "\n",
+ " # Compute discounted returns\n",
+ " returns = []\n",
+ " G = 0.0\n",
+ " for r in reversed(rewards):\n",
+ " G = r + GAMMA * G\n",
+ " returns.insert(0, G)\n",
+ " returns = torch.tensor(returns)\n",
+ "\n",
+ " # Policy forward pass\n",
+ " logits = policy_net(states) # (B, 7)\n",
+ " logp = torch.distributions.Categorical(logits=logits)\\\n",
+ " .log_prob(actions)\n",
+ "\n",
+ " # PPO surrogate loss\n",
+ " old_logp = logp.detach()\n",
+ " ratio = torch.exp(logp - old_logp)\n",
+ " surr1 = ratio * (returns - returns.mean()) # advantage ≈ return‑baseline\n",
+ " surr2 = torch.clamp(ratio, 1-CLIP_EPS, 1+CLIP_EPS) * (returns - returns.mean())\n",
+ " loss_pi = -torch.min(surr1, surr2).mean()\n",
+ "\n",
+ " optimizer.zero_grad()\n",
+ " loss_pi.backward()\n",
+ " optimizer.step()\n",
+ "\n",
+ " # ---- Refresh opponent -------------------------------------------\n",
+ " if epoch % UPDATE_EVERY == 0:\n",
+ " target_net = copy.deepcopy(policy_net)\n",
+ "\n",
+ " # ---- Logging ----------------------------------------------------\n",
+ " if epoch % 100 == 0:\n",
+ " win_rate = evaluate_against_random(policy_net, n=200)\n",
+ " print(f\"Epoch {epoch:5d} PPO loss {loss_pi.item():.4f} vs‑random win {win_rate:.2%}\")\n",
+ "\n",
+ " # ---- Optional buffer trimming to keep memory bounded ------------\n",
+ " if len(replay) > 50000:\n",
+ " replay = replay[-25000:]\n",
+ "```\n",
+ "\n",
+ "**Key points illustrated by the code**\n",
+ "\n",
+ "| Part of the loop | Self‑play role |\n",
+ "|------------------|----------------|\n",
+ "| `play_one_game` | Two copies of the same algorithm (`policy_net` vs. `target_net`) generate the experience. |\n",
+ "| Reward sign flip (`r = -r` on opponent moves) | Guarantees that the learner receives a reward signal that reflects the zero‑sum nature of the game. |\n",
+ "| PPO clipping (`CLIP_EPS`) | Stabilises policy updates while still allowing improvement. |\n",
+ "| Periodic copy of `policy_net` into `target_net` | Provides the “slightly older” opponent that keeps the curriculum moving. |\n",
+ "| Evaluation against a random player | Gives a quick sanity check of learning progress. |\n",
+ "\n",
+ "After a few hundred thousand self‑play games the agent converges to a **near‑optimal Connect‑4 strategy**: it never loses and wins whenever the opponent makes a mistake.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 3. AlphaZero Blueprint (MCTS + Neural Nets)\n",
+ "\n",
+ "For larger games (Go, Chess, Shogi) the same self‑play loop is enriched with **Monte‑Carlo Tree Search (MCTS)**:\n",
+ "\n",
+ "1. **Policy head** supplies a prior over legal moves; MCTS uses these priors to bias rollouts.\n",
+ "2. **Search** is run for each move, producing a *search‑enhanced* policy distribution π\\*_t (the proportion of visits to each child).\n",
+ "3. The training target for the policy network is the **search‑enhanced distribution**, while the value head is trained on the game outcome.\n",
+ "4. The loss (used in AlphaZero) is \n",
+ "\n",
+ "\\[\n",
+ "L = (z - V_\\phi(s))^2 \\;-\\; \\pi^\\*_t \\cdot \\log \\pi_\\theta(a|s) \\;+\\; c \\| \\theta \\|^2,\n",
+ "\\]\n",
+ "\n",
+ "where *z* is the final game result (±1 or 0). \n",
+ "5. The opponent is simply the **latest checkpoint** of the same network (or a soft‑updated mixture of several past checkpoints).\n",
+ "\n",
+ "This combination of **self‑play + MCTS + deep nets** is what enabled AlphaGo Zero, AlphaZero, and MuZero to achieve superhuman performance from scratch.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 4. Why Self‑Play Works\n",
+ "\n",
+ "| Benefit | Explanation |\n",
+ "|---------|-------------|\n",
+ "| **Automatic curriculum** | The opponent improves together with the learner, so the difficulty scales naturally. |\n",
+ "| **No external data** | The agent creates its own training set; only a simulator is needed. |\n",
+ "| **Symmetry & fairness** | Both sides share the same inductive biases, eliminating bias from a hand‑crafted opponent. |\n",
+ "| **Domain‑agnostic** | Works for any environment with a simulator—board games, video games, multi‑agent tasks, robotics competitions, etc. |\n",
+ "| **Proven track record** | TD‑Gammon → AlphaGo/AlphaZero → OpenAI Five → MuZero. |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 5. Common Pitfalls & Stabilisation Tricks\n",
+ "\n",
+ "| Pitfall | Mitigation |\n",
+ "|---------|------------|\n",
+ "| **Cyclic policies** (the learner only beats its immediate predecessor) | Keep a *population* of the last K checkpoints and sample one as the opponent each game. |\n",
+ "| **High variance in policy‑gradient updates** | Use a value baseline, advantage estimation, or entropy regularisation. |\n",
+ "| **Instability when the opponent is exactly equal** | Refresh the opponent **slowly** (soft update with τ ≈ 0.01) or keep it a few steps behind. |\n",
+ "| **Sparse rewards** (only at the end of the episode) | Combine self‑play with MCTS, epsilon‑greedy exploration, or add a small shaping reward for progress. |\n",
+ "| **Over‑fitting to a single opponent** | Randomise which copy starts (X vs. O) and optionally randomise the opponent from a buffer of past policies. |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 6. Checklist for a Self‑Play Project\n",
+ "\n",
+ "1. **Environment** – deterministic or stochastic simulator, turn‑based or simultaneous actions. \n",
+ "2. **Network architecture** – policy head (softmax over legal moves) and optionally a value head; for board games a small CNN works well. \n",
+ "3. **Opponent handling** – lagged copy, soft update, or a replay buffer of past policies. \n",
+ "4. **RL algorithm** – PPO/A2C for on‑policy updates, REINFORCE + baseline for simplicity, or AlphaZero loss for MCTS‑augmented training. \n",
+ "5. **Exploration** – entropy bonus, temperature annealing, or MCTS depth limits for early stages. \n",
+ "6. **Training schedule** – number of self‑play games per epoch, batch size, opponent refresh frequency. \n",
+ "7. **Evaluation** – against random/minimax players and against previous checkpoints; log win‑rates, loss curves, and compute Elo if appropriate. \n",
+ "8. **Compute resources** – self‑play can be parallelised across many CPU cores or GPUs; MCTS adds CPU load, while deep‑net inference can be batched on GPUs.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 7. TL;DR\n",
+ "\n",
+ "* **Self‑play** lets an RL agent generate its own training data by competing against a slightly older copy of itself. \n",
+ "* The opponent’s skill automatically tracks the learner’s skill, providing a **self‑adjusting curriculum** without any human data. \n",
+ "* A minimal implementation (e.g., Connect‑4 with PPO) follows the loop: initialise → lagged opponent → self‑play → compute returns → update policy → refresh opponent → evaluate. \n",
+ "* When combined with Monte‑Carlo Tree Search and a value head, this loop becomes the **AlphaZero** algorithm that has mastered Chess, Go, Shogi, and many other complex games. \n",
+ "* Benefits: data on‑demand, automatic curriculum, symmetry, and broad applicability. \n",
+ "* Pitfalls (cycling, high variance, instability) are mitigated with target networks, replay buffers of past policies, soft updates, and entropy or baseline techniques. \n",
+ "\n",
+ "Self‑play is therefore a cornerstone of modern game‑playing AI and a versatile tool for any RL problem where a simulator exists.\n"
+ ]
+ }
+ ],
+ "source": [
+ "PROMPT = \"Explain self-play in reinforcement learning with a concrete example.\"\n",
+    "NUMBER_OF_CANDIDATES = 2 # two keeps a local run fast; go up if you need more intelligence!\n",
+ "\n",
+ "client = ollama.Client()\n",
+ "\n",
+ "\n",
+ "\n",
+ "result = pro_mode(client, PROMPT, NUMBER_OF_CANDIDATES)\n",
+ "\n",
+ "print(\"\\n=== FINAL ===\\n\", result[\"final\"])\n",
+ "# To inspect candidates:\n",
+ "# for i, c in enumerate(result[\"candidates\"], 1): print(f\"\\n--- Candidate {i} ---\\n{c}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "authorship_tag": "ABX9TyMEoKFi+0SrPi/OXiwQ1i58",
+ "include_colab_link": true,
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/OpenAI_Open_Source_Pro_Mode_Ollama_Turbo.ipynb b/OpenAI_Open_Source_Pro_Mode_Ollama_Turbo.ipynb
new file mode 100644
index 0000000..632544a
--- /dev/null
+++ b/OpenAI_Open_Source_Pro_Mode_Ollama_Turbo.ipynb
@@ -0,0 +1,441 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "view-in-github"
+ },
+ "source": [
+    ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3LLlSvTtU0Y3"
+ },
+ "source": [
+ "Made by Matt Shumer ([@mattshumer_](https://x.com/mattshumer_) on X)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "cellView": "form",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dzWS8LaqUw3N",
+ "outputId": "d4031e82-2131-4208-8f62-3923baca7cc4"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: ollama in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (0.4.5)\n",
+ "Requirement already satisfied: httpx<0.28.0,>=0.27.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from ollama) (0.27.2)\n",
+ "Requirement already satisfied: pydantic<3.0.0,>=2.9.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from ollama) (2.11.7)\n",
+ "Requirement already satisfied: anyio in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (4.9.0)\n",
+ "Requirement already satisfied: certifi in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (2024.8.30)\n",
+ "Requirement already satisfied: httpcore==1.* in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (1.0.5)\n",
+ "Requirement already satisfied: idna in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (3.8)\n",
+ "Requirement already satisfied: sniffio in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpx<0.28.0,>=0.27.0->ollama) (1.3.1)\n",
+ "Requirement already satisfied: h11<0.15,>=0.13 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama) (0.14.0)\n",
+ "Requirement already satisfied: annotated-types>=0.6.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (0.7.0)\n",
+ "Requirement already satisfied: pydantic-core==2.33.2 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (2.33.2)\n",
+ "Requirement already satisfied: typing-extensions>=4.12.2 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (4.12.2)\n",
+ "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/meirm/.pyenv/versions/3.12.8/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.9.0->ollama) (0.4.1)\n",
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "# @title Run this cell to set up Pro Mode\n",
+ "!pip3 install ollama\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "from typing import List, Dict, Any\n",
+ "import time, os\n",
+ "import concurrent.futures as cf\n",
+ "import ollama\n",
+ "\n",
+ "MODEL = \"gpt-oss:120b\"\n",
+    "MAX_COMPLETION_TOKENS = 30000  # generation cap, passed to Ollama as the num_predict option\n",
+ "\n",
+ "\n",
+    "def _one_completion(client: ollama.Client, question: str, temperature: float) -> str:\n",
+    "    \"\"\"\n",
+    "    Send a question to the Ollama chat endpoint and return the response text.\n",
+    "    Retries up to three times with exponential backoff on failure.\n",
+    "    \"\"\"\n",
+ " delay = 0.5\n",
+ " for attempt in range(3):\n",
+ " try:\n",
+ " response = client.chat(\n",
+ " model=MODEL,\n",
+ " messages=[\n",
+ " {'role': 'user', 'content': question},\n",
+ " ],\n",
+    "            options={'temperature': temperature, 'num_predict': MAX_COMPLETION_TOKENS}\n",
+ " )\n",
+ " return response['message']['content']\n",
+    "        except Exception:\n",
+ " if attempt == 2:\n",
+ " raise\n",
+ " time.sleep(delay)\n",
+ " delay *= 2\n",
+ "\n",
+ "\n",
+ "def _build_synthesis_messages(candidates: List[str]) -> List[Dict[str, str]]:\n",
+ " numbered = \"\\n\\n\".join(\n",
+    "        f\"<candidate {i+1}>\\n{txt}\\n</candidate {i+1}>\" for i, txt in enumerate(candidates)\n",
+ " )\n",
+ " system = (\n",
+ " \"You are an expert editor. Synthesize ONE best answer from the candidate \"\n",
+ " \"answers provided, merging strengths, correcting errors, and removing repetition. \"\n",
+ " \"Do not mention the candidates or the synthesis process. Be decisive and clear.\"\n",
+ " )\n",
+ " user = (\n",
+    "        f\"You are given {len(candidates)} candidate answers, each delimited by <candidate i> tags.\\n\\n\"\n",
+ " f\"{numbered}\\n\\nReturn the single best final answer.\"\n",
+ " )\n",
+ " return [{\"role\": \"system\", \"content\": system},\n",
+ " {\"role\": \"user\", \"content\": user}]\n",
+ "\n",
+    "def pro_mode(client: ollama.Client, prompt: str, n_runs: int) -> Dict[str, Any]:\n",
+    "    \"\"\"\n",
+    "    Fan out n_runs parallel generations at T=0.9 and synthesize a final answer at T=0.2.\n",
+    "    Returns: {\"final\": str, \"candidates\": List[str]}\n",
+    "    \"\"\"\n",
+ " assert n_runs >= 1, \"n_runs must be >= 1\"\n",
+ "\n",
+ " # Parallel candidate generations (threaded; Colab-friendly)\n",
+ " max_workers = min(n_runs, 16)\n",
+ " candidates: List[str] = [None] * n_runs # preserve order\n",
+ " with cf.ThreadPoolExecutor(max_workers=max_workers) as ex:\n",
+ " fut_to_idx = {\n",
+ " ex.submit(_one_completion, client, prompt, 0.9): i\n",
+ " for i in range(n_runs)\n",
+ " }\n",
+ " for fut in cf.as_completed(fut_to_idx):\n",
+ " i = fut_to_idx[fut]\n",
+ " candidates[i] = fut.result()\n",
+ "\n",
+ " # Synthesis pass\n",
+ " messages = _build_synthesis_messages(candidates)\n",
+ " final_resp = client.chat(\n",
+ " model=MODEL,\n",
+ " messages=messages,\n",
+    "        options={'temperature': 0.2, 'num_predict': MAX_COMPLETION_TOKENS}\n",
+ " )\n",
+ " final = final_resp['message']['content']\n",
+ "\n",
+ " return {\"final\": final, \"candidates\": candidates}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "OoBF5UbSVksz"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "=== FINAL ===\n",
+ " **Self‑play in reinforcement learning** \n",
+ "Self‑play turns a two‑player (or multi‑agent) game into its own data‑generator: the learning agent repeatedly plays against a copy of itself (or a past version). Because the opponent improves together with the learner, the difficulty of the task automatically adapts, eliminating the need for hand‑crafted opponents or expert demonstrations.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 1. Why self‑play works\n",
+ "\n",
+ "| Reason | Effect on learning |\n",
+ "|--------|--------------------|\n",
+ "| **Automatic curriculum** | Early games are easy (both agents are weak); later games become harder as the policy improves, keeping the learning signal informative. |\n",
+ "| **No external labels** | The only reward needed is the game outcome (win = +1, loss = ‑1, draw = 0). |\n",
+ "| **Full‑tree exploration** | An evolving opponent forces the learner to discover strategies that would never appear against a static opponent. |\n",
+ "| **Convergence to equilibrium** | In deterministic zero‑sum games the process drives both sides toward a Nash (minimax) policy; for solved games this is the optimal strategy. |\n",
+ "| **Sample efficiency** | One episode yields a training example for *both* players, doubling the amount of useful data per simulation. |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 2. Core ingredients\n",
+ "\n",
+ "| Component | Typical implementation |\n",
+ "|-----------|------------------------|\n",
+ "| **Environment** | Any perfect‑information game (board, video‑game, robot duel). Must expose state, legal actions, transition, and a terminal reward. |\n",
+ "| **Policy‑value network** `fθ(s) → (π, v)` | Small MLP for toy games; deep residual CNN or Transformer for large board games. `π` is a probability distribution over legal moves, `v∈[-1,1]` estimates the expected outcome from the current player’s perspective. |\n",
+ "| **Opponent** | Usually the *current* network itself. For stability a lagged copy (target network) or a pool of older checkpoints can be used. |\n",
+ "| **Search layer (optional)** | Monte‑Carlo Tree Search (MCTS) that uses `π` and `v` to produce a stronger move distribution `π_MCTS`. The search result becomes the training target for the policy head. |\n",
+ "| **Replay buffer** `D` | Stores tuples `(s, π_target, z)` where `z` is the final game result (+1/0/‑1) from the player who acted in `s`. |\n",
+ "| **Loss** | `L(θ) = (z‑v)² – π_target·log π + λ‖θ‖²` (value MSE + policy cross‑entropy + L2 regularisation). |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 3. General self‑play training loop (high‑level pseudocode)\n",
+ "\n",
+ "```python\n",
+ "initialize network fθ randomly\n",
+ "D = empty replay buffer\n",
+ "target = copy of fθ # optional lagged opponent\n",
+ "\n",
+ "while not done:\n",
+ " # ---------- self‑play ----------\n",
+ " for _ in range(num_episodes):\n",
+ " game = new_game()\n",
+ " states, policies = [], []\n",
+ " while not game.terminal():\n",
+ " s = game.state()\n",
+ " # run MCTS if available, otherwise use raw policy\n",
+ " π = MCTS(s, fθ) if use_mcts else softmax(fθ(s).policy)\n",
+ " a = sample_move(π) # ε‑greedy or temperature >0\n",
+ " states.append(s)\n",
+ " policies.append(π)\n",
+ " game.step(a) # opponent = same net, other side\n",
+ " z = game.result() # +1 / 0 / -1 from the player who moved first\n",
+ " # store every position with the perspective‑corrected outcome\n",
+ " for i, s in enumerate(states):\n",
+ " zi = z if i % 2 == 0 else -z # flip sign every move\n",
+ " D.add((s, policies[i], zi))\n",
+ "\n",
+ " # ---------- learning ----------\n",
+ " for _ in range(train_steps):\n",
+ " batch = D.sample(batch_size)\n",
+ " loss = compute_loss(fθ, batch) # value + policy loss\n",
+ " optimizer.zero_grad()\n",
+ " loss.backward()\n",
+ " optimizer.step()\n",
+ "\n",
+ " # ---------- opponent update ----------\n",
+ " if iteration % target_update == 0:\n",
+ " target.load_state_dict(fθ.state_dict()) # or add fθ to a pool of past nets\n",
+ "```\n",
+ "\n",
+ "*Both sides query the **same** network; the only difference is the sign of the value (or a flip of the policy logits) to reflect which player is acting.*\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 4. Minimal working example – Tic‑Tac‑Toe\n",
+ "\n",
+ "### 4.1 Game definition (Python‑like)\n",
+ "\n",
+ "```python\n",
+ "class TicTacToe:\n",
+ " def __init__(self):\n",
+ " self.board = np.zeros(9, dtype=int) # 0 empty, 1 X, -1 O\n",
+ " self.player = 1 # X starts\n",
+ "\n",
+ " def legal(self):\n",
+ " return np.where(self.board == 0)[0]\n",
+ "\n",
+ " def step(self, a):\n",
+ " self.board[a] = self.player\n",
+ " win = self.check_winner()\n",
+ " done = win is not None or not self.legal().size\n",
+ " reward = 0\n",
+ " if done:\n",
+ " reward = 0 if win == 0 else (1 if win == self.player else -1)\n",
+ " self.player *= -1\n",
+ " return self.board.copy(), reward, done\n",
+ "\n",
+ " # returns 1 if X wins, -1 if O wins, 0 for draw, None otherwise\n",
+ " def check_winner(self):\n",
+ " lines = [(0,1,2),(3,4,5),(6,7,8),\n",
+ " (0,3,6),(1,4,7),(2,5,8),\n",
+ " (0,4,8),(2,4,6)]\n",
+ " for a,b,c in lines:\n",
+ " s = self.board[a] + self.board[b] + self.board[c]\n",
+ " if s == 3: return 1\n",
+ " if s == -3: return -1\n",
+ " if not (self.board == 0).any(): return 0\n",
+ " return None\n",
+ "```\n",
+ "\n",
+ "### 4.2 Tiny policy‑value net (MLP)\n",
+ "\n",
+ "```python\n",
+ "class Net(nn.Module):\n",
+ " def __init__(self):\n",
+ " super().__init__()\n",
+ " self.fc = nn.Sequential(\n",
+ " nn.Linear(9, 128), nn.ReLU(),\n",
+ " nn.Linear(128, 128), nn.ReLU()\n",
+ " )\n",
+ " self.policy_head = nn.Linear(128, 9) # logits\n",
+ " self.value_head = nn.Linear(128, 1) # tanh output\n",
+ "\n",
+ " def forward(self, board):\n",
+ " x = self.fc(board.float())\n",
+ " return self.policy_head(x), torch.tanh(self.value_head(x))\n",
+ "```\n",
+ "\n",
+ "### 4.3 Self‑play episode (no MCTS)\n",
+ "\n",
+ "```python\n",
+ "def play_episode(net, eps=0.1):\n",
+ " env = TicTacToe()\n",
+ " traj = [] # (state, player, policy)\n",
+ " while True:\n",
+ " s = torch.from_numpy(env.board).float()\n",
+ " logits, _ = net(s)\n",
+ " # mask illegal moves\n",
+ " mask = (env.board == 0).astype(float)\n",
+ " probs = torch.softmax(logits, dim=0) * torch.from_numpy(mask)\n",
+ " probs = probs / probs.sum()\n",
+ "\n",
+ " # ε‑greedy exploration\n",
+ " if random.random() < eps:\n",
+ " a = random.choice(env.legal())\n",
+ " else:\n",
+ " a = torch.multinomial(probs, 1).item()\n",
+ "\n",
+ " traj.append((s.clone(), env.player, probs.clone()))\n",
+ " _, reward, done = env.step(a)\n",
+ " if done:\n",
+ " # propagate final outcome to every step\n",
+ " for state, player, pi in traj:\n",
+ " z = reward * player # perspective‑corrected result\n",
+ " replay.add(state, pi, z)\n",
+ " break\n",
+ "```\n",
+ "\n",
+ "### 4.4 Training step\n",
+ "\n",
+ "```python\n",
+ "def train_step(net, batch):\n",
+ " states, pis, zs = zip(*batch) # each is a list of tensors\n",
+ " states = torch.stack(states)\n",
+ " target_pi = torch.stack(pis)\n",
+ " target_z = torch.tensor(zs).unsqueeze(1).float()\n",
+ "\n",
+ " logits, values = net(states)\n",
+ " policy_loss = -torch.mean(torch.sum(target_pi * torch.log_softmax(logits, dim=1), dim=1))\n",
+ " value_loss = torch.mean((values - target_z) ** 2)\n",
+ " loss = policy_loss + value_loss\n",
+ "\n",
+ " optimizer.zero_grad()\n",
+ " loss.backward()\n",
+ " optimizer.step()\n",
+ " return loss.item()\n",
+ "```\n",
+ "\n",
+ "Running the loop for a few hundred thousand episodes drives the network to a **draw‑only** policy—the optimal solution for Tic‑Tac‑Toe. The same code, with a deeper net and MCTS, scales to Go, Chess, Shogi (AlphaZero) and even to multi‑agent video games.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 5. Scaling up – what changes for AlphaZero‑style systems\n",
+ "\n",
+ "| Change | Reason |\n",
+ "|--------|--------|\n",
+ "| **Monte‑Carlo Tree Search** | Provides a strong move distribution `π_MCTS` that is used as the policy target; dramatically improves data efficiency in huge state spaces. |\n",
+ "| **Large replay buffer (≈10⁶–10⁷ games)** | Guarantees a diverse training set and smooths the non‑stationarity caused by the moving opponent. |\n",
+ "| **Lagged or pooled opponents** | Instead of always playing against the current net, sample from a *league* of older checkpoints; prevents cyclic strategies and stabilises convergence. |\n",
+ "| **Curriculum on temperature & Dirichlet noise** | Early games use high temperature (more random) and added Dirichlet noise to explore openings; later games become more deterministic. |\n",
+ "| **Distributed generation** | Self‑play is parallelised over thousands of CPU cores / GPUs; training is performed on GPUs. |\n",
+ "| **Regularisation & target network** | L2 weight decay, dropout, and a slowly‑updated target net reduce catastrophic policy swings. |\n",
+ "\n",
+ "With these additions AlphaZero learned master‑level play in Chess, Shogi and Go from scratch in a few days of compute.\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 6. Common pitfalls & practical tips\n",
+ "\n",
+ "| Pitfall | Mitigation |\n",
+ "|---------|------------|\n",
+ "| **Mode collapse (repeating the same line)** | Keep a pool of past opponents, inject exploration noise, or use a temperature schedule. |\n",
+ "| **Non‑stationary targets** | Use a target network for generating `π_target` and `v_target`, or sample from a replay buffer that contains older games. |\n",
+ "| **Sparse reward** (only win/loss) | Add intermediate shaping (e.g., material advantage in Chess) *only* for the value head; keep the policy target derived from search. |\n",
+ "| **Credit assignment in long games** | Use bootstrapped value estimates from the network inside MCTS, or employ TD(λ) / n‑step returns instead of pure Monte‑Carlo returns. |\n",
+ "| **Over‑fitting to a single opponent** | Periodically evaluate against a fixed strong baseline (e.g., a handcrafted engine) to detect regression. |\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## 7. Quick checklist for a new self‑play project\n",
+ "\n",
+ "1. **Define a two‑player environment** with clear win/loss reward. \n",
+ "2. **Choose a network architecture** (MLP for tiny games, CNN/Transformer for board games). \n",
+ "3. **Implement a copy‑as‑opponent** routine (direct clone, lagged copy, or opponent pool). \n",
+ "4. **Write a self‑play generator** that records `(state, policy, outcome)` for every move. \n",
+ "5. **Select an RL algorithm** – REINFORCE, PPO, or Q‑learning; combine with a value head if possible. \n",
+ "6. **Set up a replay buffer** and a training step that minimises the combined policy‑value loss. \n",
+ "7. **Add stabilisation tricks** – target network, opponent pool, exploration noise, temperature schedule. \n",
+ "8. **Periodically evaluate** against older checkpoints or a known baseline to monitor progress. \n",
+ "\n",
+ "---\n",
+ "\n",
+ "## TL;DR\n",
+ "\n",
+ "*Self‑play* lets an RL agent improve by repeatedly playing a game against a copy of itself. The opponent’s strength rises together with the learner, providing an automatic curriculum and removing the need for external data. The basic loop is:\n",
+ "\n",
+ "1. **Generate games** with the current policy (optionally guided by MCTS). \n",
+ "2. **Store** for each move the state, the improved move distribution, and the final win/loss outcome. \n",
+ "3. **Update** a shared policy‑value network with a loss that combines value regression and policy cross‑entropy. \n",
+ "4. **Refresh** the opponent (lagged copy or pool) and repeat.\n",
+ "\n",
+ "A tiny neural net trained on self‑play Tic‑Tac‑Toe converges to perfect (draw‑only) play; the same framework, augmented with MCTS, large replay buffers, and opponent leagues, underlies world‑class systems such as AlphaZero, AlphaStar, OpenAI Five, and many multi‑agent robotics or language‑game projects.\n"
+ ]
+ }
+ ],
+ "source": [
+ "PROMPT = \"Explain self-play in reinforcement learning with a concrete example.\"\n",
+ "NUMBER_OF_CANDIDATES = 5 # start with five, go up if you need more intelligence!\n",
+    "OLLAMA_API_KEY = \"yourkey\"  # replace with your Ollama API key from ollama.com\n",
+ "\n",
+ "client = ollama.Client(\n",
+ " host=\"https://ollama.com\",\n",
+ " headers={'Authorization': OLLAMA_API_KEY}\n",
+ ")\n",
+ "\n",
+ "\n",
+ "\n",
+ "result = pro_mode(client, PROMPT, NUMBER_OF_CANDIDATES)\n",
+ "\n",
+ "print(\"\\n=== FINAL ===\\n\", result[\"final\"])\n",
+ "# To inspect candidates:\n",
+ "# for i, c in enumerate(result[\"candidates\"], 1): print(f\"\\n--- Candidate {i} ---\\n{c}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "authorship_tag": "ABX9TyMEoKFi+0SrPi/OXiwQ1i58",
+ "include_colab_link": true,
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/README.md b/README.md
index e3de401..31b27f2 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,151 @@
+# GPT-OSS Pro Mode
+
+A collection of Jupyter notebooks that implement **Pro Mode**, an AI reasoning technique that generates multiple candidate responses in parallel and then synthesizes them into a single, higher-quality answer.
+
+## 🎯 What is Pro Mode?
+
+Pro Mode is a sophisticated approach to AI reasoning that mimics how expert humans think through complex problems:
+
+1. **Generate Multiple Perspectives**: Creates several candidate responses to the same question
+2. **Parallel Processing**: Uses multiple AI instances simultaneously for efficiency
+3. **Intelligent Synthesis**: Combines the best parts of each candidate into a final, refined answer
+
+Compared to a single-shot interaction, this tends to improve answer quality, reduce errors, and produce more comprehensive responses, at the cost of extra tokens and latency.
+
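+The core pattern is small. Below is a minimal sketch of the fan-out/synthesize loop using the `ollama` Python client, mirroring the notebooks in this repo; the model name and the three-candidate count are illustrative, not requirements:
+
+```python
+import concurrent.futures as cf
+import ollama
+
+client = ollama.Client()  # local server; pass host/headers for the hosted Turbo endpoint
+
+def ask(prompt: str, temperature: float) -> str:
+    resp = client.chat(
+        model="gpt-oss:120b",
+        messages=[{"role": "user", "content": prompt}],
+        options={"temperature": temperature},
+    )
+    return resp["message"]["content"]
+
+question = "Explain self-play in reinforcement learning."
+
+# Fan out: several high-temperature candidates in parallel
+with cf.ThreadPoolExecutor() as ex:
+    candidates = list(ex.map(lambda _: ask(question, 0.9), range(3)))
+
+# Synthesize: one low-temperature editorial pass over all candidates
+merged = "\n\n".join(
+    f"<candidate {i+1}>\n{c}\n</candidate {i+1}>" for i, c in enumerate(candidates)
+)
+final = ask("Synthesize ONE best answer from these candidates:\n\n" + merged, 0.2)
+print(final)
+```
+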
+## 🚀 Key Benefits
+
+- **Higher-Quality Answers**: Multiple perspectives tend to yield more thorough, accurate responses
+- **Error Reduction**: The synthesis pass catches and corrects individual candidates' mistakes
+- **Better Reasoning**: Combines strengths from different approaches
+- **Comprehensive Coverage**: Addresses aspects that a single response might miss
+
+## 📁 Available Implementations
+
+This repository contains three different implementations of Pro Mode:
+
+### 1. **Groq Implementation** (`OpenAI_Open_Source_Pro_Mode_Groq.ipynb`)
+- Uses Groq's fast inference API
+- Requires Groq API key
+- Optimized for speed and cost-effectiveness
+- Best for production use cases
+
+### 2. **Ollama Local Implementation** (`OpenAI_Open_Source_Pro_Mode_Ollama_Local.ipynb`)
+- Runs locally using Ollama
+- No API costs or internet connection required
+- Uses the `gpt-oss:120b` model, which needs substantial local memory; substitute a smaller model if your hardware is limited
+- A good fit for privacy-sensitive work
+
+### 3. **Ollama Turbo Implementation** (`OpenAI_Open_Source_Pro_Mode_Ollama_Turbo.ipynb`)
+- Same Pro Mode pipeline as the local notebook
+- Runs against Ollama's hosted Turbo endpoint (`https://ollama.com`) instead of a local server
+- Requires an Ollama API key; no local model download or GPU needed
+
+## 🔧 How It Works
+
+### The Pro Mode Process
+
+1. **Parallel Generation**:
+ - Takes your prompt and generates `n_runs` candidate responses simultaneously
+ - Uses high temperature (0.9) for creative diversity
+ - Runs in parallel threads for efficiency
+
+2. **Synthesis Phase**:
+ - An expert editor AI analyzes all candidates
+ - Merges strengths, corrects errors, removes repetition
+ - Uses low temperature (0.2) for focused synthesis
+ - Produces a single, refined final answer
+
+3. **Quality Output**:
+ - Returns both the final synthesized answer and all candidates
+ - Allows inspection of individual candidates if needed
+
+### Example Usage
+
+```python
+# Set up your preferred implementation
+# (Groq, Ollama Local, or Ollama Turbo)
+
+# Define your question
+PROMPT = "Explain self-play in reinforcement learning with a concrete example."
+NUMBER_OF_CANDIDATES = 3 # Adjust based on complexity
+
+# Run Pro Mode
+result = pro_mode(client, PROMPT, NUMBER_OF_CANDIDATES)
+
+# Get the final synthesized answer
+print("=== FINAL ANSWER ===")
+print(result["final"])
+
+# Optionally inspect individual candidates
+for i, candidate in enumerate(result["candidates"], 1):
+ print(f"\n--- Candidate {i} ---")
+ print(candidate)
+```
+
+## 🛠️ Setup Instructions
+
+### For Groq Implementation:
+1. Get a Groq API key from [groq.com](https://groq.com)
+2. Set environment variable: `export GROQ_API_KEY="your-key-here"`
+3. Run the notebook: `OpenAI_Open_Source_Pro_Mode_Groq.ipynb`
+
+### For Ollama Local Implementation:
+1. Install Ollama: `curl -fsSL https://ollama.ai/install.sh | sh`
+2. Pull the model: `ollama pull gpt-oss:120b`
+3. Run the notebook: `OpenAI_Open_Source_Pro_Mode_Ollama_Local.ipynb`
+
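+Before running Pro Mode, you can sanity-check the local setup with a one-off chat call. A minimal sketch, assuming `ollama serve` is running and the model has been pulled:
+
+```python
+import ollama
+
+client = ollama.Client()  # defaults to http://localhost:11434
+reply = client.chat(
+    model="gpt-oss:120b",
+    messages=[{"role": "user", "content": "Say hello."}],
+)
+print(reply["message"]["content"])
+```
+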
+### For Ollama Turbo Implementation:
+1. Get an Ollama API key from [ollama.com](https://ollama.com)
+2. Set `OLLAMA_API_KEY` in the notebook
+3. Run the notebook: `OpenAI_Open_Source_Pro_Mode_Ollama_Turbo.ipynb`
+
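+The Turbo notebook constructs its client like this (excerpted from the notebook; `OLLAMA_API_KEY` is your key from ollama.com):
+
+```python
+import ollama
+
+OLLAMA_API_KEY = "yourkey"  # replace with your Ollama API key
+
+client = ollama.Client(
+    host="https://ollama.com",
+    headers={"Authorization": OLLAMA_API_KEY},
+)
+```
+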
+## ⚙️ Configuration Options
+
+- **`n_runs`**: Number of candidate responses (2-5 recommended)
+- **`MAX_COMPLETION_TOKENS`**: Upper bound on response length, passed to Ollama as `num_predict` (default: 30000)
+- **Temperature**: 0.9 for candidates, 0.2 for synthesis
+- **Model**: `openai/gpt-oss-120b` (Groq) or `gpt-oss:120b` (Ollama)
+
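+For example, to trade some answer quality for speed, you might shrink the token budget and candidate count before calling `pro_mode`; the values below are illustrative rather than tuned defaults:
+
+```python
+MODEL = "gpt-oss:120b"        # swap in a smaller model if hardware is limited
+MAX_COMPLETION_TOKENS = 8000  # passed to Ollama as num_predict in these notebooks
+
+result = pro_mode(client, "Summarize the CAP theorem in two paragraphs.", n_runs=2)
+print(result["final"])
+```
+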
+## 🎯 When to Use Pro Mode
+
+**Best for:**
+- Complex reasoning problems
+- Technical explanations
+- Creative writing tasks
+- Problem-solving scenarios
+- Research and analysis
+
+**Not needed for:**
+- Simple factual questions
+- Basic text generation
+- Latency-sensitive or real-time applications (the candidate fan-out and synthesis pass add wall-clock time)
+
+## 📊 Performance Tips
+
+- Start with 2-3 candidates for most questions
+- Increase to 4-5 for very complex problems
+- Monitor API costs when using Groq
+- Consider local Ollama for privacy-sensitive tasks
+
+## 🤝 Contributing
+
+Created by Matt Shumer ([@mattshumer_](https://x.com/mattshumer_) on X)
+
+Feel free to:
+- Submit issues and feature requests
+- Contribute improvements to the synthesis logic
+- Add new model implementations
+- Share your use cases and results
+
+## 📄 License
+
+This project is open source. Please check individual notebook headers for specific licensing information.
+
+---
+
+**Pro Mode transforms single-shot AI interactions into collaborative reasoning sessions, delivering higher-quality results through intelligent synthesis of multiple perspectives.**
+
# gpt-oss-pro-mode
[@mattshumer_](https://x.com/mattshumer_)
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1XeYmOHJwACtavCjJM-eOqlPxHgTD2KNP?usp=sharing)
-
-Run the attached notebook to access Pro mode! Star this repo and let me know what you want me to add!
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1XeYmOHJwACtavCjJM-eOqlPxHgTD2KNP?usp=sharing)
\ No newline at end of file