From e502e51d78e31d7dd00f92f3d626216da111672f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 1 Apr 2026 17:46:44 +0000 Subject: [PATCH 1/3] feat(llama.cpp): add turboquant support This PR adds patchset from the great work of @TheTom in https://github.com/TheTom/llama-cpp-turboquant and creates a pipeline that updates the patches against upstream automatically. It also creates necessary scaffolding for doing this with other patches sources. Signed-off-by: Ettore Di Giacinto --- .github/workflows/bump_deps.yaml | 3 - backend/cpp/llama-cpp/patches/sources.yaml | 9 ++ backend/cpp/llama-cpp/prepare.sh | 17 ++-- scripts/patch_utils/apply_patches.sh | 109 +++++++++++++++++++++ 4 files changed, 124 insertions(+), 14 deletions(-) create mode 100644 backend/cpp/llama-cpp/patches/sources.yaml create mode 100755 scripts/patch_utils/apply_patches.sh diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 49e489beb00b..ae63fd232683 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -63,6 +63,3 @@ jobs: branch: "update/${{ matrix.variable }}" body: ${{ steps.bump.outputs.message }} signoff: true - - - diff --git a/backend/cpp/llama-cpp/patches/sources.yaml b/backend/cpp/llama-cpp/patches/sources.yaml new file mode 100644 index 000000000000..9c13ae7a4ba9 --- /dev/null +++ b/backend/cpp/llama-cpp/patches/sources.yaml @@ -0,0 +1,9 @@ +# Patch sources for the llama-cpp backend. +# Each source declares a fork whose commits are extracted as patches +# and applied on top of upstream llama.cpp during the build. +# See scripts/patch_utils/apply_patches.sh for the generic patch engine. +sources: + - name: turboquant + repo: https://github.com/TheTom/llama-cpp-turboquant.git + branch: feature/turboquant-kv-cache + upstream_repo: https://github.com/ggml-org/llama.cpp.git diff --git a/backend/cpp/llama-cpp/prepare.sh b/backend/cpp/llama-cpp/prepare.sh index f9b7e3dd2651..b6f2c25c891c 100644 --- a/backend/cpp/llama-cpp/prepare.sh +++ b/backend/cpp/llama-cpp/prepare.sh @@ -1,17 +1,13 @@ #!/bin/bash +set -e -## Patches +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$SCRIPT_DIR/../../.." -## Apply patches from the `patches` directory -if [ -d "patches" ]; then - for patch in $(ls patches); do - echo "Applying patch $patch" - patch -d llama.cpp/ -p1 < patches/$patch - done -fi - -set -e +## Apply patches from sources and/or local .patch files +"$REPO_ROOT/scripts/patch_utils/apply_patches.sh" "$SCRIPT_DIR" llama.cpp +## Copy server files into grpc-server build directory for file in $(ls llama.cpp/tools/server/); do cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/ done @@ -28,4 +24,3 @@ else echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt fi set -e - diff --git a/scripts/patch_utils/apply_patches.sh b/scripts/patch_utils/apply_patches.sh new file mode 100755 index 000000000000..9386b3ac6662 --- /dev/null +++ b/scripts/patch_utils/apply_patches.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# apply_patches.sh — Generic patch fetcher and applier for any backend. +# +# Usage: ./apply_patches.sh +# +# Directory containing a patches/ folder (with optional sources.yaml) +# The cloned upstream repo to patch (e.g., llama.cpp/) +# +# Behavior (idempotent): +# 1. If patches/sources.yaml exists and yq is available, for each source: +# - If patches// already has .patch files: skip fetching (vendored) +# - Otherwise: clone the fork, auto-detect the fork base via merge-base +# with the upstream repo, and generate patches +# 2. Apply all patches from source subdirectories (alphabetical), then top-level .patch files +# 3. Fails fast on any patch application error + +set -e + +apply_patches() { + local SOURCE_DIR="$(cd "$1" && pwd)" + local TARGET_DIR="$2" + local PATCHES_DIR="$SOURCE_DIR/patches" + + if [ ! -d "$PATCHES_DIR" ]; then + return 0 + fi + + # Phase 1: Generate missing patches from fork sources + if [ -f "$PATCHES_DIR/sources.yaml" ] && command -v yq &>/dev/null; then + local SOURCE_COUNT + SOURCE_COUNT=$(yq '.sources | length' "$PATCHES_DIR/sources.yaml") + + for i in $(seq 0 $((SOURCE_COUNT - 1))); do + local NAME REPO BRANCH UPSTREAM_REPO + NAME=$(yq ".sources[$i].name" "$PATCHES_DIR/sources.yaml") + REPO=$(yq ".sources[$i].repo" "$PATCHES_DIR/sources.yaml") + BRANCH=$(yq ".sources[$i].branch" "$PATCHES_DIR/sources.yaml") + UPSTREAM_REPO=$(yq ".sources[$i].upstream_repo" "$PATCHES_DIR/sources.yaml") + + local SOURCE_PATCH_DIR="$PATCHES_DIR/$NAME" + local EXISTING + EXISTING=$(ls "$SOURCE_PATCH_DIR"/*.patch 2>/dev/null | wc -l) + + if [ "$EXISTING" -gt 0 ]; then + echo "Patches [$NAME]: $EXISTING patches already present — skipping fetch." + else + echo "Patches [$NAME]: fetching from $REPO ($BRANCH)" + + local TMPDIR + TMPDIR=$(mktemp -d) + + if git clone --single-branch -b "$BRANCH" "$REPO" "$TMPDIR/fork" 2>&1; then + cd "$TMPDIR/fork" + + # Auto-detect fork base: merge-base between fork and upstream + git remote add upstream "$UPSTREAM_REPO" + git fetch upstream 2>&1 + + local FORK_BASE + FORK_BASE=$(git merge-base HEAD upstream/master 2>/dev/null || \ + git merge-base HEAD upstream/main 2>/dev/null || echo "") + + if [ -z "$FORK_BASE" ]; then + echo "WARNING: Could not find merge-base with upstream — skipping source '$NAME'" + cd "$SOURCE_DIR" + rm -rf "$TMPDIR" + continue + fi + + local PATCH_COUNT + PATCH_COUNT=$(git rev-list --count "$FORK_BASE"..HEAD 2>/dev/null || echo "0") + echo " Fork base: ${FORK_BASE:0:12} ($PATCH_COUNT commits to extract)" + + if [ "$PATCH_COUNT" -gt 0 ]; then + mkdir -p "$SOURCE_PATCH_DIR" + git format-patch "$FORK_BASE"..HEAD -o "$SOURCE_PATCH_DIR/" >/dev/null 2>&1 + echo " Generated $PATCH_COUNT patches in patches/$NAME/" + fi + cd "$SOURCE_DIR" + else + echo "WARNING: Failed to clone $REPO — skipping source '$NAME'" + fi + + rm -rf "$TMPDIR" + fi + done + elif [ -f "$PATCHES_DIR/sources.yaml" ]; then + echo "WARNING: yq not found — skipping source-based patch generation." + fi + + # Phase 2: Apply patches (subdirectories first, then top-level) + for source_dir in $(find "$PATCHES_DIR" -mindepth 1 -maxdepth 1 -type d | sort); do + for p in $(ls "$source_dir"/*.patch 2>/dev/null | sort); do + echo "Applying: $(basename "$source_dir")/$(basename "$p")" + patch -d "$TARGET_DIR" -p1 < "$p" || { echo "FAILED: $p"; exit 1; } + done + done + for p in $(ls "$PATCHES_DIR"/*.patch 2>/dev/null | sort); do + echo "Applying: $(basename "$p")" + patch -d "$TARGET_DIR" -p1 < "$p" || { echo "FAILED: $p"; exit 1; } + done +} + +# Run with arguments +if [ $# -lt 2 ]; then + echo "Usage: $0 " + exit 1 +fi +apply_patches "$1" "$2" From a7a142b6510310cd1391fc62cd048c868c4f07e3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 1 Apr 2026 19:42:16 +0000 Subject: [PATCH 2/3] refactor, macOS fixes Signed-off-by: Ettore Di Giacinto --- scripts/patch_utils/apply_patches.sh | 39 ++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/scripts/patch_utils/apply_patches.sh b/scripts/patch_utils/apply_patches.sh index 9386b3ac6662..360240214ef8 100755 --- a/scripts/patch_utils/apply_patches.sh +++ b/scripts/patch_utils/apply_patches.sh @@ -11,11 +11,30 @@ # - If patches// already has .patch files: skip fetching (vendored) # - Otherwise: clone the fork, auto-detect the fork base via merge-base # with the upstream repo, and generate patches -# 2. Apply all patches from source subdirectories (alphabetical), then top-level .patch files +# 2. Apply all patches using patch --forward (skips already-applied patches) # 3. Fails fast on any patch application error set -e +# Use /tmp for patch temp files to avoid macOS long-path issues +export TMPDIR="${TMPDIR_OVERRIDE:-/tmp}" + +apply_one_patch() { + local target_dir="$1" + local patch_file="$2" + local label="$3" + + # --forward skips patches that appear already applied (exit code 0 with a message) + # We check dry-run first: if reverse-apply succeeds, patch is already in. + if patch -d "$target_dir" -p1 --reverse --dry-run < "$patch_file" >/dev/null 2>&1; then + echo " Already applied, skipping: $label" + return 0 + fi + + echo " Applying: $label" + patch -d "$target_dir" -p1 --forward < "$patch_file" || { echo "FAILED: $patch_file"; exit 1; } +} + apply_patches() { local SOURCE_DIR="$(cd "$1" && pwd)" local TARGET_DIR="$2" @@ -46,11 +65,11 @@ apply_patches() { else echo "Patches [$NAME]: fetching from $REPO ($BRANCH)" - local TMPDIR - TMPDIR=$(mktemp -d) + local TMPDIR_CLONE + TMPDIR_CLONE=$(mktemp -d) - if git clone --single-branch -b "$BRANCH" "$REPO" "$TMPDIR/fork" 2>&1; then - cd "$TMPDIR/fork" + if git clone --single-branch -b "$BRANCH" "$REPO" "$TMPDIR_CLONE/fork" 2>&1; then + cd "$TMPDIR_CLONE/fork" # Auto-detect fork base: merge-base between fork and upstream git remote add upstream "$UPSTREAM_REPO" @@ -63,7 +82,7 @@ apply_patches() { if [ -z "$FORK_BASE" ]; then echo "WARNING: Could not find merge-base with upstream — skipping source '$NAME'" cd "$SOURCE_DIR" - rm -rf "$TMPDIR" + rm -rf "$TMPDIR_CLONE" continue fi @@ -81,7 +100,7 @@ apply_patches() { echo "WARNING: Failed to clone $REPO — skipping source '$NAME'" fi - rm -rf "$TMPDIR" + rm -rf "$TMPDIR_CLONE" fi done elif [ -f "$PATCHES_DIR/sources.yaml" ]; then @@ -91,13 +110,11 @@ apply_patches() { # Phase 2: Apply patches (subdirectories first, then top-level) for source_dir in $(find "$PATCHES_DIR" -mindepth 1 -maxdepth 1 -type d | sort); do for p in $(ls "$source_dir"/*.patch 2>/dev/null | sort); do - echo "Applying: $(basename "$source_dir")/$(basename "$p")" - patch -d "$TARGET_DIR" -p1 < "$p" || { echo "FAILED: $p"; exit 1; } + apply_one_patch "$TARGET_DIR" "$p" "$(basename "$source_dir")/$(basename "$p")" done done for p in $(ls "$PATCHES_DIR"/*.patch 2>/dev/null | sort); do - echo "Applying: $(basename "$p")" - patch -d "$TARGET_DIR" -p1 < "$p" || { echo "FAILED: $p"; exit 1; } + apply_one_patch "$TARGET_DIR" "$p" "$(basename "$p")" done } From 659636195c9ece31edd033f6fed4b4df00783b08 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 1 Apr 2026 19:45:31 +0000 Subject: [PATCH 3/3] deterministic builds Signed-off-by: Ettore Di Giacinto --- .github/workflows/bump_deps.yaml | 4 ++ backend/cpp/llama-cpp/Makefile | 2 + backend/cpp/llama-cpp/patches/sources.yaml | 9 ++- scripts/patch_utils/apply_patches.sh | 77 ++++++++++++++-------- 4 files changed, 64 insertions(+), 28 deletions(-) diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index ae63fd232683..9507129c70dc 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -14,6 +14,10 @@ jobs: variable: "LLAMA_VERSION" branch: "master" file: "backend/cpp/llama-cpp/Makefile" + - repository: "TheTom/llama-cpp-turboquant" + variable: "TURBOQUANT_VERSION" + branch: "feature/turboquant-kv-cache" + file: "backend/cpp/llama-cpp/Makefile" - repository: "ggml-org/whisper.cpp" variable: "WHISPER_CPP_VERSION" branch: "master" diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index 7dd393ccac95..02dbc32f2a11 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -2,6 +2,8 @@ LLAMA_VERSION?=0fcb3760b2b9a3a496ef14621a7e4dad7a8df90f LLAMA_REPO?=https://github.com/ggerganov/llama.cpp +TURBOQUANT_VERSION?=8ad0f00e9a38df6c29fc10363341dde300f92ae4 + CMAKE_ARGS?= BUILD_TYPE?= NATIVE?=false diff --git a/backend/cpp/llama-cpp/patches/sources.yaml b/backend/cpp/llama-cpp/patches/sources.yaml index 9c13ae7a4ba9..4e671f5cc11d 100644 --- a/backend/cpp/llama-cpp/patches/sources.yaml +++ b/backend/cpp/llama-cpp/patches/sources.yaml @@ -2,8 +2,13 @@ # Each source declares a fork whose commits are extracted as patches # and applied on top of upstream llama.cpp during the build. # See scripts/patch_utils/apply_patches.sh for the generic patch engine. +# +# version_var: Makefile variable with the pinned fork commit SHA +# base_var: Makefile variable with the upstream base commit SHA +# Both are read from version_file (relative to backend dir) to compute the diff. sources: - name: turboquant repo: https://github.com/TheTom/llama-cpp-turboquant.git - branch: feature/turboquant-kv-cache - upstream_repo: https://github.com/ggml-org/llama.cpp.git + version_var: TURBOQUANT_VERSION + base_var: LLAMA_VERSION + version_file: Makefile diff --git a/scripts/patch_utils/apply_patches.sh b/scripts/patch_utils/apply_patches.sh index 360240214ef8..3733743d458b 100755 --- a/scripts/patch_utils/apply_patches.sh +++ b/scripts/patch_utils/apply_patches.sh @@ -9,23 +9,32 @@ # Behavior (idempotent): # 1. If patches/sources.yaml exists and yq is available, for each source: # - If patches// already has .patch files: skip fetching (vendored) -# - Otherwise: clone the fork, auto-detect the fork base via merge-base -# with the upstream repo, and generate patches -# 2. Apply all patches using patch --forward (skips already-applied patches) +# - Otherwise: clone the fork at a pinned SHA, diff against the pinned +# upstream SHA, and generate patches +# 2. Apply all patches (skips already-applied ones) # 3. Fails fast on any patch application error +# +# sources.yaml fields: +# name — subdirectory name for this source's patches +# repo — fork git URL +# version_var — Makefile variable holding the pinned fork commit SHA +# base_var — Makefile variable holding the pinned upstream commit SHA +# version_file — Makefile path (relative to backend dir) set -e # Use /tmp for patch temp files to avoid macOS long-path issues export TMPDIR="${TMPDIR_OVERRIDE:-/tmp}" +read_makefile_var() { + grep -m1 "^${1}?=" "$2" | cut -d'=' -f2 +} + apply_one_patch() { local target_dir="$1" local patch_file="$2" local label="$3" - # --forward skips patches that appear already applied (exit code 0 with a message) - # We check dry-run first: if reverse-apply succeeds, patch is already in. if patch -d "$target_dir" -p1 --reverse --dry-run < "$patch_file" >/dev/null 2>&1; then echo " Already applied, skipping: $label" return 0 @@ -50,11 +59,22 @@ apply_patches() { SOURCE_COUNT=$(yq '.sources | length' "$PATCHES_DIR/sources.yaml") for i in $(seq 0 $((SOURCE_COUNT - 1))); do - local NAME REPO BRANCH UPSTREAM_REPO + local NAME REPO VERSION_VAR BASE_VAR VERSION_FILE NAME=$(yq ".sources[$i].name" "$PATCHES_DIR/sources.yaml") REPO=$(yq ".sources[$i].repo" "$PATCHES_DIR/sources.yaml") - BRANCH=$(yq ".sources[$i].branch" "$PATCHES_DIR/sources.yaml") - UPSTREAM_REPO=$(yq ".sources[$i].upstream_repo" "$PATCHES_DIR/sources.yaml") + VERSION_VAR=$(yq ".sources[$i].version_var" "$PATCHES_DIR/sources.yaml") + BASE_VAR=$(yq ".sources[$i].base_var" "$PATCHES_DIR/sources.yaml") + VERSION_FILE=$(yq ".sources[$i].version_file" "$PATCHES_DIR/sources.yaml") + + local MAKEFILE="$SOURCE_DIR/$VERSION_FILE" + local FORK_SHA BASE_SHA + FORK_SHA=$(read_makefile_var "$VERSION_VAR" "$MAKEFILE") + BASE_SHA=$(read_makefile_var "$BASE_VAR" "$MAKEFILE") + + if [ -z "$FORK_SHA" ] || [ -z "$BASE_SHA" ]; then + echo "WARNING: Could not read $VERSION_VAR or $BASE_VAR from $MAKEFILE — skipping '$NAME'" + continue + fi local SOURCE_PATCH_DIR="$PATCHES_DIR/$NAME" local EXISTING @@ -63,36 +83,41 @@ apply_patches() { if [ "$EXISTING" -gt 0 ]; then echo "Patches [$NAME]: $EXISTING patches already present — skipping fetch." else - echo "Patches [$NAME]: fetching from $REPO ($BRANCH)" + echo "Patches [$NAME]: generating from $REPO" + echo " base (upstream): ${BASE_SHA:0:12}" + echo " head (fork): ${FORK_SHA:0:12}" local TMPDIR_CLONE TMPDIR_CLONE=$(mktemp -d) - if git clone --single-branch -b "$BRANCH" "$REPO" "$TMPDIR_CLONE/fork" 2>&1; then + if git clone "$REPO" "$TMPDIR_CLONE/fork" 2>&1; then cd "$TMPDIR_CLONE/fork" - # Auto-detect fork base: merge-base between fork and upstream - git remote add upstream "$UPSTREAM_REPO" - git fetch upstream 2>&1 - - local FORK_BASE - FORK_BASE=$(git merge-base HEAD upstream/master 2>/dev/null || \ - git merge-base HEAD upstream/main 2>/dev/null || echo "") - - if [ -z "$FORK_BASE" ]; then - echo "WARNING: Could not find merge-base with upstream — skipping source '$NAME'" - cd "$SOURCE_DIR" - rm -rf "$TMPDIR_CLONE" - continue + # Fetch the upstream base commit (may not be in the fork's history) + git fetch origin "$FORK_SHA" 2>&1 || true + git checkout "$FORK_SHA" 2>&1 + + # We need the base commit in the history to compute the diff. + # If the fork is a real GitHub fork, it shares history with upstream. + # Otherwise, fetch it explicitly. + if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then + echo " Base commit not in fork history — fetching from upstream" + local UPSTREAM_URL + # Derive upstream URL from base_var context or use llama.cpp default + UPSTREAM_URL=$(yq ".sources[$i].upstream_repo // \"\"" "$PATCHES_DIR/sources.yaml") + if [ -n "$UPSTREAM_URL" ] && [ "$UPSTREAM_URL" != "null" ]; then + git remote add upstream "$UPSTREAM_URL" 2>/dev/null || true + git fetch upstream 2>&1 + fi fi local PATCH_COUNT - PATCH_COUNT=$(git rev-list --count "$FORK_BASE"..HEAD 2>/dev/null || echo "0") - echo " Fork base: ${FORK_BASE:0:12} ($PATCH_COUNT commits to extract)" + PATCH_COUNT=$(git rev-list --count "$BASE_SHA".."$FORK_SHA" 2>/dev/null || echo "0") + echo " $PATCH_COUNT commits in diff" if [ "$PATCH_COUNT" -gt 0 ]; then mkdir -p "$SOURCE_PATCH_DIR" - git format-patch "$FORK_BASE"..HEAD -o "$SOURCE_PATCH_DIR/" >/dev/null 2>&1 + git format-patch "$BASE_SHA".."$FORK_SHA" -o "$SOURCE_PATCH_DIR/" >/dev/null 2>&1 echo " Generated $PATCH_COUNT patches in patches/$NAME/" fi cd "$SOURCE_DIR"