From a72325b0d5be7592afe87f01571e913e04a8e394 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 7 Jun 2026 18:48:40 -0700
Subject: [PATCH 1/4] fix(example): avoid duplicate streamed response deltas
 (#2285)

---
 examples/server/server.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/server/server.py b/examples/server/server.py
index 64a16f0bd..ec83fe6e3 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -8695,7 +8695,7 @@ def _ensure_reasoning_stream_item(
                 state,
                 "response.output_item.added",
                 output_index=item_state.output_index,
-                item=item,
+                item=copy.deepcopy(item),
             ),
             self._response_event(
                 state,
@@ -8703,7 +8703,7 @@ def _ensure_reasoning_stream_item(
                 item_id=cast(str, item["id"]),
                 output_index=item_state.output_index,
                 content_index=0,
-                part=part,
+                part=copy.deepcopy(part),
             ),
         ], item_state
 
@@ -8727,7 +8727,7 @@ def _ensure_message_stream_item(
                 state,
                 "response.output_item.added",
                 output_index=item_state.output_index,
-                item=item,
+                item=copy.deepcopy(item),
             ),
             self._response_event(
                 state,
@@ -8735,7 +8735,7 @@ def _ensure_message_stream_item(
                 item_id=cast(str, item["id"]),
                 output_index=item_state.output_index,
                 content_index=0,
-                part=part,
+                part=copy.deepcopy(part),
             ),
         ], item_state
 
@@ -8777,7 +8777,7 @@ def _ensure_tool_stream_item(
                 state,
                 "response.output_item.added",
                 output_index=item_state.output_index,
-                item=item,
+                item=copy.deepcopy(item),
             )
         ], item_state
 

From 411e0f40e767b2ac3b1155be73916ea94c2ddfc6 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 7 Jun 2026 20:29:46 -0700
Subject: [PATCH 2/4] fix(example): derive streaming response parser boundaries
 from schema (#2287)

---
 examples/server/server.py | 329 +++++++++++++++++++++++++++++---------
 1 file changed, 249 insertions(+), 80 deletions(-)

diff --git a/examples/server/server.py b/examples/server/server.py
index ec83fe6e3..28fc8f4eb 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -4344,6 +4344,11 @@ def capture(match: re.Match[str]) -> str:
 
     @staticmethod
     def _regex_literal_prefix(pattern: str) -> str:
+        literal, _ = ResponseParser._regex_literal_prefix_and_remainder(pattern)  # keep only the literal part; the remainder is discarded
+        return literal
+
+    @staticmethod
+    def _regex_literal_prefix_and_remainder(pattern: str) -> Tuple[str, str]:
         literal: List[str] = []
         index = 0
         while index < len(pattern):
@@ -4365,7 +4370,181 @@ def _regex_literal_prefix(pattern: str) -> str:
                 break
             literal.append(char)
             index += 1
-        return "".join(literal)
+        return "".join(literal), pattern[index:]
+
+    @staticmethod
+    def _find_regex_group_end(pattern: str, start: int) -> int:  # index of the ")" that returns paren depth to 0 when scanning from start, else -1
+        depth = 0  # "(" seen minus ")" seen, ignoring escaped chars and character classes
+        escaped = False  # set when the previous char was a backslash that escapes this one
+        in_character_class = False  # set between "[" and the next unescaped "]"
+        for index in range(start, len(pattern)):
+            char = pattern[index]
+            if escaped:  # escaped char: never treated as syntax
+                escaped = False
+                continue
+            if char == "\\":
+                escaped = True
+                continue
+            if char == "[":
+                in_character_class = True
+                continue
+            if char == "]" and in_character_class:
+                in_character_class = False
+                continue
+            if in_character_class:  # parentheses inside [...] are ignored
+                continue
+            if char == "(":
+                depth += 1
+                continue
+            if char == ")":
+                depth -= 1
+                if depth == 0:  # the group being tracked just closed
+                    return index
+        return -1  # no ")" brought the depth back to 0
+
+    @classmethod
+    def _consume_optional_literal_prefix(  # parse a leading "(?:literal)?" group; returns (literal, rest of pattern) or None
+        cls,
+        pattern: str,
+    ) -> Optional[Tuple[str, str]]:
+        if not pattern.startswith("(?:"):
+            return None
+        group_end = cls._find_regex_group_end(pattern, 0)
+        if group_end < 0 or group_end + 1 >= len(pattern) or pattern[group_end + 1] != "?":  # group must close and be directly followed by "?"
+            return None
+        literal, remainder = cls._regex_literal_prefix_and_remainder(pattern[3:group_end])  # [3:group_end] is the group body without "(?:" and ")"
+        if not literal or remainder:  # body must be a non-empty literal with nothing left unparsed
+            return None
+        return literal, pattern[group_end + 2 :]  # skip past ")?"
+
+    @staticmethod
+    def _split_regex_alternatives(pattern: str) -> List[str]:  # split on "|" that is unescaped, outside [...] and at paren depth 0
+        alternatives: List[str] = []
+        start = 0  # index where the alternative currently being scanned begins
+        depth = 0  # "(" seen minus ")" seen, ignoring escaped chars and character classes
+        escaped = False  # set when the previous char was a backslash that escapes this one
+        in_character_class = False  # set between "[" and the next unescaped "]"
+        for index, char in enumerate(pattern):
+            if escaped:  # escaped char: never treated as syntax
+                escaped = False
+                continue
+            if char == "\\":
+                escaped = True
+                continue
+            if char == "[":
+                in_character_class = True
+                continue
+            if char == "]" and in_character_class:
+                in_character_class = False
+                continue
+            if in_character_class:  # "|" and parentheses inside [...] are ignored
+                continue
+            if char == "(":
+                depth += 1
+                continue
+            if char == ")":
+                depth -= 1
+                continue
+            if char == "|" and depth == 0:  # top-level separator: close the current alternative
+                alternatives.append(pattern[start:index])
+                start = index + 1
+        alternatives.append(pattern[start:])  # trailing alternative; a pattern without "|" yields [pattern]
+        return alternatives
+
+    @classmethod
+    def _regex_lookahead_literal_specs(cls, pattern: str) -> List[Tuple[str, bool]]:  # (literal, had leading \s*) per alternative of a leading "(?=...)"
+        if not pattern.startswith("(?="):  # only a lookahead at the very start of pattern is handled
+            return []
+        group_end = cls._find_regex_group_end(pattern, 0)
+        if group_end < 0:  # unterminated lookahead group
+            return []
+        literals: List[Tuple[str, bool]] = []
+        for alternative in cls._split_regex_alternatives(pattern[3:group_end]):  # [3:group_end] is the lookahead body
+            strip_leading_whitespace = False
+            while alternative.startswith(r"\s*"):  # drop every leading "\s*" (3 characters each) and remember it was there
+                strip_leading_whitespace = True
+                alternative = alternative[3:]
+            if alternative == "$":  # a bare end-of-input alternative contributes no literal
+                continue
+            if alternative.endswith("$"):  # drop a trailing "$" before extracting the literal
+                alternative = alternative[:-1]
+            literal, _ = cls._regex_literal_prefix_and_remainder(alternative)  # any non-literal tail of the alternative is ignored
+            if literal:
+                literals.append((literal, strip_leading_whitespace))
+        return literals
+
+    @classmethod
+    def _regex_capture_parts(  # split pattern around its earliest "(.*?)" or "(.*)" token; returns (before, after) or None
+        cls,
+        pattern: str,
+    ) -> Optional[Tuple[str, str]]:
+        normalized = pattern.lstrip("^")  # leading "^" anchors are not part of the returned prefix
+        captures = [
+            (index, token)
+            for token in ("(.*?)", "(.*)")
+            if (index := normalized.find(token)) >= 0  # keep only the tokens that actually occur
+        ]
+        if not captures:  # neither capture token is present
+            return None
+        capture_index, capture_token = min(captures, key=lambda item: item[0])  # the token that occurs first in the pattern
+        return normalized[:capture_index], normalized[capture_index + len(capture_token) :]
+
+    @classmethod
+    def _regex_capture_end_literal_specs(cls, pattern: str) -> List[Tuple[str, bool]]:  # literal(s) that follow the capture token in pattern
+        capture_parts = cls._regex_capture_parts(pattern)
+        if capture_parts is None:  # no "(.*?)" / "(.*)" capture in pattern
+            return []
+        _, suffix_pattern = capture_parts  # only the text after the capture token matters here
+        literal_specs = cls._regex_lookahead_literal_specs(suffix_pattern)  # preferred: alternatives of a "(?=...)" right after the capture
+        if literal_specs:
+            return literal_specs
+        literal, _ = cls._regex_literal_prefix_and_remainder(suffix_pattern)  # fallback: plain literal right after the capture
+        return [(literal, False)] if literal else []  # the fallback never reports leading-whitespace stripping
+
+    @classmethod
+    def _regex_capture_end_literals(cls, pattern: str) -> List[str]:  # same as _regex_capture_end_literal_specs, without the whitespace flags
+        return [literal for literal, _ in cls._regex_capture_end_literal_specs(pattern)]
+
+    @classmethod
+    def _regex_leading_capture(  # derive the leading-capture spec dict for one field from its regex, or None if the regex shape is unsupported
+        cls,
+        *,
+        field_name: str,
+        field_regex: str,
+        content_regex: Optional[str],
+    ) -> Optional[Dict[str, Any]]:
+        capture_parts = cls._regex_capture_parts(field_regex)
+        if capture_parts is None:  # field_regex has no "(.*?)" / "(.*)" capture
+            return None
+        prefix_pattern, _ = capture_parts  # only the text before the capture token is parsed here
+        prefix_pattern = prefix_pattern.lstrip("^")  # NOTE(review): _regex_capture_parts already strips leading "^", so this looks like a no-op
+        optional_prefix = cls._consume_optional_literal_prefix(prefix_pattern)
+        if optional_prefix is not None:  # first "(?:literal)?" group is skipped; its literal is not used
+            prefix_pattern = optional_prefix[1]
+        implicit_at_start = False
+        optional_capture_start = cls._consume_optional_literal_prefix(prefix_pattern)  # a further optional group means the start marker itself is optional
+        if optional_capture_start is not None:
+            capture_start, prefix_pattern = optional_capture_start
+            implicit_at_start = True
+        else:
+            capture_start, prefix_pattern = cls._regex_literal_prefix_and_remainder(prefix_pattern)  # otherwise the start marker is a plain literal
+        if not capture_start or prefix_pattern:  # need a start marker and nothing unparsed before the capture
+            return None
+        end_literals = cls._regex_capture_end_literals(field_regex)
+        if not end_literals:  # no literal found after the capture to use as end marker
+            return None
+        capture_end = end_literals[0]  # only the first end literal is used
+        strip_after = False
+        if isinstance(content_regex, str):
+            escaped_end = re.escape(capture_end)
+            strip_after = bool(re.search(escaped_end + r"\\s\*", content_regex))  # true when content_regex contains the end literal followed by the text "\s*"
+        return {
+            "field": field_name,
+            "start": capture_start,
+            "end": capture_end,
+            "strip_after": strip_after,
+            "implicit_at_start": implicit_at_start,
+        }
 
     @staticmethod
     def _literal_suffix_prefix_length(text: str, literal: str) -> int:
@@ -4702,7 +4881,14 @@ def _compile_tagged_message_plan(
             return None
         iterator = cls._compile_iterator_pattern(iterator_pattern)
         if iterator is None:
-            return None
+            iterator_capture = cls._compile_iterator_block_pattern(iterator_pattern)
+            if (
+                not isinstance(iterator_capture, dict)
+                or not iterator_capture["start"]
+                or iterator_capture["allow_eof"]
+            ):
+                return None
+            iterator = (iterator_capture["start"], iterator_capture["end"])
         items_schema = tool_calls_schema.get("items")
         if not isinstance(items_schema, dict):
             return None
@@ -4715,10 +4901,11 @@ def _compile_tagged_message_plan(
             if isinstance(content_schema, dict)
             else None
         )
-        object_regex = schema.get("x-regex") if isinstance(schema.get("x-regex"), str) else None
         assistant_prefix: Optional[str] = None
-        if isinstance(content_regex, str) and r"<\|im_start\|>assistant\n" in content_regex:
-            assistant_prefix = "<|im_start|>assistant\n"
+        if isinstance(content_regex, str):
+            optional_prefix = cls._consume_optional_literal_prefix(content_regex.lstrip("^"))
+            if optional_prefix is not None:
+                assistant_prefix = optional_prefix[0]
         leading_capture: Optional[Dict[str, Any]] = None
         for field_name, value_schema in properties.items():
             if not isinstance(value_schema, dict):
@@ -4726,43 +4913,31 @@ def _compile_tagged_message_plan(
             field_regex = value_schema.get("x-regex")
             if not isinstance(field_regex, str):
                 continue
-            if "<think>\\n" in field_regex and "</think>" in field_regex:
-                leading_capture = {
-                    "field": field_name,
-                    "start": "<think>\n",
-                    "end": "</think>",
-                    "strip_after": True,
-                    "implicit_at_start": "(?:<think>\\n)?" in field_regex,
-                }
+            if field_name == "content":
+                continue
+            capture = cls._regex_leading_capture(
+                field_name=field_name,
+                field_regex=field_regex,
+                content_regex=content_regex,
+            )
+            if capture is not None:
+                leading_capture = capture
                 break
-        if leading_capture is None and isinstance(object_regex, str):
-            if (
-                "(?P<thinking>" in object_regex
-                and r"<\|channel\>thought\n" in object_regex
-                and r"\<channel\|\>" in object_regex
-            ):
-                leading_capture = {
-                    "field": "thinking",
-                    "start": "<|channel>thought\n",
-                    "end": "<channel|>",
-                    "strip_after": False,
-                    "implicit_at_start": False,
-                }
         end_markers: List[str] = []
+        content_end_marker_specs: List[Tuple[str, bool]] = []
         iterator_start, iterator_end = iterator
         if "content" in properties:
             end_markers.append(iterator_start)
-        if isinstance(content_regex, str) and r"<\|im_end\|>" in content_regex:
-            end_markers.append("<|im_end|>")
-        if isinstance(object_regex, str) and r"<turn\|>" in object_regex:
-            end_markers.append("<turn|>")
+        if isinstance(content_regex, str):
+            content_end_marker_specs = cls._regex_capture_end_literal_specs(content_regex)
+            end_markers.extend(literal for literal, _ in content_end_marker_specs)
         if not end_markers and iterator_start:
             end_markers.append(iterator_start)
-        trim_before_iterator = (
-            isinstance(content_regex, str)
-            and r"\s*<tool_call>\n" in content_regex
+        deduped_end_markers = tuple(dict.fromkeys(end_markers))
+        trim_before_iterator = any(
+            literal == iterator_start and strip_leading_whitespace
+            for literal, strip_leading_whitespace in content_end_marker_specs
         )
-        end_marker_tuple = tuple(end_markers)
         direct_deltas = item_plan["kind"] == "tagged-parameters"
         direct_init = (
             (
@@ -4773,8 +4948,8 @@ def _compile_tagged_message_plan(
                 bool(leading_capture.get("strip_after")) if leading_capture is not None else False,
                 bool(leading_capture.get("implicit_at_start")) if leading_capture is not None else False,
                 trim_before_iterator,
-                end_marker_tuple,
-                tuple(marker for marker in end_marker_tuple if marker != iterator_start),
+                deduped_end_markers,
+                tuple(marker for marker in deduped_end_markers if marker != iterator_start),
                 iterator_start,
                 iterator_end,
                 item_plan["function_start"],
@@ -4793,9 +4968,9 @@ def _compile_tagged_message_plan(
             "assistant_prefix": assistant_prefix,
             "leading_capture": leading_capture,
             "content_field": "content" if "content" in properties else None,
-            "content_end_markers": end_marker_tuple,
+            "content_end_markers": deduped_end_markers,
             "trim_before_iterator": trim_before_iterator,
-            "stop_markers": tuple(marker for marker in end_marker_tuple if marker != iterator_start),
+            "stop_markers": tuple(marker for marker in deduped_end_markers if marker != iterator_start),
             "direct_init": direct_init,
             "iterator": {
                 "start": iterator_start,
@@ -7192,19 +7367,17 @@ def consume_completion_chunk(
                             logprobs=logprobs,
                             leading_delta=role_delta,
                         )
-                    if self._stream_state_complete():
-                        return self._chunk_payloads(
-                            chunk_id=chunk_id,
-                            created=created,
-                            model=model,
-                            deltas=stream_deltas,
-                            finish_reason=(
-                                "tool_calls" if self._direct.saw_tool_calls else finish_reason
-                            ),
-                            logprobs=logprobs,
-                            leading_delta=role_delta,
-                        )
-                    self._stream_failed = True
+                    return self._chunk_payloads(
+                        chunk_id=chunk_id,
+                        created=created,
+                        model=model,
+                        deltas=stream_deltas,
+                        finish_reason=(
+                            "tool_calls" if self._direct.saw_tool_calls else finish_reason
+                        ),
+                        logprobs=logprobs,
+                        leading_delta=role_delta,
+                    )
                 elif self._stream_plan["kind"] == "segment-message":
                     if role_delta is not None:
                         stream_deltas = [role_delta, *stream_deltas]
@@ -7219,18 +7392,16 @@ def consume_completion_chunk(
                             finish_reason=None,
                             logprobs=logprobs,
                         )
-                    if self._stream_state_complete():
-                        return self._chunk_payloads(
-                            chunk_id=chunk_id,
-                            created=created,
-                            model=model,
-                            deltas=stream_deltas,
-                            finish_reason=(
-                                "tool_calls" if self._message.get("tool_calls") else finish_reason
-                            ),
-                            logprobs=logprobs,
-                        )
-                    self._stream_failed = True
+                    return self._chunk_payloads(
+                        chunk_id=chunk_id,
+                        created=created,
+                        model=model,
+                        deltas=stream_deltas,
+                        finish_reason=(
+                            "tool_calls" if self._message.get("tool_calls") else finish_reason
+                        ),
+                        logprobs=logprobs,
+                    )
                 else:
                     previous_message = self._message
                     partial_deltas: List[Dict[str, Any]] = []
@@ -7238,7 +7409,7 @@ def consume_completion_chunk(
                     parsed = cast(Dict[str, Any], self._stream_state.parsed)
                     message = self._parsed_chat_message(
                         parsed=parsed,
-                        partial=finish_reason is None,
+                        partial=finish_reason is None or not self._stream_state_complete(),
                     )
                     if finish_reason is None:
                         if role_delta is not None:
@@ -7253,22 +7424,20 @@ def consume_completion_chunk(
                             finish_reason=None,
                             logprobs=logprobs,
                         )
-                    if self._stream_state_complete():
-                        if role_delta is not None:
-                            partial_deltas.append(role_delta)
-                        partial_deltas.extend(self._message_deltas(previous_message, message))
-                        self._message = message
-                        return self._chunk_payloads(
-                            chunk_id=chunk_id,
-                            created=created,
-                            model=model,
-                            deltas=partial_deltas,
-                            finish_reason=(
-                                "tool_calls" if message.get("tool_calls") else finish_reason
-                            ),
-                            logprobs=logprobs,
-                        )
-                    self._stream_failed = True
+                    if role_delta is not None:
+                        partial_deltas.append(role_delta)
+                    partial_deltas.extend(self._message_deltas(previous_message, message))
+                    self._message = message
+                    return self._chunk_payloads(
+                        chunk_id=chunk_id,
+                        created=created,
+                        model=model,
+                        deltas=partial_deltas,
+                        finish_reason=(
+                            "tool_calls" if message.get("tool_calls") else finish_reason
+                        ),
+                        logprobs=logprobs,
+                    )
             else:
                 self._stream_failed = True
 

From 7eb494d83ef5fe67f348a0b1a14627fa8da50e36 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 7 Jun 2026 22:35:22 -0700
Subject: [PATCH 3/4] fix(ci): repair Linux accelerator wheels (#2286)

---
 .github/workflows/build-wheels-cuda.yaml   | 53 +++++++++++++++--
 .github/workflows/build-wheels-rocm.yaml   | 69 ++++++++++++++++++----
 .github/workflows/build-wheels-vulkan.yaml | 57 ++++++++++--------
 CHANGELOG.md                               |  2 +
 4 files changed, 140 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index 905b9bee9..59dc3558b 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -1,6 +1,12 @@
 name: Build Wheels (CUDA)
 
-on: workflow_dispatch
+on:
+  workflow_dispatch:
+    inputs:
+      release_tag:
+        description: Release tag to upload wheel assets to
+        required: false
+        type: string
 
 permissions:
   contents: write
@@ -131,7 +137,9 @@ jobs:
           if ($IsWindows) {
             python -m pip install build wheel ninja
           } else {
-            python -m pip install build wheel
+            sudo apt-get update
+            sudo apt-get install -y patchelf
+            python -m pip install auditwheel build wheel
           }
 
       - name: Build Wheel
@@ -169,6 +177,12 @@ jobs:
             $env:CPLUS_INCLUDE_PATH = "$cudaRoot/include$pathSeparator$env:CPLUS_INCLUDE_PATH"
             $env:LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LIBRARY_PATH"
             $env:LD_LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LD_LIBRARY_PATH"
+            $cudaLibraryPaths = @(
+              (Join-Path $cudaRoot 'lib'),
+              (Join-Path $cudaRoot 'lib64'),
+              (Join-Path $env:CONDA_PREFIX 'lib')
+            ) | Where-Object { Test-Path $_ }
+            Write-Output "CUDA_LIBRARY_PATHS=$($cudaLibraryPaths -join ':')" >> $env:GITHUB_ENV
           } elseif ($IsWindows) {
             $ninjaPath = ((Get-Command ninja -ErrorAction Stop).Source).Replace('\', '/')
             $env:CMAKE_GENERATOR = 'Ninja'
@@ -218,15 +232,44 @@ jobs:
           # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
           $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=$cudaArchs -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS"
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
+          if ($IsLinux) {
+            $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_OPENMP=OFF'
+          }
           python -m build --wheel
           # Publish tags that reflect the actual installed toolkit version.
           Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV
 
+      - name: Repair Linux wheel
+        if: runner.os == 'Linux'
+        shell: bash
+        run: |
+          set -euxo pipefail
+          mkdir -p wheelhouse
+          export LD_LIBRARY_PATH="$PWD/llama_cpp/lib:${CUDA_LIBRARY_PATHS}:${LD_LIBRARY_PATH:-}"
+          auditwheel_bin="${CONDA}/envs/llamacpp/bin/auditwheel"
+          "${auditwheel_bin}" repair \
+            --exclude libcuda.so \
+            --exclude libcuda.so.1 \
+            --exclude libcudart.so.11.0 \
+            --exclude libcudart.so.12 \
+            --exclude libcudart.so.13 \
+            --exclude libcublas.so.11 \
+            --exclude libcublas.so.12 \
+            --exclude libcublas.so.13 \
+            --exclude libcublasLt.so.11 \
+            --exclude libcublasLt.so.12 \
+            --exclude libcublasLt.so.13 \
+            -w wheelhouse \
+            dist/*.whl
+          rm dist/*.whl
+          cp wheelhouse/*.whl dist/
+          "${auditwheel_bin}" show dist/*.whl
+
       - uses: softprops/action-gh-release@v3
-        if: startsWith(github.ref, 'refs/tags/')
+        if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '')
         with:
           files: dist/*
-          # Set tag_name to <tag>-cu<cuda_version>
-          tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
+          # Set tag_name to <tag>-cu<cuda_version>.
+          tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-cu${{ env.CUDA_VERSION }}
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml
index 6ad0b4954..70fca5edb 100644
--- a/.github/workflows/build-wheels-rocm.yaml
+++ b/.github/workflows/build-wheels-rocm.yaml
@@ -1,6 +1,20 @@
 name: Build Wheels (ROCm)
 
-on: workflow_dispatch
+on:
+  workflow_dispatch:
+    inputs:
+      release_tag:
+        description: Release tag to upload wheel assets to
+        required: false
+        type: string
+      amdgpu_targets:
+        description: AMDGPU targets to compile into the Linux ROCm wheel
+        required: false
+        default: gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
+      windows_amdgpu_targets:
+        description: AMDGPU targets to compile into the Windows HIP Radeon wheel
+        required: false
+        default: gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032
 
 permissions:
   contents: write
@@ -24,7 +38,7 @@ jobs:
       - name: Install system dependencies
         run: |
           apt-get update
-          apt-get install -y --no-install-recommends git cmake lsb-release ninja-build
+          apt-get install -y --no-install-recommends git cmake lsb-release ninja-build patchelf
 
       - uses: actions/checkout@v6
         with:
@@ -37,9 +51,12 @@ jobs:
       - name: Install build dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install build wheel
+          python -m pip install auditwheel build wheel
 
       - name: Build ROCm wheel
+        env:
+          MATRIX_AMDGPU_TARGETS: ${{ matrix.amdgpu_targets }}
+          INPUT_AMDGPU_TARGETS: ${{ inputs.amdgpu_targets }}
         run: |
           export ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
           export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
@@ -56,11 +73,38 @@ jobs:
           rocm_tag="$(hipconfig --version | sed -E 's/^([0-9]+)\.([0-9]+).*/\1\2/')"
           echo "ROCM_VERSION=$rocm_tag" >> "$GITHUB_ENV"
 
-          amdgpu_targets="${{ matrix.amdgpu_targets }}"
-          export CMAKE_ARGS="-DGGML_HIP=on -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off -DAMDGPU_TARGETS=$amdgpu_targets -DCMAKE_HIP_ARCHITECTURES=$amdgpu_targets"
+          amdgpu_targets="${INPUT_AMDGPU_TARGETS:-$MATRIX_AMDGPU_TARGETS}"
+          export CMAKE_ARGS="-DGGML_HIP=on -DGGML_NATIVE=off -DGGML_OPENMP=OFF -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off -DAMDGPU_TARGETS=$amdgpu_targets -DCMAKE_HIP_ARCHITECTURES=$amdgpu_targets"
           python -m build --wheel
+
+      - name: Repair Linux wheel
+        run: |
+          export ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+          export LD_LIBRARY_PATH="$PWD/llama_cpp/lib:$ROCM_PATH/lib:$ROCM_PATH/lib64:${LD_LIBRARY_PATH:-}"
           mkdir -p wheelhouse
-          cp dist/*.whl wheelhouse/
+          python -m auditwheel repair \
+            --exclude libamdhip64.so \
+            --exclude libamdhip64.so.6 \
+            --exclude libamdhip64.so.7 \
+            --exclude libhiprtc.so \
+            --exclude libhiprtc.so.6 \
+            --exclude libhiprtc.so.7 \
+            --exclude libhipblas.so \
+            --exclude libhipblas.so.2 \
+            --exclude libhipblas.so.3 \
+            --exclude libhipblaslt.so \
+            --exclude libhipblaslt.so.0 \
+            --exclude libhipblaslt.so.1 \
+            --exclude librocblas.so \
+            --exclude librocblas.so.4 \
+            --exclude librocblas.so.5 \
+            --exclude libhsa-runtime64.so.1 \
+            --exclude libhsakmt.so.1 \
+            -w wheelhouse \
+            dist/*.whl
+          rm dist/*.whl
+          cp wheelhouse/*.whl dist/
+          python -m auditwheel show dist/*.whl
 
       - uses: actions/upload-artifact@v7
         with:
@@ -139,11 +183,14 @@ jobs:
           & $clangPath.FullName --version
 
       - name: Build HIP wheel
+        env:
+          MATRIX_AMDGPU_TARGETS: ${{ matrix.amdgpu_targets }}
+          INPUT_AMDGPU_TARGETS: ${{ inputs.windows_amdgpu_targets }}
         run: |
           $ErrorActionPreference = "Stop"
           $hipPath = Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Split-Path | Split-Path
           $rocwmmaInclude = (Join-Path $PWD 'opt\rocm-7.2.1\include').Replace('\', '/')
-          $amdgpuTargets = "${{ matrix.amdgpu_targets }}"
+          $amdgpuTargets = if ($env:INPUT_AMDGPU_TARGETS) { $env:INPUT_AMDGPU_TARGETS } else { $env:MATRIX_AMDGPU_TARGETS }
 
           $env:HIP_PATH = $hipPath
           $env:ROCM_PATH = $hipPath
@@ -198,7 +245,7 @@ jobs:
   release_rocm:
     name: Release ROCm
     needs: [build_wheels]
-    if: startsWith(github.ref, 'refs/tags/')
+    if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '')
     runs-on: ubuntu-latest
 
     steps:
@@ -211,14 +258,14 @@ jobs:
         with:
           files: dist/*
           # Set release name to <tag>-rocm<rocm_version>.
-          tag_name: ${{ github.ref_name }}-rocm72
+          tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-rocm72
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
   release_hip:
     name: Release HIP
     needs: [build_wheels_windows_hip]
-    if: startsWith(github.ref, 'refs/tags/')
+    if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '')
     runs-on: ubuntu-latest
 
     steps:
@@ -231,6 +278,6 @@ jobs:
         with:
           files: dist/*
           # Set release name to <tag>-hip-radeon.
-          tag_name: ${{ github.ref_name }}-hip-radeon
+          tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-hip-radeon
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/build-wheels-vulkan.yaml b/.github/workflows/build-wheels-vulkan.yaml
index 8790cc20e..822662c3f 100644
--- a/.github/workflows/build-wheels-vulkan.yaml
+++ b/.github/workflows/build-wheels-vulkan.yaml
@@ -1,6 +1,12 @@
 name: Build Wheels (Vulkan)
 
-on: workflow_dispatch
+on:
+  workflow_dispatch:
+    inputs:
+      release_tag:
+        description: Release tag to upload wheel assets to
+        required: false
+        type: string
 
 permissions:
   contents: write
@@ -40,23 +46,6 @@ jobs:
           python-version: ${{ matrix.pyver }}
           cache: "pip"
 
-      - name: Install Vulkan SDK
-        if: runner.os == 'Linux'
-        run: |
-          curl -fL \
-            "https://sdk.lunarg.com/sdk/download/${VULKAN_SDK_VERSION}/linux/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.xz" \
-            -o vulkan-sdk.tar.xz
-          echo "${VULKAN_SDK_LINUX_SHA256}  vulkan-sdk.tar.xz" | sha256sum -c -
-          mkdir -p "$RUNNER_TEMP/vulkan-sdk"
-          tar -xf vulkan-sdk.tar.xz -C "$RUNNER_TEMP/vulkan-sdk"
-          source "$RUNNER_TEMP/vulkan-sdk/${VULKAN_SDK_VERSION}/setup-env.sh"
-          {
-            echo "VULKAN_SDK=$VULKAN_SDK"
-            echo "LD_LIBRARY_PATH=$VULKAN_SDK/lib:${LD_LIBRARY_PATH:-}"
-          } >> "$GITHUB_ENV"
-          echo "$VULKAN_SDK/bin" >> "$GITHUB_PATH"
-          "$VULKAN_SDK/bin/glslc" --version
-
       - name: Install Vulkan SDK
         if: runner.os == 'Windows'
         shell: pwsh
@@ -71,6 +60,7 @@ jobs:
           & "$vulkanSdk\Bin\glslc.exe" --version
 
       - name: Install build dependencies
+        if: runner.os == 'Windows'
         run: |
           python -m pip install --upgrade pip
           python -m pip install build wheel
@@ -81,11 +71,28 @@ jobs:
 
       - name: Build Vulkan wheel
         if: runner.os == 'Linux'
-        run: |
-          export CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_METAL=OFF -DGGML_VULKAN=on"
-          python -m build --wheel
-          mkdir -p wheelhouse
-          cp dist/*.whl wheelhouse/
+        uses: pypa/cibuildwheel@v3.4.1
+        env:  # before-all: conda-forge toolchain in /opt/vulkan, checksum-verified LunarG SDK in /opt/vulkan-sdk
+          CIBW_BUILD: "cp38-manylinux_*"
+          CIBW_ARCHS: "auto64"
+          CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014"
+          CIBW_BEFORE_ALL_LINUX: >
+            yum install -y xz &&
+            curl -fL https://micro.mamba.pm/api/micromamba/linux-64/latest -o /tmp/micromamba.tar.bz2 &&
+            mkdir -p /tmp/micromamba &&
+            tar -xjf /tmp/micromamba.tar.bz2 -C /tmp/micromamba bin/micromamba &&
+            /tmp/micromamba/bin/micromamba create -y -p /opt/vulkan -c conda-forge shaderc libvulkan-loader spirv-headers &&
+            /opt/vulkan/bin/glslc --version &&
+            curl -fL "https://sdk.lunarg.com/sdk/download/${{ env.VULKAN_SDK_VERSION }}/linux/vulkansdk-linux-x86_64-${{ env.VULKAN_SDK_VERSION }}.tar.xz" -o /tmp/vulkan-sdk.tar.xz &&
+            echo "${{ env.VULKAN_SDK_LINUX_SHA256 }}  /tmp/vulkan-sdk.tar.xz" | sha256sum -c - &&
+            mkdir -p /opt/vulkan-sdk &&
+            tar -xf /tmp/vulkan-sdk.tar.xz -C /opt/vulkan-sdk
+          CIBW_ENVIRONMENT_LINUX: >
+            CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_METAL=OFF -DGGML_OPENMP=OFF -DGGML_VULKAN=on -DCMAKE_PREFIX_PATH=/opt/vulkan -DVulkan_INCLUDE_DIR=/opt/vulkan-sdk/${{ env.VULKAN_SDK_VERSION }}/x86_64/include -DVulkan_LIBRARY=/opt/vulkan/lib/libvulkan.so -DVulkan_GLSLC_EXECUTABLE=/opt/vulkan/bin/glslc"
+          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib:/opt/vulkan/lib auditwheel repair --exclude libvulkan.so.1 -w {dest_dir} {wheel}"
+        with:
+          package-dir: .
+          output-dir: wheelhouse
 
       - name: Build Vulkan wheel
         if: runner.os == 'Windows'
@@ -105,7 +112,7 @@ jobs:
   release:
     name: Release
     needs: [build_wheels]
-    if: startsWith(github.ref, 'refs/tags/')
+    if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '')
     runs-on: ubuntu-latest
 
     steps:
@@ -118,6 +125,6 @@ jobs:
         with:
           files: dist/*
           # Set release name to <tag>-vulkan.
-          tag_name: ${{ github.ref_name }}-vulkan
+          tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-vulkan
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b83ccc83b..e358b871f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(ci): Repair Linux accelerator wheels for manylinux publishing
+
 ## [0.3.28]
 
 - feat(example): align server MTP support with llama.cpp by @abetlen in #2283

From d4ac2c2cc0dfab3e078c72101961b93a1261d791 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Sun, 7 Jun 2026 22:56:19 -0700
Subject: [PATCH 4/4] fix(example): support multi-step Responses tool streaming
 (#2288)

* fix(example): support multi-step Responses tool streaming

* docs: add Responses tool streaming changelog
---
 CHANGELOG.md              |   1 +
 examples/server/server.py | 112 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e358b871f..ac792bc2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(example): support multi-step Responses tool streaming by @abetlen in #2288
 - fix(ci): Repair Linux accelerator wheels for manylinux publishing
 
 ## [0.3.28]
diff --git a/examples/server/server.py b/examples/server/server.py
index 28fc8f4eb..fb00501cf 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -2812,6 +2812,7 @@ def to_chat_template_tool(self) -> ChatTemplateTool:
 class ResponsesCustomToolFormat(BaseModel):
     model_config = ConfigDict(extra="ignore")
 
+    type: Optional[str] = None
     syntax: Optional[str] = None
     definition: Optional[str] = None
 
@@ -2880,10 +2881,24 @@ class ResponsesWebSearchTool(BaseModel):
     type: Literal["web_search"]
 
 
+class ResponsesNamespaceTool(BaseModel):
+    model_config = ConfigDict(extra="ignore")  # fields not declared on this model are ignored
+
+    type: Literal["namespace"]  # only tool entries whose ``type`` is "namespace" validate here
+
+
+class ResponsesImageGenerationTool(BaseModel):
+    model_config = ConfigDict(extra="ignore")  # fields not declared on this model are ignored
+
+    type: Literal["image_generation"]  # only entries whose ``type`` is "image_generation" validate here
+
+
 ResponsesToolDefinition = Union[
     ResponsesFunctionTool,
     ResponsesCustomTool,
     ResponsesWebSearchTool,
+    ResponsesNamespaceTool,
+    ResponsesImageGenerationTool,
 ]
 
 
@@ -5069,6 +5084,68 @@ def _tool_content_type(self, tool_name: str) -> Optional[str]:
             return content_type
         return None
 
+    def _raw_string_tool_arguments(self, tool_name: str, value: str) -> Optional[Dict[str, str]]:
+        """Wrap ``value`` as the single required string argument of ``tool_name``.
+
+        Only the first function tool in ``self._tools`` named ``tool_name`` is
+        examined. Its ``parameters`` must list exactly one ``required`` name whose
+        property schema has a ``type`` that is, or includes, ``"string"``.
+
+        Returns ``{argument_name: value}`` on a match, otherwise ``None``.
+        """
+        for tool in self._tools or ():
+            if tool.get("type") != "function":
+                continue
+            function = tool.get("function", {})
+            if function.get("name") != tool_name:
+                continue
+            # Decide on the first tool with a matching name; later ones are never consulted.
+            parameters = function.get("parameters")
+            if not isinstance(parameters, dict):
+                return None
+            required = parameters.get("required")
+            properties = parameters.get("properties")
+            if not (isinstance(required, list) and len(required) == 1):
+                return None
+            argument_name = required[0]
+            if not (isinstance(argument_name, str) and isinstance(properties, dict)):
+                return None
+            schema = properties.get(argument_name)
+            declared = schema.get("type") if isinstance(schema, dict) else None
+            is_string = declared == "string" or (isinstance(declared, list) and "string" in declared)
+            return {argument_name: value} if is_string else None
+        return None
+
+    @classmethod
+    def _raw_object_tool_arguments(cls, value: str) -> Optional[Dict[str, Any]]:
+        """Parse ``value`` as an argument object; return ``None`` if no candidate yields a dict.
+
+        Candidates are ``value`` and, when its stripped form is wrapped in ``{{``/``}}``, that
+        form minus one outer brace pair. Each is parsed first with ``allow_partial=False``, then ``True``.
+        """
+        stripped = value.strip()
+        doubled = stripped.startswith("{{") and stripped.endswith("}}")
+        trim = cls._trim_partial_gemma_quote_marker
+        for candidate in ([value, stripped[1:-1]] if doubled else [value]):
+            normalized = cls._gemma4_tool_call_to_json(candidate)
+            for allow_partial in (False, True):
+                try:
+                    parsed = from_json(normalized, allow_partial=allow_partial)
+                except ValueError:
+                    continue
+                if isinstance(parsed, dict):
+                    # String values lose any dangling, incomplete quote-marker prefix.
+                    return {k: trim(v) if isinstance(v, str) else v for k, v in parsed.items()}
+        return None
+
+    @staticmethod
+    def _trim_partial_gemma_quote_marker(value: str) -> str:
+        """Return ``value`` without a trailing proper prefix of the ``<|"|>`` quote marker."""
+        marker = '<|"|>'
+        # Longest proper prefix is tried first; 0 means no partial marker at the end.
+        cut = next((n for n in range(len(marker) - 1, 0, -1) if value.endswith(marker[:n])), 0)
+        return value[:-cut] if cut else value
+
     def _has_text_tools(self) -> bool:
         return any(
             isinstance(tool_schema, dict) and tool_schema.get("content_type") == "text"
@@ -5637,6 +5714,18 @@ def _advance_direct_stream_state(self, text: str) -> Tuple[bool, List[Dict[str,
                     self._direct.saw_tool_calls = saw_tool_calls
                     self._direct.done = done
                     return True, deltas
+                if leading_capture_field is not None:
+                    if buffer.startswith(leading_capture_start):
+                        buffer = buffer[len(leading_capture_start) :]
+                        mode = self.DIRECT_MODE_LEADING_CAPTURE
+                        continue
+                    if leading_capture_start.startswith(buffer):
+                        self._direct.pending = buffer
+                        self._direct.mode = mode
+                        self._direct.tool_call_count = tool_call_count
+                        self._direct.saw_tool_calls = saw_tool_calls
+                        self._direct.done = done
+                        return True, deltas
                 if buffer.startswith(iterator_start):
                     saw_tool_calls = True
                     self._start_direct_tool_call(tool_call_count)
@@ -6302,6 +6391,16 @@ def _advance_stream_state(self, text: str) -> Tuple[bool, List[Dict[str, Any]]]:
                 if not buffer:
                     state.pending = ""
                     return True, deltas
+                leading_capture = plan.get("leading_capture")
+                if leading_capture is not None:
+                    capture_start = leading_capture["start"]
+                    if buffer.startswith(capture_start):
+                        buffer = buffer[len(capture_start) :]
+                        state.mode = "leading-capture"
+                        continue
+                    if capture_start.startswith(buffer):
+                        state.pending = buffer
+                        return True, deltas
                 if buffer.startswith(iterator_start):
                     item_state = self._new_tool_call_state(plan["iterator"]["item"])
                     state.saw_tool_calls = True
@@ -6866,6 +6965,10 @@ def _normalize_tool_call_item(
                 },
             }
         arguments = function.get("arguments", {})
+        if isinstance(arguments, str):
+            arguments = self._raw_object_tool_arguments(arguments) or self._raw_string_tool_arguments(
+                tool_name, arguments
+            )
         if not isinstance(arguments, (dict, ResponseParser.PartialJsonObject)):
             if partial:
                 return None
@@ -8009,7 +8112,14 @@ def _responses_tools_to_chat_tools(
             return None
         chat_tools: List[ChatTemplateTool] = []
         for tool in tools:
-            if isinstance(tool, ResponsesWebSearchTool):
+            if isinstance(
+                tool,
+                (
+                    ResponsesWebSearchTool,
+                    ResponsesNamespaceTool,
+                    ResponsesImageGenerationTool,
+                ),
+            ):
                 continue
             if isinstance(tool, ResponsesFunctionTool):
                 chat_tools.append(tool.to_chat_template_tool())