From a72325b0d5be7592afe87f01571e913e04a8e394 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 7 Jun 2026 18:48:40 -0700 Subject: [PATCH 1/4] fix(example): avoid duplicate streamed response deltas (#2285) --- examples/server/server.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/server/server.py b/examples/server/server.py index 64a16f0bd..ec83fe6e3 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -8695,7 +8695,7 @@ def _ensure_reasoning_stream_item( state, "response.output_item.added", output_index=item_state.output_index, - item=item, + item=copy.deepcopy(item), ), self._response_event( state, @@ -8703,7 +8703,7 @@ def _ensure_reasoning_stream_item( item_id=cast(str, item["id"]), output_index=item_state.output_index, content_index=0, - part=part, + part=copy.deepcopy(part), ), ], item_state @@ -8727,7 +8727,7 @@ def _ensure_message_stream_item( state, "response.output_item.added", output_index=item_state.output_index, - item=item, + item=copy.deepcopy(item), ), self._response_event( state, @@ -8735,7 +8735,7 @@ def _ensure_message_stream_item( item_id=cast(str, item["id"]), output_index=item_state.output_index, content_index=0, - part=part, + part=copy.deepcopy(part), ), ], item_state @@ -8777,7 +8777,7 @@ def _ensure_tool_stream_item( state, "response.output_item.added", output_index=item_state.output_index, - item=item, + item=copy.deepcopy(item), ) ], item_state From 411e0f40e767b2ac3b1155be73916ea94c2ddfc6 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 7 Jun 2026 20:29:46 -0700 Subject: [PATCH 2/4] fix(example): derive streaming response parser boundaries from schema (#2287) --- examples/server/server.py | 329 +++++++++++++++++++++++++++++--------- 1 file changed, 249 insertions(+), 80 deletions(-) diff --git a/examples/server/server.py b/examples/server/server.py index ec83fe6e3..28fc8f4eb 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -4344,6 +4344,11 @@ def capture(match: re.Match[str]) -> str: @staticmethod def _regex_literal_prefix(pattern: str) -> str: + literal, _ = ResponseParser._regex_literal_prefix_and_remainder(pattern) + return literal + + @staticmethod + def _regex_literal_prefix_and_remainder(pattern: str) -> Tuple[str, str]: literal: List[str] = [] index = 0 while index < len(pattern): @@ -4365,7 +4370,181 @@ def _regex_literal_prefix(pattern: str) -> str: break literal.append(char) index += 1 - return "".join(literal) + return "".join(literal), pattern[index:] + + @staticmethod + def _find_regex_group_end(pattern: str, start: int) -> int: + depth = 0 + escaped = False + in_character_class = False + for index in range(start, len(pattern)): + char = pattern[index] + if escaped: + escaped = False + continue + if char == "\\": + escaped = True + continue + if char == "[": + in_character_class = True + continue + if char == "]" and in_character_class: + in_character_class = False + continue + if in_character_class: + continue + if char == "(": + depth += 1 + continue + if char == ")": + depth -= 1 + if depth == 0: + return index + return -1 + + @classmethod + def _consume_optional_literal_prefix( + cls, + pattern: str, + ) -> Optional[Tuple[str, str]]: + if not pattern.startswith("(?:"): + return None + group_end = cls._find_regex_group_end(pattern, 0) + if group_end < 0 or group_end + 1 >= len(pattern) or pattern[group_end + 1] != "?": + return None + literal, remainder = cls._regex_literal_prefix_and_remainder(pattern[3:group_end]) + if not literal or remainder: + return None + return literal, pattern[group_end + 2 :] + + @staticmethod + def _split_regex_alternatives(pattern: str) -> List[str]: + alternatives: List[str] = [] + start = 0 + depth = 0 + escaped = False + in_character_class = False + for index, char in enumerate(pattern): + if escaped: + escaped = False + continue + if char == "\\": + escaped = True + continue + if char == "[": + in_character_class = True + continue + if char == "]" and in_character_class: + in_character_class = False + continue + if in_character_class: + continue + if char == "(": + depth += 1 + continue + if char == ")": + depth -= 1 + continue + if char == "|" and depth == 0: + alternatives.append(pattern[start:index]) + start = index + 1 + alternatives.append(pattern[start:]) + return alternatives + + @classmethod + def _regex_lookahead_literal_specs(cls, pattern: str) -> List[Tuple[str, bool]]: + if not pattern.startswith("(?="): + return [] + group_end = cls._find_regex_group_end(pattern, 0) + if group_end < 0: + return [] + literals: List[Tuple[str, bool]] = [] + for alternative in cls._split_regex_alternatives(pattern[3:group_end]): + strip_leading_whitespace = False + while alternative.startswith(r"\s*"): + strip_leading_whitespace = True + alternative = alternative[3:] + if alternative == "$": + continue + if alternative.endswith("$"): + alternative = alternative[:-1] + literal, _ = cls._regex_literal_prefix_and_remainder(alternative) + if literal: + literals.append((literal, strip_leading_whitespace)) + return literals + + @classmethod + def _regex_capture_parts( + cls, + pattern: str, + ) -> Optional[Tuple[str, str]]: + normalized = pattern.lstrip("^") + captures = [ + (index, token) + for token in ("(.*?)", "(.*)") + if (index := normalized.find(token)) >= 0 + ] + if not captures: + return None + capture_index, capture_token = min(captures, key=lambda item: item[0]) + return normalized[:capture_index], normalized[capture_index + len(capture_token) :] + + @classmethod + def _regex_capture_end_literal_specs(cls, pattern: str) -> List[Tuple[str, bool]]: + capture_parts = cls._regex_capture_parts(pattern) + if capture_parts is None: + return [] + _, suffix_pattern = capture_parts + literal_specs = cls._regex_lookahead_literal_specs(suffix_pattern) + if literal_specs: + return literal_specs + literal, _ = cls._regex_literal_prefix_and_remainder(suffix_pattern) + return [(literal, False)] if literal else [] + + @classmethod + def _regex_capture_end_literals(cls, pattern: str) -> List[str]: + return [literal for literal, _ in cls._regex_capture_end_literal_specs(pattern)] + + @classmethod + def _regex_leading_capture( + cls, + *, + field_name: str, + field_regex: str, + content_regex: Optional[str], + ) -> Optional[Dict[str, Any]]: + capture_parts = cls._regex_capture_parts(field_regex) + if capture_parts is None: + return None + prefix_pattern, _ = capture_parts + prefix_pattern = prefix_pattern.lstrip("^") + optional_prefix = cls._consume_optional_literal_prefix(prefix_pattern) + if optional_prefix is not None: + prefix_pattern = optional_prefix[1] + implicit_at_start = False + optional_capture_start = cls._consume_optional_literal_prefix(prefix_pattern) + if optional_capture_start is not None: + capture_start, prefix_pattern = optional_capture_start + implicit_at_start = True + else: + capture_start, prefix_pattern = cls._regex_literal_prefix_and_remainder(prefix_pattern) + if not capture_start or prefix_pattern: + return None + end_literals = cls._regex_capture_end_literals(field_regex) + if not end_literals: + return None + capture_end = end_literals[0] + strip_after = False + if isinstance(content_regex, str): + escaped_end = re.escape(capture_end) + strip_after = bool(re.search(escaped_end + r"\\s\*", content_regex)) + return { + "field": field_name, + "start": capture_start, + "end": capture_end, + "strip_after": strip_after, + "implicit_at_start": implicit_at_start, + } @staticmethod def _literal_suffix_prefix_length(text: str, literal: str) -> int: @@ -4702,7 +4881,14 @@ def _compile_tagged_message_plan( return None iterator = cls._compile_iterator_pattern(iterator_pattern) if iterator is None: - return None + iterator_capture = cls._compile_iterator_block_pattern(iterator_pattern) + if ( + not isinstance(iterator_capture, dict) + or not iterator_capture["start"] + or iterator_capture["allow_eof"] + ): + return None + iterator = (iterator_capture["start"], iterator_capture["end"]) items_schema = tool_calls_schema.get("items") if not isinstance(items_schema, dict): return None @@ -4715,10 +4901,11 @@ def _compile_tagged_message_plan( if isinstance(content_schema, dict) else None ) - object_regex = schema.get("x-regex") if isinstance(schema.get("x-regex"), str) else None assistant_prefix: Optional[str] = None - if isinstance(content_regex, str) and r"<\|im_start\|>assistant\n" in content_regex: - assistant_prefix = "<|im_start|>assistant\n" + if isinstance(content_regex, str): + optional_prefix = cls._consume_optional_literal_prefix(content_regex.lstrip("^")) + if optional_prefix is not None: + assistant_prefix = optional_prefix[0] leading_capture: Optional[Dict[str, Any]] = None for field_name, value_schema in properties.items(): if not isinstance(value_schema, dict): @@ -4726,43 +4913,31 @@ def _compile_tagged_message_plan( field_regex = value_schema.get("x-regex") if not isinstance(field_regex, str): continue - if "\\n" in field_regex and "" in field_regex: - leading_capture = { - "field": field_name, - "start": "\n", - "end": "", - "strip_after": True, - "implicit_at_start": "(?:\\n)?" in field_regex, - } + if field_name == "content": + continue + capture = cls._regex_leading_capture( + field_name=field_name, + field_regex=field_regex, + content_regex=content_regex, + ) + if capture is not None: + leading_capture = capture break - if leading_capture is None and isinstance(object_regex, str): - if ( - "(?P" in object_regex - and r"<\|channel\>thought\n" in object_regex - and r"\" in object_regex - ): - leading_capture = { - "field": "thinking", - "start": "<|channel>thought\n", - "end": "", - "strip_after": False, - "implicit_at_start": False, - } end_markers: List[str] = [] + content_end_marker_specs: List[Tuple[str, bool]] = [] iterator_start, iterator_end = iterator if "content" in properties: end_markers.append(iterator_start) - if isinstance(content_regex, str) and r"<\|im_end\|>" in content_regex: - end_markers.append("<|im_end|>") - if isinstance(object_regex, str) and r"" in object_regex: - end_markers.append("") + if isinstance(content_regex, str): + content_end_marker_specs = cls._regex_capture_end_literal_specs(content_regex) + end_markers.extend(literal for literal, _ in content_end_marker_specs) if not end_markers and iterator_start: end_markers.append(iterator_start) - trim_before_iterator = ( - isinstance(content_regex, str) - and r"\s*\n" in content_regex + deduped_end_markers = tuple(dict.fromkeys(end_markers)) + trim_before_iterator = any( + literal == iterator_start and strip_leading_whitespace + for literal, strip_leading_whitespace in content_end_marker_specs ) - end_marker_tuple = tuple(end_markers) direct_deltas = item_plan["kind"] == "tagged-parameters" direct_init = ( ( @@ -4773,8 +4948,8 @@ def _compile_tagged_message_plan( bool(leading_capture.get("strip_after")) if leading_capture is not None else False, bool(leading_capture.get("implicit_at_start")) if leading_capture is not None else False, trim_before_iterator, - end_marker_tuple, - tuple(marker for marker in end_marker_tuple if marker != iterator_start), + deduped_end_markers, + tuple(marker for marker in deduped_end_markers if marker != iterator_start), iterator_start, iterator_end, item_plan["function_start"], @@ -4793,9 +4968,9 @@ def _compile_tagged_message_plan( "assistant_prefix": assistant_prefix, "leading_capture": leading_capture, "content_field": "content" if "content" in properties else None, - "content_end_markers": end_marker_tuple, + "content_end_markers": deduped_end_markers, "trim_before_iterator": trim_before_iterator, - "stop_markers": tuple(marker for marker in end_marker_tuple if marker != iterator_start), + "stop_markers": tuple(marker for marker in deduped_end_markers if marker != iterator_start), "direct_init": direct_init, "iterator": { "start": iterator_start, @@ -7192,19 +7367,17 @@ def consume_completion_chunk( logprobs=logprobs, leading_delta=role_delta, ) - if self._stream_state_complete(): - return self._chunk_payloads( - chunk_id=chunk_id, - created=created, - model=model, - deltas=stream_deltas, - finish_reason=( - "tool_calls" if self._direct.saw_tool_calls else finish_reason - ), - logprobs=logprobs, - leading_delta=role_delta, - ) - self._stream_failed = True + return self._chunk_payloads( + chunk_id=chunk_id, + created=created, + model=model, + deltas=stream_deltas, + finish_reason=( + "tool_calls" if self._direct.saw_tool_calls else finish_reason + ), + logprobs=logprobs, + leading_delta=role_delta, + ) elif self._stream_plan["kind"] == "segment-message": if role_delta is not None: stream_deltas = [role_delta, *stream_deltas] @@ -7219,18 +7392,16 @@ def consume_completion_chunk( finish_reason=None, logprobs=logprobs, ) - if self._stream_state_complete(): - return self._chunk_payloads( - chunk_id=chunk_id, - created=created, - model=model, - deltas=stream_deltas, - finish_reason=( - "tool_calls" if self._message.get("tool_calls") else finish_reason - ), - logprobs=logprobs, - ) - self._stream_failed = True + return self._chunk_payloads( + chunk_id=chunk_id, + created=created, + model=model, + deltas=stream_deltas, + finish_reason=( + "tool_calls" if self._message.get("tool_calls") else finish_reason + ), + logprobs=logprobs, + ) else: previous_message = self._message partial_deltas: List[Dict[str, Any]] = [] @@ -7238,7 +7409,7 @@ def consume_completion_chunk( parsed = cast(Dict[str, Any], self._stream_state.parsed) message = self._parsed_chat_message( parsed=parsed, - partial=finish_reason is None, + partial=finish_reason is None or not self._stream_state_complete(), ) if finish_reason is None: if role_delta is not None: @@ -7253,22 +7424,20 @@ def consume_completion_chunk( finish_reason=None, logprobs=logprobs, ) - if self._stream_state_complete(): - if role_delta is not None: - partial_deltas.append(role_delta) - partial_deltas.extend(self._message_deltas(previous_message, message)) - self._message = message - return self._chunk_payloads( - chunk_id=chunk_id, - created=created, - model=model, - deltas=partial_deltas, - finish_reason=( - "tool_calls" if message.get("tool_calls") else finish_reason - ), - logprobs=logprobs, - ) - self._stream_failed = True + if role_delta is not None: + partial_deltas.append(role_delta) + partial_deltas.extend(self._message_deltas(previous_message, message)) + self._message = message + return self._chunk_payloads( + chunk_id=chunk_id, + created=created, + model=model, + deltas=partial_deltas, + finish_reason=( + "tool_calls" if message.get("tool_calls") else finish_reason + ), + logprobs=logprobs, + ) else: self._stream_failed = True From 7eb494d83ef5fe67f348a0b1a14627fa8da50e36 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 7 Jun 2026 22:35:22 -0700 Subject: [PATCH 3/4] fix(ci): repair Linux accelerator wheels (#2286) --- .github/workflows/build-wheels-cuda.yaml | 53 +++++++++++++++-- .github/workflows/build-wheels-rocm.yaml | 69 ++++++++++++++++++---- .github/workflows/build-wheels-vulkan.yaml | 57 ++++++++++-------- CHANGELOG.md | 2 + 4 files changed, 140 insertions(+), 41 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 905b9bee9..59dc3558b 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -1,6 +1,12 @@ name: Build Wheels (CUDA) -on: workflow_dispatch +on: + workflow_dispatch: + inputs: + release_tag: + description: Release tag to upload wheel assets to + required: false + type: string permissions: contents: write @@ -131,7 +137,9 @@ jobs: if ($IsWindows) { python -m pip install build wheel ninja } else { - python -m pip install build wheel + sudo apt-get update + sudo apt-get install -y patchelf + python -m pip install auditwheel build wheel } - name: Build Wheel @@ -169,6 +177,12 @@ jobs: $env:CPLUS_INCLUDE_PATH = "$cudaRoot/include$pathSeparator$env:CPLUS_INCLUDE_PATH" $env:LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LIBRARY_PATH" $env:LD_LIBRARY_PATH = "$cudaRoot/lib$pathSeparator$env:CONDA_PREFIX/lib$pathSeparator$env:LD_LIBRARY_PATH" + $cudaLibraryPaths = @( + (Join-Path $cudaRoot 'lib'), + (Join-Path $cudaRoot 'lib64'), + (Join-Path $env:CONDA_PREFIX 'lib') + ) | Where-Object { Test-Path $_ } + Write-Output "CUDA_LIBRARY_PATHS=$($cudaLibraryPaths -join ':')" >> $env:GITHUB_ENV } elseif ($IsWindows) { $ninjaPath = ((Get-Command ninja -ErrorAction Stop).Source).Replace('\', '/') $env:CMAKE_GENERATOR = 'Ninja' @@ -218,15 +232,44 @@ jobs: # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=$cudaArchs -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS" $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' + if ($IsLinux) { + $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_OPENMP=OFF' + } python -m build --wheel # Publish tags that reflect the actual installed toolkit version. Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV + - name: Repair Linux wheel + if: runner.os == 'Linux' + shell: bash + run: | + set -euxo pipefail + mkdir -p wheelhouse + export LD_LIBRARY_PATH="$PWD/llama_cpp/lib:${CUDA_LIBRARY_PATHS}:${LD_LIBRARY_PATH:-}" + auditwheel_bin="${CONDA}/envs/llamacpp/bin/auditwheel" + "${auditwheel_bin}" repair \ + --exclude libcuda.so \ + --exclude libcuda.so.1 \ + --exclude libcudart.so.11.0 \ + --exclude libcudart.so.12 \ + --exclude libcudart.so.13 \ + --exclude libcublas.so.11 \ + --exclude libcublas.so.12 \ + --exclude libcublas.so.13 \ + --exclude libcublasLt.so.11 \ + --exclude libcublasLt.so.12 \ + --exclude libcublasLt.so.13 \ + -w wheelhouse \ + dist/*.whl + rm dist/*.whl + cp wheelhouse/*.whl dist/ + "${auditwheel_bin}" show dist/*.whl + - uses: softprops/action-gh-release@v3 - if: startsWith(github.ref, 'refs/tags/') + if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '') with: files: dist/* - # Set tag_name to -cu - tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }} + # Set tag_name to -cu. + tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-cu${{ env.CUDA_VERSION }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml index 6ad0b4954..70fca5edb 100644 --- a/.github/workflows/build-wheels-rocm.yaml +++ b/.github/workflows/build-wheels-rocm.yaml @@ -1,6 +1,20 @@ name: Build Wheels (ROCm) -on: workflow_dispatch +on: + workflow_dispatch: + inputs: + release_tag: + description: Release tag to upload wheel assets to + required: false + type: string + amdgpu_targets: + description: AMDGPU targets to compile into the Linux ROCm wheel + required: false + default: gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201 + windows_amdgpu_targets: + description: AMDGPU targets to compile into the Windows HIP Radeon wheel + required: false + default: gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032 permissions: contents: write @@ -24,7 +38,7 @@ jobs: - name: Install system dependencies run: | apt-get update - apt-get install -y --no-install-recommends git cmake lsb-release ninja-build + apt-get install -y --no-install-recommends git cmake lsb-release ninja-build patchelf - uses: actions/checkout@v6 with: @@ -37,9 +51,12 @@ jobs: - name: Install build dependencies run: | python -m pip install --upgrade pip - python -m pip install build wheel + python -m pip install auditwheel build wheel - name: Build ROCm wheel + env: + MATRIX_AMDGPU_TARGETS: ${{ matrix.amdgpu_targets }} + INPUT_AMDGPU_TARGETS: ${{ inputs.amdgpu_targets }} run: | export ROCM_PATH="${ROCM_PATH:-/opt/rocm}" export HIP_PATH="${HIP_PATH:-$ROCM_PATH}" @@ -56,11 +73,38 @@ jobs: rocm_tag="$(hipconfig --version | sed -E 's/^([0-9]+)\.([0-9]+).*/\1\2/')" echo "ROCM_VERSION=$rocm_tag" >> "$GITHUB_ENV" - amdgpu_targets="${{ matrix.amdgpu_targets }}" - export CMAKE_ARGS="-DGGML_HIP=on -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off -DAMDGPU_TARGETS=$amdgpu_targets -DCMAKE_HIP_ARCHITECTURES=$amdgpu_targets" + amdgpu_targets="${INPUT_AMDGPU_TARGETS:-$MATRIX_AMDGPU_TARGETS}" + export CMAKE_ARGS="-DGGML_HIP=on -DGGML_NATIVE=off -DGGML_OPENMP=OFF -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off -DAMDGPU_TARGETS=$amdgpu_targets -DCMAKE_HIP_ARCHITECTURES=$amdgpu_targets" python -m build --wheel + + - name: Repair Linux wheel + run: | + export ROCM_PATH="${ROCM_PATH:-/opt/rocm}" + export LD_LIBRARY_PATH="$PWD/llama_cpp/lib:$ROCM_PATH/lib:$ROCM_PATH/lib64:${LD_LIBRARY_PATH:-}" mkdir -p wheelhouse - cp dist/*.whl wheelhouse/ + python -m auditwheel repair \ + --exclude libamdhip64.so \ + --exclude libamdhip64.so.6 \ + --exclude libamdhip64.so.7 \ + --exclude libhiprtc.so \ + --exclude libhiprtc.so.6 \ + --exclude libhiprtc.so.7 \ + --exclude libhipblas.so \ + --exclude libhipblas.so.2 \ + --exclude libhipblas.so.3 \ + --exclude libhipblaslt.so \ + --exclude libhipblaslt.so.0 \ + --exclude libhipblaslt.so.1 \ + --exclude librocblas.so \ + --exclude librocblas.so.4 \ + --exclude librocblas.so.5 \ + --exclude libhsa-runtime64.so.1 \ + --exclude libhsakmt.so.1 \ + -w wheelhouse \ + dist/*.whl + rm dist/*.whl + cp wheelhouse/*.whl dist/ + python -m auditwheel show dist/*.whl - uses: actions/upload-artifact@v7 with: @@ -139,11 +183,14 @@ jobs: & $clangPath.FullName --version - name: Build HIP wheel + env: + MATRIX_AMDGPU_TARGETS: ${{ matrix.amdgpu_targets }} + INPUT_AMDGPU_TARGETS: ${{ inputs.windows_amdgpu_targets }} run: | $ErrorActionPreference = "Stop" $hipPath = Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Split-Path | Split-Path $rocwmmaInclude = (Join-Path $PWD 'opt\rocm-7.2.1\include').Replace('\', '/') - $amdgpuTargets = "${{ matrix.amdgpu_targets }}" + $amdgpuTargets = if ($env:INPUT_AMDGPU_TARGETS) { $env:INPUT_AMDGPU_TARGETS } else { $env:MATRIX_AMDGPU_TARGETS } $env:HIP_PATH = $hipPath $env:ROCM_PATH = $hipPath @@ -198,7 +245,7 @@ jobs: release_rocm: name: Release ROCm needs: [build_wheels] - if: startsWith(github.ref, 'refs/tags/') + if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '') runs-on: ubuntu-latest steps: @@ -211,14 +258,14 @@ jobs: with: files: dist/* # Set release name to -rocm. - tag_name: ${{ github.ref_name }}-rocm72 + tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-rocm72 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} release_hip: name: Release HIP needs: [build_wheels_windows_hip] - if: startsWith(github.ref, 'refs/tags/') + if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '') runs-on: ubuntu-latest steps: @@ -231,6 +278,6 @@ jobs: with: files: dist/* # Set release name to -hip-radeon. - tag_name: ${{ github.ref_name }}-hip-radeon + tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-hip-radeon env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-wheels-vulkan.yaml b/.github/workflows/build-wheels-vulkan.yaml index 8790cc20e..822662c3f 100644 --- a/.github/workflows/build-wheels-vulkan.yaml +++ b/.github/workflows/build-wheels-vulkan.yaml @@ -1,6 +1,12 @@ name: Build Wheels (Vulkan) -on: workflow_dispatch +on: + workflow_dispatch: + inputs: + release_tag: + description: Release tag to upload wheel assets to + required: false + type: string permissions: contents: write @@ -40,23 +46,6 @@ jobs: python-version: ${{ matrix.pyver }} cache: "pip" - - name: Install Vulkan SDK - if: runner.os == 'Linux' - run: | - curl -fL \ - "https://sdk.lunarg.com/sdk/download/${VULKAN_SDK_VERSION}/linux/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.xz" \ - -o vulkan-sdk.tar.xz - echo "${VULKAN_SDK_LINUX_SHA256} vulkan-sdk.tar.xz" | sha256sum -c - - mkdir -p "$RUNNER_TEMP/vulkan-sdk" - tar -xf vulkan-sdk.tar.xz -C "$RUNNER_TEMP/vulkan-sdk" - source "$RUNNER_TEMP/vulkan-sdk/${VULKAN_SDK_VERSION}/setup-env.sh" - { - echo "VULKAN_SDK=$VULKAN_SDK" - echo "LD_LIBRARY_PATH=$VULKAN_SDK/lib:${LD_LIBRARY_PATH:-}" - } >> "$GITHUB_ENV" - echo "$VULKAN_SDK/bin" >> "$GITHUB_PATH" - "$VULKAN_SDK/bin/glslc" --version - - name: Install Vulkan SDK if: runner.os == 'Windows' shell: pwsh @@ -71,6 +60,7 @@ jobs: & "$vulkanSdk\Bin\glslc.exe" --version - name: Install build dependencies + if: runner.os == 'Windows' run: | python -m pip install --upgrade pip python -m pip install build wheel @@ -81,11 +71,28 @@ jobs: - name: Build Vulkan wheel if: runner.os == 'Linux' - run: | - export CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_METAL=OFF -DGGML_VULKAN=on" - python -m build --wheel - mkdir -p wheelhouse - cp dist/*.whl wheelhouse/ + uses: pypa/cibuildwheel@v3.4.1 + env: + CIBW_BUILD: "cp38-manylinux_*" + CIBW_ARCHS: "auto64" + CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014" + CIBW_BEFORE_ALL_LINUX: > + yum install -y xz && + curl -L https://micro.mamba.pm/api/micromamba/linux-64/latest -o /tmp/micromamba.tar.bz2 && + mkdir -p /tmp/micromamba && + tar -xjf /tmp/micromamba.tar.bz2 -C /tmp/micromamba bin/micromamba && + /tmp/micromamba/bin/micromamba create -y -p /opt/vulkan -c conda-forge shaderc libvulkan-loader spirv-headers && + /opt/vulkan/bin/glslc --version && + curl -fL "https://sdk.lunarg.com/sdk/download/${{ env.VULKAN_SDK_VERSION }}/linux/vulkansdk-linux-x86_64-${{ env.VULKAN_SDK_VERSION }}.tar.xz" -o /tmp/vulkan-sdk.tar.xz && + echo "${{ env.VULKAN_SDK_LINUX_SHA256 }} /tmp/vulkan-sdk.tar.xz" | sha256sum -c - && + mkdir -p /opt/vulkan-sdk && + tar -xf /tmp/vulkan-sdk.tar.xz -C /opt/vulkan-sdk + CIBW_ENVIRONMENT_LINUX: > + CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_METAL=OFF -DGGML_OPENMP=OFF -DGGML_VULKAN=on -DCMAKE_PREFIX_PATH=/opt/vulkan -DVulkan_INCLUDE_DIR=/opt/vulkan-sdk/${{ env.VULKAN_SDK_VERSION }}/x86_64/include -DVulkan_LIBRARY=/opt/vulkan/lib/libvulkan.so -DVulkan_GLSLC_EXECUTABLE=/opt/vulkan/bin/glslc" + CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib:/opt/vulkan/lib auditwheel repair --exclude libvulkan.so.1 -w {dest_dir} {wheel}" + with: + package-dir: . + output-dir: wheelhouse - name: Build Vulkan wheel if: runner.os == 'Windows' @@ -105,7 +112,7 @@ jobs: release: name: Release needs: [build_wheels] - if: startsWith(github.ref, 'refs/tags/') + if: startsWith(github.ref, 'refs/tags/') || (github.event_name == 'workflow_dispatch' && inputs.release_tag != '') runs-on: ubuntu-latest steps: @@ -118,6 +125,6 @@ jobs: with: files: dist/* # Set release name to -vulkan. - tag_name: ${{ github.ref_name }}-vulkan + tag_name: ${{ github.event_name == 'workflow_dispatch' && inputs.release_tag || github.ref_name }}-vulkan env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index b83ccc83b..e358b871f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Repair Linux accelerator wheels for manylinux publishing + ## [0.3.28] - feat(example): align server MTP support with llama.cpp by @abetlen in #2283 From d4ac2c2cc0dfab3e078c72101961b93a1261d791 Mon Sep 17 00:00:00 2001 From: Andrei Date: Sun, 7 Jun 2026 22:56:19 -0700 Subject: [PATCH 4/4] fix(example): support multi-step Responses tool streaming (#2288) * fix(example): support multi-step Responses tool streaming * docs: add Responses tool streaming changelog --- CHANGELOG.md | 1 + examples/server/server.py | 112 +++++++++++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e358b871f..ac792bc2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(example): support multi-step Responses tool streaming by @abetlen in #2288 - fix(ci): Repair Linux accelerator wheels for manylinux publishing ## [0.3.28] diff --git a/examples/server/server.py b/examples/server/server.py index 28fc8f4eb..fb00501cf 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -2812,6 +2812,7 @@ def to_chat_template_tool(self) -> ChatTemplateTool: class ResponsesCustomToolFormat(BaseModel): model_config = ConfigDict(extra="ignore") + type: Optional[str] = None syntax: Optional[str] = None definition: Optional[str] = None @@ -2880,10 +2881,24 @@ class ResponsesWebSearchTool(BaseModel): type: Literal["web_search"] +class ResponsesNamespaceTool(BaseModel): + model_config = ConfigDict(extra="ignore") + + type: Literal["namespace"] + + +class ResponsesImageGenerationTool(BaseModel): + model_config = ConfigDict(extra="ignore") + + type: Literal["image_generation"] + + ResponsesToolDefinition = Union[ ResponsesFunctionTool, ResponsesCustomTool, ResponsesWebSearchTool, + ResponsesNamespaceTool, + ResponsesImageGenerationTool, ] @@ -5069,6 +5084,68 @@ def _tool_content_type(self, tool_name: str) -> Optional[str]: return content_type return None + def _raw_string_tool_arguments(self, tool_name: str, value: str) -> Optional[Dict[str, str]]: + if self._tools is None: + return None + for tool in self._tools: + if tool.get("type") != "function": + continue + function = tool.get("function", {}) + if function.get("name") != tool_name: + continue + parameters = function.get("parameters") + if not isinstance(parameters, dict): + return None + required = parameters.get("required") + if not isinstance(required, list) or len(required) != 1: + return None + argument_name = required[0] + if not isinstance(argument_name, str): + return None + properties = parameters.get("properties") + if not isinstance(properties, dict): + return None + argument_schema = properties.get(argument_name) + if not isinstance(argument_schema, dict): + return None + argument_type = argument_schema.get("type") + if argument_type == "string" or ( + isinstance(argument_type, list) and "string" in argument_type + ): + return {argument_name: value} + return None + return None + + @classmethod + def _raw_object_tool_arguments(cls, value: str) -> Optional[Dict[str, Any]]: + candidates = [value] + stripped = value.strip() + if stripped.startswith("{{") and stripped.endswith("}}"): + candidates.append(stripped[1:-1]) + for candidate in candidates: + normalized = cls._gemma4_tool_call_to_json(candidate) + for allow_partial in (False, True): + try: + parsed = from_json(normalized, allow_partial=allow_partial) + except ValueError: + continue + if isinstance(parsed, dict): + return { + key: cls._trim_partial_gemma_quote_marker(value) + if isinstance(value, str) + else value + for key, value in parsed.items() + } + return None + + @staticmethod + def _trim_partial_gemma_quote_marker(value: str) -> str: + quote_marker = '<|"|>' + for prefix_length in range(len(quote_marker) - 1, 0, -1): + if value.endswith(quote_marker[:prefix_length]): + return value[:-prefix_length] + return value + def _has_text_tools(self) -> bool: return any( isinstance(tool_schema, dict) and tool_schema.get("content_type") == "text" @@ -5637,6 +5714,18 @@ def _advance_direct_stream_state(self, text: str) -> Tuple[bool, List[Dict[str, self._direct.saw_tool_calls = saw_tool_calls self._direct.done = done return True, deltas + if leading_capture_field is not None: + if buffer.startswith(leading_capture_start): + buffer = buffer[len(leading_capture_start) :] + mode = self.DIRECT_MODE_LEADING_CAPTURE + continue + if leading_capture_start.startswith(buffer): + self._direct.pending = buffer + self._direct.mode = mode + self._direct.tool_call_count = tool_call_count + self._direct.saw_tool_calls = saw_tool_calls + self._direct.done = done + return True, deltas if buffer.startswith(iterator_start): saw_tool_calls = True self._start_direct_tool_call(tool_call_count) @@ -6302,6 +6391,16 @@ def _advance_stream_state(self, text: str) -> Tuple[bool, List[Dict[str, Any]]]: if not buffer: state.pending = "" return True, deltas + leading_capture = plan.get("leading_capture") + if leading_capture is not None: + capture_start = leading_capture["start"] + if buffer.startswith(capture_start): + buffer = buffer[len(capture_start) :] + state.mode = "leading-capture" + continue + if capture_start.startswith(buffer): + state.pending = buffer + return True, deltas if buffer.startswith(iterator_start): item_state = self._new_tool_call_state(plan["iterator"]["item"]) state.saw_tool_calls = True @@ -6866,6 +6965,10 @@ def _normalize_tool_call_item( }, } arguments = function.get("arguments", {}) + if isinstance(arguments, str): + arguments = self._raw_object_tool_arguments(arguments) or self._raw_string_tool_arguments( + tool_name, arguments + ) if not isinstance(arguments, (dict, ResponseParser.PartialJsonObject)): if partial: return None @@ -8009,7 +8112,14 @@ def _responses_tools_to_chat_tools( return None chat_tools: List[ChatTemplateTool] = [] for tool in tools: - if isinstance(tool, ResponsesWebSearchTool): + if isinstance( + tool, + ( + ResponsesWebSearchTool, + ResponsesNamespaceTool, + ResponsesImageGenerationTool, + ), + ): continue if isinstance(tool, ResponsesFunctionTool): chat_tools.append(tool.to_chat_template_tool())