diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index 843fe37d062..9df86d0489b 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -5,6 +5,9 @@ # Define the CANN base image for easier version updates later ARG CHIP_TYPE=910b ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A # ============================================================================== # BUILD STAGE @@ -55,6 +58,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full && \ cp build/bin/* /app/full/ && \ cp *.py /app/full/ && \ + cp -r conversion /app/full/ && \ cp -r gguf-py /app/full/ && \ cp -r requirements /app/full/ && \ cp requirements.txt /app/full/ @@ -67,6 +71,19 @@ RUN mkdir -p /app/full && \ # ============================================================================== FROM ${CANN_BASE_IMAGE} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + # -- Install runtime dependencies -- RUN yum install -y libgomp curl && \ yum clean all && \ diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index d6579ecf1ad..c19b7038bbe 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -27,6 +30,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ @@ -35,6 +39,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index b3f6ccfc984..621fe8b6a97 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_CUDA_DEV_CONTAINER} AS build # CUDA architecture to build for (defaults to all supported archs) @@ -32,6 +36,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ @@ -40,6 +45,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_CUDA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index da164dcfa5b..b127c5cec46 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -1,4 +1,7 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ## Build Image @@ -33,6 +36,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ @@ -40,6 +44,19 @@ RUN mkdir -p /app/full \ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + ARG IGC_VERSION=v2.20.5 ARG IGC_VERSION_FULL=2_2.20.5+19972 ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10 diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index d54e70838f2..447d871ac4f 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -1,4 +1,7 @@ ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ascendai/cann:$ASCEND_VERSION AS build @@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \ # TODO: use image with NNRT FROM ascendai/cann:$ASCEND_VERSION AS runtime + +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion / ENV LC_ALL=C.utf8 diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 665a76f58ce..3194294b36d 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -6,6 +6,10 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_MUSA_DEV_CONTAINER} AS build # MUSA architecture to build for (defaults to all supported archs) @@ -37,6 +41,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ @@ -45,6 +50,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_MUSA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 30355d2fc99..86d9d589d35 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -3,6 +3,7 @@ glibc, config, stdenv, + stdenvNoCC, runCommand, cmake, ninja, @@ -19,6 +20,8 @@ openssl, shaderc, spirv-headers, + nodejs, + importNpmLock, useBlas ? builtins.all (x: !x) [ useCuda @@ -130,7 +133,31 @@ effectiveStdenv.mkDerivation (finalAttrs: { src = lib.cleanSource ../../.; }; - postPatch = '' + # Builds the webui locally, taking care not to require updating any sha256 hash. + webui = stdenvNoCC.mkDerivation { + pname = "webui"; + version = llamaVersion; + src = lib.cleanSource ../../tools/ui; + + nativeBuildInputs = [ + nodejs + importNpmLock.linkNodeModulesHook + ]; + + # no sha256 required when using buildNodeModules + npmDeps = importNpmLock.buildNodeModules { + npmRoot = ../../tools/ui; + inherit nodejs; + }; + + installPhase = '' + LLAMA_UI_OUT_DIR=$out npm run build --offline + ''; + }; + + postPatch = lib.optionalString useWebUi '' + cp -r ${finalAttrs.webui} tools/ui/dist + chmod -R u+w tools/ui/dist ''; # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015, diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index 31b58736d7e..6dabdb323ca 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2 ARG http_proxy= ARG https_proxy= +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ## Build Image FROM ubuntu:${UBUNTU_VERSION} AS build @@ -77,6 +81,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/ReleaseOV/bin/* /app/full/ \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ @@ -88,6 +93,18 @@ FROM ubuntu:${UBUNTU_VERSION} AS base # Pass proxy args to runtime stage ARG http_proxy ARG https_proxy +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE RUN apt-get update \ && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 525ddc79051..3fdf7a8e487 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -7,6 +7,10 @@ ARG AMDGPU_VERSION=7.2.1 # Target the ROCm build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ### Build image FROM ${BASE_ROCM_DEV_CONTAINER} AS build @@ -49,6 +53,7 @@ RUN mkdir -p /app/lib \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ @@ -57,6 +62,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_ROCM_DEV_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index 757cd97cd4c..31c2fa902d4 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -1,5 +1,8 @@ ARG GCC_VERSION=15.2.0 ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ### Build Llama.cpp stage FROM gcc:${GCC_VERSION} AS build @@ -34,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \ COPY *.py /opt/llama.cpp/bin COPY .devops/tools.sh /opt/llama.cpp/bin +COPY conversion /opt/llama.cpp/conversion COPY gguf-py /opt/llama.cpp/gguf-py COPY requirements.txt /opt/llama.cpp/gguf-py @@ -44,14 +48,28 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements FROM scratch AS collector # Copy llama.cpp binaries and libraries -COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin -COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib -COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py +COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin +COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib +COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py +COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion ### Base image FROM ubuntu:${UBUNTU_VERSION} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ apt update -y && \ @@ -91,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y COPY --from=collector /llama.cpp/bin /app COPY --from=collector /llama.cpp/gguf-py /app/gguf-py +COPY --from=collector /llama.cpp/conversion /app/conversion RUN pip install --no-cache-dir --break-system-packages \ -r /app/gguf-py/requirements.txt diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index f4d199ed426..138a50d7da5 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=26.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -23,6 +26,7 @@ RUN mkdir -p /app/lib && \ RUN mkdir -p /app/full \ && cp build/bin/* /app/full \ && cp *.py /app/full \ + && cp -r conversion /app/full \ && cp -r gguf-py /app/full \ && cp -r requirements /app/full \ && cp requirements.txt /app/full \ @@ -31,6 +35,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \ libglvnd0 libgl1 libglx0 libegl1 libgles2 \ diff --git a/.devops/zendnn.Dockerfile b/.devops/zendnn.Dockerfile new file mode 100644 index 00000000000..76299b49e40 --- /dev/null +++ b/.devops/zendnn.Dockerfile @@ -0,0 +1,101 @@ +ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + +FROM ubuntu:$UBUNTU_VERSION AS build + +RUN apt-get update && \ + apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates + +ENV CC=gcc-13 CXX=g++-13 + +WORKDIR /app + +COPY . . + +RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \ + cmake --build build -j $(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so*" -exec cp -P {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r conversion /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ubuntu:$UBUNTU_VERSION AS base + +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + +RUN apt-get update \ + && apt-get install -y libgomp1 libnuma1 curl \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + python3-wheel \ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.editorconfig b/.editorconfig index 7c1af01a1d8..5663b8fdbfc 100644 --- a/.editorconfig +++ b/.editorconfig @@ -45,7 +45,7 @@ insert_final_newline = unset trim_trailing_whitespace = unset insert_final_newline = unset -[tools/server/webui/**] +[tools/ui/**] indent_style = unset indent_size = unset end_of_line = unset diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml index c7001edf050..23150d0b619 100644 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -100,8 +100,8 @@ body: label: Relevant log output description: > Please copy and paste any relevant log output, including the command that you entered and any generated text. - For very long logs (thousands of lines), preferably upload them as files instead. - On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command. + For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose. + On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command. value: |
Logs diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index 831c98eb637..041a7cdb2ee 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -88,8 +88,8 @@ body: description: > If applicable, please copy and paste any relevant log output, including any generated text. If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well. - For very long logs (thousands of lines), please upload them as files instead. - On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command. + For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose. + On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command. value: |
Logs diff --git a/.github/actions/ccache-clear/action.yml b/.github/actions/ccache-clear/action.yml new file mode 100644 index 00000000000..d38587efaf8 --- /dev/null +++ b/.github/actions/ccache-clear/action.yml @@ -0,0 +1,22 @@ +name: "ccache-clear" +description: "Delete all GitHub Actions caches matching a key prefix" +inputs: + key: + description: "Cache key prefix to match and delete" + required: true + +runs: + using: "composite" + steps: + - name: Clear caches + shell: bash + run: | + CACHES=$(gh cache list --key "ccache-${{ inputs.key }}" --json id,key --jq '.[] | "\(.id) \(.key)"' 2>/dev/null) + if [ -z "$CACHES" ]; then + echo "No caches found with key prefix: ${{ inputs.key }}" + exit 0 + fi + while read -r id key; do + echo "Deleting cache: $id ($key)" + gh cache delete "$id" + done <<< "$CACHES" diff --git a/.github/actions/linux-setup-spacemit/action.yml b/.github/actions/linux-setup-spacemit/action.yml index e2193e8931d..39e405b6779 100644 --- a/.github/actions/linux-setup-spacemit/action.yml +++ b/.github/actions/linux-setup-spacemit/action.yml @@ -15,6 +15,6 @@ runs: id: setup uses: ./.github/actions/unarchive-tar with: - url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz + url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz path: ${{ inputs.path }} strip: 1 diff --git a/.github/actions/unarchive-tar/action.yml b/.github/actions/unarchive-tar/action.yml index b97e402f46a..3d2f9be7bdd 100644 --- a/.github/actions/unarchive-tar/action.yml +++ b/.github/actions/unarchive-tar/action.yml @@ -24,4 +24,4 @@ runs: run: | mkdir -p ${{ inputs.path }} cd ${{ inputs.path }} - curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }} + curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }} diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml index 6ad61582a50..43c63ce44f0 100644 --- a/.github/actions/windows-setup-cuda/action.yml +++ b/.github/actions/windows-setup-cuda/action.yml @@ -96,3 +96,34 @@ runs: echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Install Cuda Toolkit 13.3 + if: ${{ inputs.cuda_version == '13.3' }} + shell: pwsh + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.3.33-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.3.29-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.3.33-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.3.33-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.5.1.27-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.3.33-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.3.29-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.3.27-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.3.27-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cccl/windows-x86_64/cccl-windows-x86_64-13.3.3.3.1-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_crt-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_cudart-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvcc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvrtc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libcublas-windows-x86_64-13.5.1.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libnvvm-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvtx-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_profiler_api-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\visual_studio_integration-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cccl-windows-x86_64-13.3.3.3.1-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V13_3=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 diff --git a/.github/labeler.yml b/.github/labeler.yml index 2120672eee5..60aa51d2cc2 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -73,10 +73,10 @@ android: - changed-files: - any-glob-to-any-file: - examples/llama.android/** -server/webui: +server/ui: - changed-files: - any-glob-to-any-file: - - tools/server/webui/** + - tools/ui/** server: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/build-3rd-party.yml b/.github/workflows/build-3rd-party.yml index 642d9786441..82e53dbafb3 100644 --- a/.github/workflows/build-3rd-party.yml +++ b/.github/workflows/build-3rd-party.yml @@ -22,9 +22,9 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: ubuntu-24-llguidance: diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml index ef3fe502fa7..3e857d48e39 100644 --- a/.github/workflows/build-and-test-snapdragon.yml +++ b/.github/workflows/build-and-test-snapdragon.yml @@ -31,7 +31,7 @@ jobs: android-ndk-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3' + image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7' defaults: run: shell: bash @@ -61,7 +61,7 @@ jobs: linux-iot-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1' + image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7' defaults: run: shell: bash diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml index 5d88305a4f0..a05248e1298 100644 --- a/.github/workflows/build-android.yml +++ b/.github/workflows/build-android.yml @@ -27,12 +27,12 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: - android: + default: runs-on: ubuntu-latest steps: @@ -58,7 +58,7 @@ jobs: cd examples/llama.android ./gradlew build --no-daemon - android-ndk: + ndk: runs-on: ubuntu-latest container: image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3' @@ -73,6 +73,11 @@ jobs: fetch-depth: 0 lfs: false + - name: Dependencies + run: | + apt-get update + apt-get install -y build-essential + - name: Build id: ndk_build run: | @@ -86,3 +91,59 @@ jobs: with: name: llama-cpp-android-arm64-cpu path: pkg-adb/llama.cpp + + arm64: + runs-on: ubuntu-latest + + env: + NDK_VERSION: "29.0.14206865" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789) + # for some reason, the ccache does not improve the build time in this case + # example: + # cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831 + # cache on: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394 + # + #- name: ccache + # uses: ggml-org/ccache-action@v1.2.21 + # with: + # key: android-ubuntu-arm64 + # evict-old-files: 1d + # save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Set up JDK + uses: actions/setup-java@v5 + with: + java-version: 17 + distribution: temurin + + - name: Setup Android SDK + uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1 + with: + log-accepted-android-sdk-licenses: false + + - name: Install NDK + run: | + sdkmanager "ndk;${{ env.NDK_VERSION }}" + echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-28 \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ + -DGGML_RPC=ON + time cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build-apple.yml b/.github/workflows/build-apple.yml index b99e614666e..2b3d14d1f3c 100644 --- a/.github/workflows/build-apple.yml +++ b/.github/workflows/build-apple.yml @@ -32,12 +32,12 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: - macOS-latest-ios: + macos-latest-arm64: runs-on: macos-latest steps: @@ -48,7 +48,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: macOS-latest-ios + key: apple-arm64 evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -56,18 +56,58 @@ jobs: id: cmake_build run: | sysctl -a - cmake -B build -G Xcode \ + cmake -B build \ + -DCMAKE_BUILD_RPATH="@loader_path" \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + -DGGML_METAL_EMBED_LIBRARY=OFF \ + -DGGML_METAL_SHADER_DEBUG=ON \ + -DGGML_RPC=ON + time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1 + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main -E "test-llama-archs" --verbose --timeout 900 + + macos-latest-x64: + runs-on: macos-15-intel + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: apple-x64 + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build + id: cmake_build + run: | + sysctl -a + # Metal is disabled due to intermittent failures with Github runners not having a GPU: + # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + cmake -B build \ + -DCMAKE_BUILD_RPATH="@loader_path" \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_BUILD_BORINGSSL=ON \ + -DGGML_METAL=OFF \ + -DGGML_RPC=ON \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 + time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 macos-latest-ios-xcode: runs-on: macos-latest @@ -89,6 +129,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -115,7 +156,7 @@ jobs: xcodebuild -downloadPlatform iOS xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build - macOS-latest-tvos: + macos-latest-tvos: runs-on: macos-latest steps: @@ -123,10 +164,11 @@ jobs: id: checkout uses: actions/checkout@v6 + # TODO: this likely does not do anything - if yes, remove it - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: macOS-latest-tvos + key: apple-tvos evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -138,6 +180,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -147,7 +190,7 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - macOS-latest-visionos: + macos-latest-visionos: runs-on: macos-latest steps: @@ -155,6 +198,14 @@ jobs: id: checkout uses: actions/checkout@v6 + # TODO: this likely does not do anything - if yes, remove it + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: apple-visionos + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + - name: Build id: cmake_build run: | @@ -163,6 +214,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -172,7 +224,7 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - macOS-latest-swift: + macos-latest-swift: runs-on: macos-latest needs: macos-latest-ios-xcode @@ -185,10 +237,11 @@ jobs: id: checkout uses: actions/checkout@v6 + # TODO: this likely does not do anything - if yes, remove it - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: macOS-latest-swift + key: apple-swift evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} @@ -206,6 +259,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ diff --git a/.github/workflows/build-cache.yml b/.github/workflows/build-cache.yml index bc0a92c7fcf..53d65f3768b 100644 --- a/.github/workflows/build-cache.yml +++ b/.github/workflows/build-cache.yml @@ -28,7 +28,7 @@ jobs: id: cache-sdk with: path: ./vulkan_sdk - key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} + key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} - name: Setup Vulkan SDK if: steps.cache-sdk.outputs.cache-hit != 'true' @@ -54,7 +54,7 @@ jobs: # id: cache-toolchain # with: # path: ./spacemit_toolchain - # key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} + # key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} # - name: Setup SpacemiT Toolchain # if: steps.cache-toolchain.outputs.cache-hit != 'true' @@ -81,7 +81,7 @@ jobs: id: cache-openvino with: path: ./openvino_toolkit - key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} - name: Setup OpenVINO Toolkit if: steps.cache-openvino.outputs.cache-hit != 'true' @@ -108,7 +108,7 @@ jobs: id: cache-rocm with: path: C:\Program Files\AMD\ROCm - key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} + key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} - name: Setup ROCm if: steps.cache-rocm.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-cann.yml b/.github/workflows/build-cann.yml index d39b8763733..6d76ed49992 100644 --- a/.github/workflows/build-cann.yml +++ b/.github/workflows/build-cann.yml @@ -29,74 +29,76 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: - openEuler-latest-cann: - defaults: - run: - shell: bash -el {0} - strategy: - matrix: - arch: [x86, aarch64] - chip_type: ['910b', '310p'] - build: ['Release'] - use_acl_graph: ['on', 'off'] - exclude: - # 310P does not support USE_ACL_GRAPH=on - - chip_type: '310p' - use_acl_graph: 'on' - runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - steps: - - name: Checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Free up disk space - uses: ggml-org/free-disk-space@v1.3.1 - with: - tool-cache: true - - - name: Set container image - id: cann-image - run: | - image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}" - echo "image=${image}" >> "${GITHUB_OUTPUT}" - - - name: Pull container image - run: docker pull "${{ steps.cann-image.outputs.image }}" - - - name: Build - env: - BUILD_TYPE: ${{ matrix.build }} - SOC_TYPE: ascend${{ matrix.chip_type }} - USE_ACL_GRAPH: ${{ matrix.use_acl_graph }} - run: | - HOST_UID=$(id -u) - HOST_GID=$(id -g) - - docker run --rm \ - -v "${PWD}:/workspace" \ - -w /workspace \ - -e SOC_TYPE=${SOC_TYPE} \ - -e BUILD_TYPE=${BUILD_TYPE} \ - -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \ - "${{ steps.cann-image.outputs.image }}" \ - bash -lc ' - set -e - yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel - yum clean all && rm -rf /var/cache/yum - git config --global --add safe.directory "/workspace" - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - cmake -S . -B build \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${SOC_TYPE} \ - -DUSE_ACL_GRAPH=${USE_ACL_GRAPH} - cmake --build build -j $(nproc) - - chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build - ' +# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) +# in order to enable it again, we have to provision dedicated runners to run it +# openEuler-latest-cann: +# defaults: +# run: +# shell: bash -el {0} +# strategy: +# matrix: +# arch: [x86, aarch64] +# chip_type: ['910b', '310p'] +# build: ['Release'] +# use_acl_graph: ['on', 'off'] +# exclude: +# # 310P does not support USE_ACL_GRAPH=on +# - chip_type: '310p' +# use_acl_graph: 'on' +# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} +# steps: +# - name: Checkout +# uses: actions/checkout@v6 +# with: +# fetch-depth: 0 +# +# - name: Free up disk space +# uses: ggml-org/free-disk-space@v1.3.1 +# with: +# tool-cache: true +# +# - name: Set container image +# id: cann-image +# run: | +# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}" +# echo "image=${image}" >> "${GITHUB_OUTPUT}" +# +# - name: Pull container image +# run: docker pull "${{ steps.cann-image.outputs.image }}" +# +# - name: Build +# env: +# BUILD_TYPE: ${{ matrix.build }} +# SOC_TYPE: ascend${{ matrix.chip_type }} +# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }} +# run: | +# HOST_UID=$(id -u) +# HOST_GID=$(id -g) +# +# docker run --rm \ +# -v "${PWD}:/workspace" \ +# -w /workspace \ +# -e SOC_TYPE=${SOC_TYPE} \ +# -e BUILD_TYPE=${BUILD_TYPE} \ +# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \ +# "${{ steps.cann-image.outputs.image }}" \ +# bash -lc ' +# set -e +# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel +# yum clean all && rm -rf /var/cache/yum +# git config --global --add safe.directory "/workspace" +# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} +# cmake -S . -B build \ +# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ +# -DGGML_CANN=on \ +# -DSOC_TYPE=${SOC_TYPE} \ +# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH} +# cmake --build build -j $(nproc) +# +# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build +# ' diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml index 84cf8ddf48a..5becff09c1b 100644 --- a/.github/workflows/build-cmake-pkg.yml +++ b/.github/workflows/build-cmake-pkg.yml @@ -5,23 +5,23 @@ on: jobs: linux: - runs-on: ubuntu-slim + runs-on: [self-hosted, Linux, CPU] steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - - name: Install dependencies - run: | - sudo apt update - sudo apt install -y build-essential tcl cmake - - name: Build run: | PREFIX="$(pwd)"/inst - cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \ - -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release + cmake -S . -B build \ + -DCMAKE_PREFIX_PATH="$PREFIX" \ + -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_APP=OFF \ + -DCMAKE_BUILD_TYPE=Release cmake --build build --config Release cmake --install build --prefix "$PREFIX" --config Release diff --git a/.github/workflows/build-cpu.yml b/.github/workflows/build-cpu.yml new file mode 100644 index 00000000000..8f62e1a177c --- /dev/null +++ b/.github/workflows/build-cpu.yml @@ -0,0 +1,215 @@ +name: CI (cpu) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-cpu.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-cpu.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + +jobs: + build-cmake-pkg: + uses: ./.github/workflows/build-cmake-pkg.yml + + ubuntu: + strategy: + matrix: + include: + - build: 'x64' + os: ubuntu-22.04 + - build: 'arm64' + os: ubuntu-24.04-arm + + runs-on: ${{ matrix.os }} + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: cpu-${{ matrix.os }} + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build Dependencies + id: build_depends + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-wheel \ + libjpeg-dev build-essential libssl-dev \ + git-lfs + + - name: Toolchain workaround (GCC 14) + if: ${{ contains(matrix.os, 'ubuntu-24.04') }} + run: | + sudo apt-get install -y gcc-14 g++-14 + echo "CC=gcc-14" >> "$GITHUB_ENV" + echo "CXX=g++-14" >> "$GITHUB_ENV" + + - name: Python Dependencies + id: python_depends + run: | + export PIP_BREAK_SYSTEM_PACKAGES="1" + python3 -m pip install --upgrade pip setuptools + pip3 install ./gguf-py + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_RPC=ON + time cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + + - name: Test llama2c conversion + id: llama2c_test + run: | + cd build + echo "Fetch tokenizer" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin + echo "Fetch llama2c model" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin + ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + + windows: + runs-on: windows-2025 + + env: + OPENBLAS_VERSION: 0.3.23 + SDE_VERSION: 9.33.0-2024-01-07 + VULKAN_VERSION: 1.4.313.2 + + strategy: + matrix: + include: + - build: 'x64-cpu-static' + arch: 'x64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF' + - build: 'x64-openblas' + arch: 'x64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + - build: 'x64-vulkan' + arch: 'x64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' + - build: 'arm64' + arch: 'arm64' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: cpu-windows-2025-${{ matrix.build }} + variant: ccache + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Download OpenBLAS + id: get_openblas + if: ${{ matrix.build == 'x64-openblas' }} + run: | + curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" + curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" + mkdir $env:RUNNER_TEMP/openblas + tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas + $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) + $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) + $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') + & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll + + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'x64-vulkan' }} + run: | + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" + & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install + Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" + Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: Build + id: cmake_build + run: | + cmake -S . -B build ${{ matrix.defines }} ` + -DLLAMA_BUILD_BORINGSSL=ON + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} + + - name: Add libopenblas.dll + id: add_libopenblas_dll + if: ${{ matrix.build == 'x64-openblas' }} + run: | + cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll + cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt + + - name: Test + id: cmake_test + if: ${{ matrix.arch == 'x64' }} + run: | + cd build + ctest -L main -C Release --verbose --timeout 900 + + # TODO: disabled for now, consider adding tests for all CPU variants instead + # - name: Test (Intel SDE) + # id: cmake_test_sde + # if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation + # run: | + # curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" + # # for some weird reason windows tar doesn't like sde tar.xz + # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz + # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar + # $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) + # cd build + # $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 + # & $sde -future -- ctest -L main -C Release --verbose --timeout 900 diff --git a/.github/workflows/build-cross.yml b/.github/workflows/build-cross.yml index 97c7368387e..eef78b67417 100644 --- a/.github/workflows/build-cross.yml +++ b/.github/workflows/build-cross.yml @@ -277,7 +277,7 @@ jobs: env: # Make sure this is in sync with build-cache.yml - SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2" + SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4" steps: - uses: actions/checkout@v6 @@ -287,7 +287,7 @@ jobs: # id: cache-toolchain # with: # path: ./spacemit_toolchain - # key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} + # key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }} - name: Setup SpacemiT Toolchain #if: steps.cache-toolchain.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-cuda-ubuntu.yml b/.github/workflows/build-cuda-ubuntu.yml new file mode 100644 index 00000000000..6271b22cbd2 --- /dev/null +++ b/.github/workflows/build-cuda-ubuntu.yml @@ -0,0 +1,134 @@ +name: CI (CUDA, ubuntu) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-cuda-ubuntu.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-cuda-ubuntu.yml', + 'ggml/src/ggml-cuda/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + +jobs: + cuda: + runs-on: ubuntu-24.04 + container: nvidia/cuda:12.6.2-devel-ubuntu24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Install dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt update + apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: cuda-ubuntu-24.04-cuda + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build with CMake + # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project + run: | + cmake -S . -B build -G Ninja \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_ARCHITECTURES=89-real \ + -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ + -DGGML_NATIVE=OFF \ + -DGGML_CUDA=ON \ + -DGGML_CUDA_CUB_3DOT2=ON + cmake --build build + + hip: + runs-on: ubuntu-22.04 + container: rocm/dev-ubuntu-22.04:6.1.2 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: cuda-ubuntu-22.04-hip + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build with native CMake HIP support + id: cmake_build + run: | + cmake -B build -S . \ + -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ + -DGPU_TARGETS="gfx1030" \ + -DGGML_HIP=ON + cmake --build build --config Release -j $(nproc) + + musa: + runs-on: ubuntu-22.04 + container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Dependencies + id: depends + run: | + apt-get update + apt-get install -y build-essential git cmake libssl-dev + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: cuda-ubuntu-22.04-musa + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build with native CMake MUSA support + id: cmake_build + run: | + cmake -B build -S . \ + -DGGML_MUSA=ON + time cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build-cuda-windows.yml b/.github/workflows/build-cuda-windows.yml new file mode 100644 index 00000000000..e9e941421b6 --- /dev/null +++ b/.github/workflows/build-cuda-windows.yml @@ -0,0 +1,162 @@ +name: CI (CUDA, windows) + +# TODO: this workflow is only triggered manually because it is very heavy on the CI +# when we provision dedicated windows runners, we can enable it for pushes too +# note: running this workflow manually will populate the ccache for the release builds +# this can be used before merging a PR to speed up the release workflow +on: + workflow_dispatch: # allows manual triggering + +# note: this will run in queue with the release workflow +concurrency: + group: release + queue: max + +env: + GH_TOKEN: ${{ github.token }} + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + +jobs: + cuda: + runs-on: windows-2022 + + permissions: + actions: write + + strategy: + matrix: + cuda: ['12.4', '13.3'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-windows-2022-x64-cuda-${{ matrix.cuda }} + + - name: Install Cuda Toolkit + uses: ./.github/actions/windows-setup-cuda + with: + cuda_version: ${{ matrix.cuda }} + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: Build + id: cmake_build + shell: cmd + # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + cmake -S . -B build -G "Ninja Multi-Config" ^ + -DLLAMA_BUILD_SERVER=ON ^ + -DLLAMA_BUILD_BORINGSSL=ON ^ + -DGGML_NATIVE=OFF ^ + -DGGML_BACKEND_DL=ON ^ + -DGGML_CPU_ALL_VARIANTS=ON ^ + -DGGML_CUDA=ON ^ + -DGGML_RPC=ON ^ + -DGGML_CUDA_CUB_3DOT2=ON + set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 + cmake --build build --config Release -j %NINJA_JOBS% -t ggml + cmake --build build --config Release + + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-windows-2022-x64-cuda-${{ matrix.cuda }} + + hip: + runs-on: windows-2022 + + permissions: + actions: write + + env: + # Make sure this is in sync with build-cache.yml + HIPSDK_INSTALLER_VERSION: "26.Q1" + + strategy: + matrix: + include: + # sync with release.yml + - name: "radeon" + gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Grab rocWMMA package + id: grab_rocwmma + run: | + curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb" + 7z x rocwmma.deb + 7z x data.tar + + - name: Use ROCm Installation Cache + uses: actions/cache@v5 + id: cache-rocm + with: + path: C:\Program Files\AMD\ROCm + key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} + + - name: Setup ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' + uses: ./.github/actions/windows-setup-rocm + with: + version: ${{ env.HIPSDK_INSTALLER_VERSION }} + + - name: Verify ROCm + id: verify + run: | + # Find and test ROCm installation + $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 + if (-not $clangPath) { + Write-Error "ROCm installation not found" + exit 1 + } + & $clangPath.FullName --version + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + # TODO: this build does not match the build in release.yml, so we use a different cache key + # ideally, the builds should match, similar to the CUDA build above so that we would be able + # to populate the ccache for the release with manual runs of this workflow + #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }} + key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }} + + - name: Build + id: cmake_build + run: | + $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) + $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" + cmake -G "Unix Makefiles" -B build -S . ` + -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` + -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" ` + -DCMAKE_BUILD_TYPE=Release ` + -DLLAMA_BUILD_BORINGSSL=ON ` + -DROCM_DIR="${env:HIP_PATH}" ` + -DGGML_HIP=ON ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` + -DGPU_TARGETS="gfx1100" ` + -DGGML_RPC=ON + cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }} + key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }} diff --git a/.github/workflows/build-ibm.yml b/.github/workflows/build-ibm.yml new file mode 100644 index 00000000000..d2e4f3cdaeb --- /dev/null +++ b/.github/workflows/build-ibm.yml @@ -0,0 +1,150 @@ +name: CI (ibm) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-ibm.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-ibm.yml', + 'ggml/src/ggml-cpu/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + +jobs: + + ubuntu-24-s390x: + runs-on: ubuntu-24.04-s390x + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Build Dependencies + id: build_depends + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-wheel \ + libjpeg-dev build-essential libssl-dev \ + git-lfs + + - name: Toolchain workaround (GCC 14) + run: | + sudo apt-get install -y gcc-14 g++-14 + echo "CC=gcc-14" >> "$GITHUB_ENV" + echo "CXX=g++-14" >> "$GITHUB_ENV" + + - name: Python Dependencies + id: python_depends + run: | + export PIP_BREAK_SYSTEM_PACKAGES="1" + python3 -m pip install --upgrade pip setuptools + pip3 install ./gguf-py + + - name: Swap Endianness + id: endianness + run: | + for f in models/*.gguf; do + echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big + done + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_RPC=ON + time cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + + - name: Test llama2c (s390x) + id: llama2c_test_s390x + run: | + cd build + echo "Fetch llama2c big-endian model" + wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf + ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + + ubuntu-24-ppc64le: + runs-on: ubuntu-24.04-ppc64le + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Build Dependencies + id: build_depends + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-wheel \ + libjpeg-dev build-essential libssl-dev \ + git-lfs + + - name: Toolchain workaround (GCC 14) + run: | + sudo apt-get install -y gcc-14 g++-14 + echo "CC=gcc-14" >> "$GITHUB_ENV" + echo "CXX=g++-14" >> "$GITHUB_ENV" + + - name: Python Dependencies + id: python_depends + run: | + export PIP_BREAK_SYSTEM_PACKAGES="1" + python3 -m pip install --upgrade pip setuptools + pip3 install ./gguf-py + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_RPC=ON + time cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + + - name: Test llama2c conversion + id: llama2c_test + run: | + cd build + echo "Fetch tokenizer" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin + echo "Fetch llama2c model" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin + ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 diff --git a/.github/workflows/build-msys.yml b/.github/workflows/build-msys.yml index 57cec7c166c..c2633c151a5 100644 --- a/.github/workflows/build-msys.yml +++ b/.github/workflows/build-msys.yml @@ -15,9 +15,9 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: windows-msys2: @@ -37,7 +37,7 @@ jobs: #- name: ccache # uses: ggml-org/ccache-action@v1.2.16 # with: - # key: windows-msys2 + # key: msys-windows-2025-x64 # variant: ccache # evict-old-files: 1d # save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} diff --git a/.github/workflows/build-opencl.yml b/.github/workflows/build-opencl.yml new file mode 100644 index 00000000000..251b1f8d593 --- /dev/null +++ b/.github/workflows/build-opencl.yml @@ -0,0 +1,82 @@ +name: CI (opencl) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-opencl.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cl' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-opencl.yml', + 'ggml/src/ggml-opencl/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + +jobs: + windows-2025-opencl-adreno: + runs-on: windows-2025 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: opencl-windows-2025-x64 + variant: ccache + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: Install OpenCL Headers and Libs + id: install_opencl + run: | + git clone https://github.com/KhronosGroup/OpenCL-Headers + cd OpenCL-Headers + cmake -B build ` + -DBUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_TESTING=OFF ` + -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build build --target install + git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader + cd OpenCL-ICD-Loader + cmake -B build-arm64-release ` + -A arm64 ` + -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" + cmake --build build-arm64-release --target install --config release + + - name: Build + id: cmake_build + run: | + cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml index f7177f6be37..ddcbc669745 100644 --- a/.github/workflows/build-openvino.yml +++ b/.github/workflows/build-openvino.yml @@ -29,30 +29,18 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: ubuntu-24-openvino: - name: ubuntu-24-openvino-${{ matrix.openvino_device }} + runs-on: [self-hosted, Linux, Intel, OpenVINO] concurrency: - group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }} + group: openvino-gpu-${{ github.head_ref || github.ref }} cancel-in-progress: false - strategy: - matrix: - include: - - variant: cpu - runner: '"ubuntu-24.04"' - openvino_device: "CPU" - - variant: gpu - runner: '["self-hosted","Linux","Intel","OpenVINO"]' - openvino_device: "GPU" - - runs-on: ${{ fromJSON(matrix.runner) }} - env: # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile OPENVINO_VERSION_MAJOR: "2026.0" @@ -63,14 +51,6 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: ccache - if: runner.environment == 'github-hosted' - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1 - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - name: Dependencies id: depends run: | @@ -78,16 +58,7 @@ jobs: sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd - - name: Use OpenVINO Toolkit Cache - if: runner.environment == 'github-hosted' - uses: actions/cache@v5 - id: cache-openvino - with: - path: ./openvino_toolkit - key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} - - name: Setup OpenVINO Toolkit - if: steps.cache-openvino.outputs.cache-hit != 'true' uses: ./.github/actions/linux-setup-openvino with: path: ./openvino_toolkit @@ -109,12 +80,17 @@ jobs: -DGGML_OPENVINO=ON time cmake --build build/ReleaseOV --config Release -j $(nproc) - - name: Test - id: cmake_test + - name: Test (CPU) + id: cmake_test_cpu + # TODO: fix and re-enable the `test-llama-archs` test below + run: | + cd ${{ github.workspace }} + ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000 + + - name: Test (GPU) + id: cmake_test_gpu # TODO: fix and re-enable the `test-llama-archs` test below run: | cd ${{ github.workspace }} - if [ "${{ matrix.openvino_device }}" = "GPU" ]; then - export GGML_OPENVINO_DEVICE=GPU - fi + export GGML_OPENVINO_DEVICE=GPU ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000 diff --git a/.github/workflows/build-riscv.yml b/.github/workflows/build-riscv.yml index b78b13140e5..70615378b5e 100644 --- a/.github/workflows/build-riscv.yml +++ b/.github/workflows/build-riscv.yml @@ -29,11 +29,84 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: + ubuntu-cpu-riscv64-native: + runs-on: ubuntu-24.04-riscv + + steps: + - name: Install dependencies + run: | + # Install necessary packages + sudo apt-get update + sudo apt-get install -y libssl-dev + + # Set gcc-14 and g++-14 as the default compilers + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 + + git lfs install + + - name: Check environment + run: | + uname -a + gcc --version + g++ --version + ldd --version + cmake --version + rustc --version + env + echo "nproc=$(nproc)" + + - name: Clone + id: checkout + uses: actions/checkout@v6 + + # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation + #- name: ccache + # uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1 + # with: + # key: riscv-ubuntu-native + # evict-old-files: 1d + # save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=ON \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DGGML_RPC=ON \ + -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 + + time cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + + - name: Test llama2c conversion + id: llama2c_test + run: | + cd build + echo "Fetch tokenizer" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin + echo "Fetch llama2c model" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin + ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + ubuntu-riscv64-native-sanitizer: runs-on: ubuntu-24.04-riscv @@ -62,12 +135,13 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: ccache - uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1 - with: - key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }} - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation + #- name: ccache + # uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1 + # with: + # key: riscv-ubuntu-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }} + # evict-old-files: 1d + # save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build diff --git a/.github/workflows/build-rpc.yml b/.github/workflows/build-rpc.yml new file mode 100644 index 00000000000..d04dc375b5c --- /dev/null +++ b/.github/workflows/build-rpc.yml @@ -0,0 +1,66 @@ +name: CI (rpc) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-rpc.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-rpc.yml', + 'ggml/src/ggml-rpc/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + +jobs: + ubuntu-24-rpc: + runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} + + continue-on-error: true + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential libssl-dev ninja-build + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -G "Ninja" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_RPC=ON + time cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose diff --git a/.github/workflows/build-sanitize.yml b/.github/workflows/build-sanitize.yml index c7b73d1dd0d..e242abcfd3c 100644 --- a/.github/workflows/build-sanitize.yml +++ b/.github/workflows/build-sanitize.yml @@ -22,66 +22,65 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: - ubuntu-latest-sanitizer: - runs-on: ubuntu-latest + ctest: + runs-on: [self-hosted, X64, CPU, Linux] continue-on-error: true strategy: matrix: sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] steps: - name: Clone id: checkout uses: actions/checkout@v6 - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }} - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends + # with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings + - name: Build (undefined) + id: cmake_build_undefined + if: ${{ matrix.sanitizer == 'UNDEFINED' }} run: | - sudo apt-get update - sudo apt-get install build-essential libssl-dev + cmake -B build \ + -DCMAKE_BUILD_TYPE=Debug \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON + + cmake --build build --config Debug -j $(nproc) - name: Build id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} + if: ${{ matrix.sanitizer == 'ADDRESS' }} run: | cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) + cmake --build build --config RelWithDebInfo -j $(nproc) - name: Build (no OpenMP) id: cmake_build_no_openmp if: ${{ matrix.sanitizer == 'THREAD' }} run: | cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DGGML_OPENMP=OFF - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) + cmake --build build --config RelWithDebInfo -j $(nproc) - name: Test id: cmake_test + # skip run in Debug - very slow + if: ${{ matrix.sanitizer != 'UNDEFINED' }} run: | cd build - ctest -L main --verbose --timeout 900 + ctest -L main -E tokenizer --verbose --timeout 900 diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index ee8fd41d7c7..436100c8a4c 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -50,27 +50,12 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: - determine-tag: - name: Determine tag name - runs-on: ubuntu-slim - outputs: - tag_name: ${{ steps.tag.outputs.name }} - steps: - - name: Clone - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - ggml-ci-nvidia-cuda: - needs: determine-tag + gpu-cuda: runs-on: [self-hosted, Linux, NVIDIA] steps: @@ -80,14 +65,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | nvidia-smi - GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-nvidia-vulkan-cm: - needs: determine-tag + gpu-vulkan-nvidia-cm: runs-on: [self-hosted, Linux, NVIDIA] steps: @@ -97,14 +79,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary - GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-nvidia-vulkan-cm2: - needs: determine-tag + gpu-vulkan-nvidia-cm2: runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2] steps: @@ -114,14 +93,12 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary - GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-nvidia-webgpu: - runs-on: [self-hosted, Linux, NVIDIA] + gpu-webgpu-nvidia: + runs-on: [self-hosted, Linux, NVIDIA, X64] steps: - name: Clone @@ -147,10 +124,10 @@ jobs: GG_BUILD_WEBGPU=1 \ GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \ - bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp # TODO: provision AMX-compatible machine - #ggml-ci-cpu-amx: + #cpu-amx: # runs-on: [self-hosted, Linux, CPU, AMX] # steps: @@ -161,10 +138,10 @@ jobs: # - name: Test # id: ggml-ci # run: | - # bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp # TODO: provision AMD GPU machine - # ggml-ci-amd-vulkan: + # amd-vulkan: # runs-on: [self-hosted, Linux, AMD] # steps: @@ -176,10 +153,10 @@ jobs: # id: ggml-ci # run: | # vulkaninfo --summary - # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp # TODO: provision AMD GPU machine - # ggml-ci-amd-rocm: + # amd-rocm: # runs-on: [self-hosted, Linux, AMD] # steps: @@ -191,10 +168,9 @@ jobs: # id: ggml-ci # run: | # amd-smi static - # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-mac-metal: - needs: determine-tag + gpu-metal: runs-on: [self-hosted, macOS, ARM64] steps: @@ -204,13 +180,10 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-mac-webgpu: - needs: determine-tag + gpu-webgpu-apple: runs-on: [self-hosted, macOS, ARM64] steps: @@ -233,14 +206,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-mac-vulkan: - needs: determine-tag + gpu-vulkan-apple: runs-on: [self-hosted, macOS, ARM64] steps: @@ -250,14 +220,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-linux-intel-vulkan: - needs: determine-tag + gpu-vulkan-intel-linux: runs-on: [self-hosted, Linux, Intel] steps: @@ -269,14 +236,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp - ggml-ci-win-intel-vulkan: - needs: determine-tag + gpu-vulkan-intel-windows: runs-on: [self-hosted, Windows, X64, Intel] steps: @@ -291,15 +255,13 @@ jobs: MSYSTEM: UCRT64 CHERE_INVOKING: 1 PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }} - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create # a valid python environment for testing LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp - ggml-ci-intel-openvino-gpu-low-perf: - needs: determine-tag + gpu-openvino-low-perf: runs-on: [self-hosted, Linux, Intel, OpenVINO] concurrency: @@ -331,8 +293,99 @@ jobs: - name: Test id: ggml-ci - env: - HF_WEBUI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | source ./openvino_toolkit/setupvars.sh - GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + cpu-x64-high-perf: + runs-on: [self-hosted, Linux, X64] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + cpu-arm64-high-perf-graviton4: + runs-on: ah-ubuntu_22_04-c8g_8x + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Dependencies + id: depends + run: | + set -euxo pipefail + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ + apt-get install -y \ + build-essential \ + python3-venv \ + gpg \ + wget \ + time \ + git-lfs + + git lfs install + + # install the latest cmake + sudo install -d /usr/share/keyrings + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor \ + | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \ + | sudo tee /etc/apt/sources.list.d/kitware.list + sudo apt-get update + sudo apt-get install -y cmake + + - name: Test + id: ggml-ci + run: | + LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + cpu-arm64-graviton4-kleidiai: + runs-on: ah-ubuntu_22_04-c8g_8x + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Dependencies + id: depends + run: | + set -euxo pipefail + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ + apt-get install -y \ + build-essential \ + python3-venv \ + gpg \ + wget \ + time \ + git-lfs + + git lfs install + + # install the latest cmake + sudo install -d /usr/share/keyrings + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor \ + | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \ + | sudo tee /etc/apt/sources.list.d/kitware.list + sudo apt-get update + sudo apt-get install -y cmake + + - name: Test + id: ggml-ci + run: | + GG_BUILD_KLEIDIAI=1 \ + GG_BUILD_EXTRA_TESTS_0=1 \ + bash ./ci/run.sh ./tmp/results ./tmp/mnt diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml index 09635f64edb..ef377c8186f 100644 --- a/.github/workflows/build-sycl.yml +++ b/.github/workflows/build-sycl.yml @@ -29,132 +29,134 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: - ubuntu-24-sycl: - strategy: - matrix: - build: [fp32, fp16] - include: - - build: fp32 - fp16: OFF - - build: fp16 - fp16: ON - - runs-on: ubuntu-24.04 - - env: - ONEAPI_ROOT: /opt/intel/oneapi/ - ONEAPI_INSTALLER_VERSION: "2025.3.3" - LEVEL_ZERO_VERSION: "1.28.2" - LEVEL_ZERO_UBUNTU_VERSION: "u24.04" - - continue-on-error: true - - steps: - - uses: actions/checkout@v6 - - - name: Use oneAPI Installation Cache - uses: actions/cache@v5 - id: cache-sycl - with: - path: ${{ env.ONEAPI_ROOT }} - key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} - - - name: Download & Install oneAPI - shell: bash - if: steps.cache-sycl.outputs.cache-hit != 'true' - run: | - cd /tmp - wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh - sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept - - - name: Install Level Zero SDK - shell: bash - run: | - cd /tmp - wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb - wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb - sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-24-sycl-${{ matrix.build }} - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -G "Ninja" \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DLLAMA_OPENSSL=OFF \ - -DGGML_NATIVE=OFF \ - -DGGML_SYCL_F16=${{ matrix.fp16 }} - time cmake --build build --config Release -j $(nproc) - - windows-latest-sycl: - runs-on: windows-2022 - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - ONEAPI_INSTALLER_VERSION: "2025.3.3" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Use oneAPI Installation Cache - uses: actions/cache@v5 - id: cache-sycl - with: - path: ${{ env.ONEAPI_ROOT }} - key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} - - - name: Download & Install oneAPI - shell: bash - if: steps.cache-sycl.outputs.cache-hit != 'true' - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - - name: Install Level Zero SDK - shell: pwsh - run: | - Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" - Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force - "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: windows-latest-sycl - variant: ccache - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - - name: Build - id: cmake_build - run: examples/sycl/win-build-sycl.bat +# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) +# in order to enable it again, we have to provision dedicated runners to run it +# ubuntu-24-sycl: +# strategy: +# matrix: +# build: [fp32] +# include: +# - build: fp32 +# fp16: OFF +# +# runs-on: ubuntu-24.04 +# +# env: +# ONEAPI_ROOT: /opt/intel/oneapi/ +# ONEAPI_INSTALLER_VERSION: "2025.3.3" +# LEVEL_ZERO_VERSION: "1.28.2" +# LEVEL_ZERO_UBUNTU_VERSION: "u24.04" +# +# continue-on-error: true +# +# steps: +# - uses: actions/checkout@v6 +# +# - name: Use oneAPI Installation Cache +# uses: actions/cache@v5 +# id: cache-sycl +# with: +# path: ${{ env.ONEAPI_ROOT }} +# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} +# +# - name: Download & Install oneAPI +# shell: bash +# if: steps.cache-sycl.outputs.cache-hit != 'true' +# run: | +# cd /tmp +# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh +# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept +# +# - name: Install Level Zero SDK +# shell: bash +# run: | +# cd /tmp +# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb +# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb +# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb +# +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# +# - name: ccache +# uses: ggml-org/ccache-action@v1.2.21 +# with: +# key: sycl-ubuntu-24-${{ matrix.build }} +# evict-old-files: 1d +# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} +# +# - name: Build +# id: cmake_build +# run: | +# source /opt/intel/oneapi/setvars.sh +# cmake -B build \ +# -G "Ninja" \ +# -DCMAKE_BUILD_TYPE=Release \ +# -DGGML_SYCL=ON \ +# -DCMAKE_C_COMPILER=icx \ +# -DCMAKE_CXX_COMPILER=icpx \ +# -DLLAMA_OPENSSL=OFF \ +# -DGGML_NATIVE=OFF \ +# -DGGML_SYCL_F16=${{ matrix.fp16 }} +# time cmake --build build --config Release -j $(nproc) + +# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) +# in order to enable it again, we have to provision dedicated runners to run it +# windows-latest-sycl: +# runs-on: windows-2022 +# +# defaults: +# run: +# shell: bash +# +# env: +# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe +# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel +# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip +# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" +# ONEAPI_INSTALLER_VERSION: "2025.3.3" +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# +# - name: Use oneAPI Installation Cache +# uses: actions/cache@v5 +# id: cache-sycl +# with: +# path: ${{ env.ONEAPI_ROOT }} +# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} +# +# - name: Download & Install oneAPI +# shell: bash +# if: steps.cache-sycl.outputs.cache-hit != 'true' +# run: | +# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL +# +# - name: Install Level Zero SDK +# shell: pwsh +# run: | +# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" +# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force +# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append +# +# - name: ccache +# uses: ggml-org/ccache-action@v1.2.21 +# with: +# key: sycl-windows-latest +# variant: ccache +# evict-old-files: 1d +# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} +# +# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args +# +# - name: Build +# id: cmake_build +# run: examples/sycl/win-build-sycl.bat diff --git a/.github/workflows/build-vulkan.yml b/.github/workflows/build-vulkan.yml index ab32b6525ba..a103c50faf7 100644 --- a/.github/workflows/build-vulkan.yml +++ b/.github/workflows/build-vulkan.yml @@ -31,26 +31,56 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: - ubuntu-24-vulkan-llvmpipe: - runs-on: ubuntu-24.04 + ubuntu-arm64: + runs-on: ubuntu-24.04-arm steps: - name: Clone id: checkout uses: actions/checkout@v6 + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build + echo "CC=gcc-14" >> "$GITHUB_ENV" + echo "CXX=g++-14" >> "$GITHUB_ENV" + - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: ubuntu-24-vulkan-llvmpipe + key: vulkan-ubuntu-24.04-arm-new + variant: ccache evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + - name: Configure + id: cmake_configure + run: | + cmake -B build \ + -G "Ninja" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_VULKAN=ON + + - name: Build + id: cmake_build + run: | + time cmake --build build -j $(nproc) + + ubuntu-llvmpipe: + runs-on: ubuntu-24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + - name: Dependencies id: depends run: | @@ -68,7 +98,7 @@ jobs: id: cache-sdk with: path: ./vulkan_sdk - key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} + key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} - name: Setup Vulkan SDK if: steps.cache-sdk.outputs.cache-hit != 'true' @@ -77,6 +107,13 @@ jobs: path: ./vulkan_sdk version: ${{ env.VULKAN_SDK_VERSION }} + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: vulkan-ubuntu-24.04-llvmpipe + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + - name: Build id: cmake_build run: | diff --git a/.github/workflows/build-webgpu.yml b/.github/workflows/build-webgpu.yml new file mode 100644 index 00000000000..bade95c6ab1 --- /dev/null +++ b/.github/workflows/build-webgpu.yml @@ -0,0 +1,173 @@ +name: CI (webgpu) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-webgpu.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.wgsl' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-webgpu.yml', + 'ggml/src/ggml-webgpu/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + +jobs: + macos: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: webgpu-macos-latest + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Dawn Dependency + id: dawn-depends + run: | + DAWN_VERSION="v20260317.182325" + DAWN_OWNER="google" + DAWN_REPO="dawn" + DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release" + echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + curl -L -o artifact.tar.gz \ + "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + mkdir dawn + tar -xvf artifact.tar.gz -C dawn --strip-components=1 + + - name: Build + id: cmake_build + run: | + export CMAKE_PREFIX_PATH=dawn + cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF + time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + + ubuntu: + runs-on: ubuntu-24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: webgpu-ubuntu-24.04 + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Dependencies + id: depends + run: | + sudo add-apt-repository -y ppa:kisak/kisak-mesa + sudo apt-get update -y + sudo apt-get install -y build-essential mesa-vulkan-drivers \ + libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev + + - name: Dawn Dependency + id: dawn-depends + run: | + sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev + DAWN_VERSION="v20260317.182325" + DAWN_OWNER="google" + DAWN_REPO="dawn" + DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release" + echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + curl -L -o artifact.tar.gz \ + "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" + mkdir dawn + tar -xvf artifact.tar.gz -C dawn --strip-components=1 + + - name: Build + id: cmake_build + run: | + export Dawn_DIR=dawn/lib64/cmake/Dawn + cmake -B build \ + -DGGML_WEBGPU=ON + time cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + # This is using llvmpipe and runs slower than other backends + # test-backend-ops is too slow on llvmpipe, skip it + ctest -L main -E test-backend-ops --verbose --timeout 900 + + ubuntu-wasm: + runs-on: ubuntu-24.04-arm + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: webgpu-ubuntu-24.04-arm-wasm + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Install Emscripten + run: | + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + ./emsdk install latest + ./emsdk activate latest + + - name: Fetch emdawnwebgpu + run: | + DAWN_TAG="v20260317.182325" + EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip" + echo "Downloading ${EMDAWN_PKG}" + curl -L -o emdawn.zip \ + "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}" + unzip emdawn.zip + + - name: Build WASM WebGPU + run: | + source emsdk/emsdk_env.sh + emcmake cmake -B build-wasm \ + -G "Ninja" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_WEBGPU=ON \ + -DLLAMA_OPENSSL=OFF \ + -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg + + time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 65fa24f4468..00000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,1110 +0,0 @@ -name: CI - -on: - workflow_dispatch: # allows manual triggering - push: - branches: - - master - paths: [ - '.github/workflows/build.yml', - '.github/workflows/build-cmake-pkg.yml', - '**/CMakeLists.txt', - '**/.cmake', - '**/*.h', - '**/*.hpp', - '**/*.c', - '**/*.cpp', - '**/*.cu', - '**/*.cuh', - '**/*.swift', - '**/*.m', - '**/*.metal', - '**/*.comp', - '**/*.glsl', - '**/*.wgsl' - ] - - pull_request: - types: [opened, synchronize, reopened] - paths: [ - '.github/workflows/build.yml', - '.github/workflows/build-cmake-pkg.yml', - '**/CMakeLists.txt', - '**/.cmake', - '**/*.h', - '**/*.hpp', - '**/*.c', - '**/*.cpp', - '**/*.cu', - '**/*.cuh', - '**/*.swift', - '**/*.m', - '**/*.metal', - '**/*.comp', - '**/*.glsl', - '**/*.wgsl' - ] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -env: - GGML_NLOOP: 3 - GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - -jobs: - build-cmake-pkg: - uses: ./.github/workflows/build-cmake-pkg.yml - - macOS-latest-arm64: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: macOS-latest-arm64 - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_BUILD_BORINGSSL=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=OFF \ - -DGGML_METAL_SHADER_DEBUG=ON \ - -DGGML_RPC=ON - time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1 - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main -E "test-llama-archs" --verbose --timeout 900 - - macOS-latest-x64: - runs-on: macos-15-intel - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: macOS-latest-x64 - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_BUILD_BORINGSSL=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3 - time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - macOS-latest-arm64-webgpu: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: macOS-latest-arm64-webgpu - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dawn Dependency - id: dawn-depends - run: | - DAWN_VERSION="v20260317.182325" - DAWN_OWNER="google" - DAWN_REPO="dawn" - DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release" - echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" - curl -L -o artifact.tar.gz \ - "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" - mkdir dawn - tar -xvf artifact.tar.gz -C dawn --strip-components=1 - - - name: Build - id: cmake_build - run: | - export CMAKE_PREFIX_PATH=dawn - cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF - time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-cpu: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-22.04 - - build: 'arm64' - os: ubuntu-24.04-arm - - build: 's390x' - os: ubuntu-24.04-s390x - - build: 'ppc64le' - os: ubuntu-24.04-ppc64le - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - if: ${{ matrix.build != 's390x' && matrix.build != 'ppc64le' }} - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-cpu-${{ matrix.build }} - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build Dependencies - id: build_depends - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - python3 python3-pip python3-dev python3-wheel \ - libjpeg-dev build-essential libssl-dev \ - git-lfs - - - name: Toolchain workaround (GCC 14) - if: ${{ contains(matrix.os, 'ubuntu-24.04') }} - run: | - sudo apt-get install -y gcc-14 g++-14 - echo "CC=gcc-14" >> "$GITHUB_ENV" - echo "CXX=g++-14" >> "$GITHUB_ENV" - - - name: Python Dependencies - id: python_depends - run: | - export PIP_BREAK_SYSTEM_PACKAGES="1" - python3 -m pip install --upgrade pip setuptools - pip3 install ./gguf-py - - - name: Swap Endianness - id: endianness - if: ${{ matrix.build == 's390x' }} - run: | - for f in models/*.gguf; do - echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big - done - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_RPC=ON - time cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - if: ${{ matrix.build != 's390x' }} - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - - name: Test llama2c (s390x) - id: llama2c_test_s390x - if: ${{ matrix.build == 's390x' }} - run: | - cd build - echo "Fetch llama2c big-endian model" - wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf - ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - android-arm64: - runs-on: ubuntu-latest - - env: - NDK_VERSION: "29.0.14206865" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: android-arm64 - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Set up JDK - uses: actions/setup-java@v5 - with: - java-version: 17 - distribution: temurin - - - name: Setup Android SDK - uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1 - with: - log-accepted-android-sdk-licenses: false - - - name: Install NDK - run: | - sdkmanager "ndk;${{ env.NDK_VERSION }}" - echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-28 \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_BACKEND_DL=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_BORINGSSL=ON \ - -DGGML_RPC=ON - time cmake --build build --config Release -j $(nproc) - - ubuntu-latest-rpc: - runs-on: ubuntu-latest - - continue-on-error: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libssl-dev ninja-build - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -G "Ninja" \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_RPC=ON - time cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ubuntu-24-vulkan: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-24.04 - - build: 'arm64' - os: ubuntu-24.04-arm - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build - echo "CC=gcc-14" >> "$GITHUB_ENV" - echo "CXX=g++-14" >> "$GITHUB_ENV" - - - name: Configure - id: cmake_configure - run: | - cmake -B build \ - -G "Ninja" \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DGGML_BACKEND_DL=ON \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DGGML_VULKAN=ON - - - name: Build - id: cmake_build - run: | - time cmake --build build -j $(nproc) - - ubuntu-24-webgpu: - runs-on: ubuntu-24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-24-webgpu - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo add-apt-repository -y ppa:kisak/kisak-mesa - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers \ - libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev - - - name: Get latest Vulkan SDK version - id: vulkan_sdk_version - run: | - echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV" - - - name: Use Vulkan SDK Cache - uses: actions/cache@v5 - id: cache-sdk - with: - path: ./vulkan_sdk - key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} - - - name: Setup Vulkan SDK - if: steps.cache-sdk.outputs.cache-hit != 'true' - uses: ./.github/actions/linux-setup-vulkan - with: - path: ./vulkan_sdk - version: ${{ env.VULKAN_SDK_VERSION }} - - - name: Dawn Dependency - id: dawn-depends - run: | - sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev - DAWN_VERSION="v20260317.182325" - DAWN_OWNER="google" - DAWN_REPO="dawn" - DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release" - echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" - curl -L -o artifact.tar.gz \ - "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz" - mkdir dawn - tar -xvf artifact.tar.gz -C dawn --strip-components=1 - - - name: Build - id: cmake_build - run: | - export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build \ - -DGGML_WEBGPU=ON - time cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - # This is using llvmpipe and runs slower than other backends - # test-backend-ops is too slow on llvmpipe, skip it - ctest -L main -E test-backend-ops --verbose --timeout 900 - - ubuntu-24-webgpu-wasm: - runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Install Emscripten - run: | - git clone https://github.com/emscripten-core/emsdk.git - cd emsdk - ./emsdk install latest - ./emsdk activate latest - - - name: Fetch emdawnwebgpu - run: | - DAWN_TAG="v20260317.182325" - EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip" - echo "Downloading ${EMDAWN_PKG}" - curl -L -o emdawn.zip \ - "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}" - unzip emdawn.zip - - - name: Build WASM WebGPU - run: | - source emsdk/emsdk_env.sh - emcmake cmake -B build-wasm \ - -G "Ninja" \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_WEBGPU=ON \ - -DLLAMA_OPENSSL=OFF \ - -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg - - time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc) - - ubuntu-22-hip: - runs-on: ubuntu-22.04 - container: rocm/dev-ubuntu-22.04:6.1.2 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-22-hip - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build with native CMake HIP support - id: cmake_build - run: | - cmake -B build -S . \ - -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGPU_TARGETS="gfx1030" \ - -DGGML_HIP=ON - cmake --build build --config Release -j $(nproc) - - ubuntu-22-musa: - runs-on: ubuntu-22.04 - container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Dependencies - id: depends - run: | - apt-get update - apt-get install -y build-essential git cmake libssl-dev - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-22-musa - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build with native CMake MUSA support - id: cmake_build - run: | - cmake -B build -S . \ - -DGGML_MUSA=ON - time cmake --build build --config Release -j $(nproc) - - - windows-latest: - runs-on: windows-2025 - - env: - OPENBLAS_VERSION: 0.3.23 - SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.4.313.2 - - strategy: - matrix: - include: - - build: 'cpu-x64 (static)' - arch: 'x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF' - - build: 'openblas-x64' - arch: 'x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'vulkan-x64' - arch: 'x64' - defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - - build: 'llvm-arm64' - arch: 'arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'llvm-arm64-opencl-adreno' - arch: 'arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: windows-latest-${{ matrix.build }} - variant: ccache - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.build == 'vulkan-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} - run: | - git clone https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build-arm64-release --target install --config release - - - name: Build - id: cmake_build - run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DLLAMA_BUILD_BORINGSSL=ON - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Test - id: cmake_test - if: ${{ matrix.arch == 'x64' }} - run: | - cd build - ctest -L main -C Release --verbose --timeout 900 - - # TODO: disabled for now, consider adding tests for all CPU variants instead - # - name: Test (Intel SDE) - # id: cmake_test_sde - # if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation - # run: | - # curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" - # # for some weird reason windows tar doesn't like sde tar.xz - # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz - # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar - # $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) - # cd build - # $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 - # & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - - ubuntu-latest-cuda: - runs-on: ubuntu-latest - container: nvidia/cuda:12.6.2-devel-ubuntu24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Install dependencies - env: - DEBIAN_FRONTEND: noninteractive - run: | - apt update - apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-latest-cuda - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build with CMake - # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project - run: | - cmake -S . -B build -G Ninja \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES=89-real \ - -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DGGML_NATIVE=OFF \ - -DGGML_CUDA=ON \ - -DGGML_CUDA_CUB_3DOT2=ON - cmake --build build - - windows-2022-cuda: - runs-on: windows-2022 - - strategy: - matrix: - cuda: ['12.4'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Install ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: windows-cuda-${{ matrix.cuda }} - variant: ccache - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Install Cuda Toolkit - uses: ./.github/actions/windows-setup-cuda - with: - cuda_version: ${{ matrix.cuda }} - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Build - id: cmake_build - shell: cmd - # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project - run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DLLAMA_BUILD_SERVER=ON ^ - -DLLAMA_BUILD_BORINGSSL=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=ON ^ - -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DGGML_CUDA_CUB_3DOT2=ON - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - - windows-latest-hip: - runs-on: windows-2022 - - env: - # Make sure this is in sync with build-cache.yml - HIPSDK_INSTALLER_VERSION: "26.Q1" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Grab rocWMMA package - id: grab_rocwmma - run: | - curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb" - 7z x rocwmma.deb - 7z x data.tar - - - name: Use ROCm Installation Cache - uses: actions/cache@v5 - id: cache-rocm - with: - path: C:\Program Files\AMD\ROCm - key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} - - - name: Setup ROCm - if: steps.cache-rocm.outputs.cache-hit != 'true' - uses: ./.github/actions/windows-setup-rocm - with: - version: ${{ env.HIPSDK_INSTALLER_VERSION }} - - - name: Verify ROCm - id: verify - run: | - # Find and test ROCm installation - $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 - if (-not $clangPath) { - Write-Error "ROCm installation not found" - exit 1 - } - & $clangPath.FullName --version - - - name: Install ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ${{ github.job }} - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . ` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" ` - -DCMAKE_BUILD_TYPE=Release ` - -DLLAMA_BUILD_BORINGSSL=ON ` - -DROCM_DIR="${env:HIP_PATH}" ` - -DGGML_HIP=ON ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGPU_TARGETS="gfx1100" ` - -DGGML_RPC=ON - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - - ubuntu-cpu-riscv64-native: - runs-on: ubuntu-24.04-riscv - - steps: - - name: Install dependencies - run: | - # Install necessary packages - sudo apt-get update - sudo apt-get install -y libssl-dev - - # Set gcc-14 and g++-14 as the default compilers - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 - - git lfs install - - - name: Check environment - run: | - uname -a - gcc --version - g++ --version - ldd --version - cmake --version - rustc --version - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1 - with: - key: ubuntu-cpu-riscv64-native - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=ON \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DGGML_RPC=ON \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 - - time cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - -# TODO: simplify the following workflows using a matrix -# TODO: run lighter CI on PRs and the full CI only on master (if needed) - ggml-ci-x64-cpu-low-perf: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-x64-cpu-low-perf - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Test - id: ggml-ci - run: | - LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - - ggml-ci-arm64-cpu-low-perf: - runs-on: ubuntu-22.04-arm - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-arm64-cpu-low-perf - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Test - id: ggml-ci - run: | - LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - - ggml-ci-x64-cpu-high-perf: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-x64-cpu-high-perf - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Test - id: ggml-ci - run: | - LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - - ggml-ci-arm64-cpu-high-perf: - runs-on: ubuntu-22.04-arm - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-arm64-cpu-high-perf - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Test - id: ggml-ci - run: | - LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - - ggml-ci-arm64-cpu-high-perf-sve: - runs-on: ubuntu-22.04-arm - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-arm64-cpu-high-perf-sve - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Test - id: ggml-ci - run: | - LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - - ggml-ci-arm64-cpu-kleidiai: - runs-on: ubuntu-22.04-arm - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-arm64-cpu-kleidiai - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential - - - name: Test - id: ggml-ci - run: | - GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - - ggml-ci-arm64-cpu-kleidiai-graviton4: - runs-on: ah-ubuntu_22_04-c8g_8x - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Dependencies - id: depends - run: | - set -euxo pipefail - sudo apt-get update - sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ - apt-get install -y \ - build-essential \ - python3-venv \ - gpg \ - wget \ - time \ - git-lfs - - git lfs install - - # install the latest cmake - sudo install -d /usr/share/keyrings - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \ - | gpg --dearmor \ - | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null - echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \ - | sudo tee /etc/apt/sources.list.d/kitware.list - sudo apt-get update - sudo apt-get install -y cmake - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-arm64-cpu-kleidiai-graviton4 - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Test - id: ggml-ci - run: | - GG_BUILD_KLEIDIAI=1 \ - GG_BUILD_EXTRA_TESTS_0=1 \ - bash ./ci/run.sh ./tmp/results ./tmp/mnt diff --git a/.github/workflows/check-vendor.yml b/.github/workflows/check-vendor.yml index 1671ed7b8bd..015629f380c 100644 --- a/.github/workflows/check-vendor.yml +++ b/.github/workflows/check-vendor.yml @@ -19,7 +19,7 @@ on: jobs: check-vendor: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - name: Checkout diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index c88396c0a7d..50b598b84dd 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -15,7 +15,7 @@ concurrency: jobs: model-naming: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - uses: actions/checkout@v6 - name: Check model naming conventions diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a5bae7141fe..6f1f2721e45 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -11,6 +11,11 @@ name: Publish Docker image on: workflow_dispatch: # allows manual triggering + inputs: + skip_s390x: + description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)" + type: boolean + default: false schedule: # Rebuild daily rather than on every push because it is expensive - cron: '12 4 * * *' @@ -64,6 +69,8 @@ jobs: - name: Generate build and merge matrices id: matrices shell: bash + env: + SKIP_S390X: ${{ inputs.skip_s390x || 'false' }} run: | set -euo pipefail @@ -86,6 +93,11 @@ jobs: ] JSON + if [ "${SKIP_S390X}" = "true" ]; then + jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp + mv build-matrix.json.tmp build-matrix.json + fi + BUILD_MATRIX="$(jq -c . build-matrix.json)" MERGE_MATRIX="$(jq -c ' reduce .[] as $entry ({}; .[$entry.tag] |= ( @@ -132,6 +144,7 @@ jobs: config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }} steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 @@ -187,6 +200,10 @@ jobs: env: GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} uses: ggml-org/free-disk-space@v1.3.1 @@ -211,13 +228,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: full provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -235,13 +265,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: light provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -259,13 +302,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: server provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -330,10 +386,15 @@ jobs: steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Download digest metadata uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: @@ -361,6 +422,8 @@ jobs: IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}" PREFIX="${IMAGE_REPO}:" SRC_TAG="${{ needs.create_tag.outputs.source_tag }}" + BUILD_DATE="${{ steps.build_date.outputs.date }}" + COMMIT_SHA="${{ steps.checkout.outputs.commit }}" TAGS="${{ matrix.config.tag }}" ARCHES="${{ matrix.config.arches }}" DIGEST_GLOB="/tmp/digests/*.tsv" @@ -412,11 +475,21 @@ jobs: refs+=("${IMAGE_REPO}@${digest}") done + local annotations=( + --annotation "index:org.opencontainers.image.created=${BUILD_DATE}" + --annotation "index:org.opencontainers.image.version=${SRC_TAG}" + --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}" + --annotation "index:org.opencontainers.image.title=llama.cpp" + --annotation "index:org.opencontainers.image.description=LLM inference in C/C++" + --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}" + --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}" + ) + echo "Creating ${merged_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}" echo "Creating ${merged_versioned_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}" } for tag in $TAGS; do diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index 53f6a0ccfda..59159cd4144 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -15,7 +15,7 @@ concurrency: jobs: editorconfig: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - uses: actions/checkout@v6 - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0 diff --git a/.github/workflows/hip-quality-check.yml b/.github/workflows/hip-quality-check.yml index d00d30ed65c..14b9f41a6ec 100644 --- a/.github/workflows/hip-quality-check.yml +++ b/.github/workflows/hip-quality-check.yml @@ -28,9 +28,9 @@ concurrency: env: GGML_NLOOP: 3 GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: ubuntu-22-hip-quality-check: @@ -50,7 +50,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: ubuntu-22-hip-quality-check + key: hip-quality-check-ubuntu-22.04 evict-old-files: 1d save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml index 7126b62b690..3e440b67d9b 100644 --- a/.github/workflows/pre-tokenizer-hashes.yml +++ b/.github/workflows/pre-tokenizer-hashes.yml @@ -3,16 +3,16 @@ name: Check Pre-Tokenizer Hashes on: push: paths: - - 'convert_hf_to_gguf.py' + - 'conversion/base.py' - 'convert_hf_to_gguf_update.py' pull_request: paths: - - 'convert_hf_to_gguf.py' + - 'conversion/base.py' - 'convert_hf_to_gguf_update.py' jobs: pre-tokenizer-hashes: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - name: Checkout repository @@ -30,16 +30,16 @@ jobs: - name: Update pre-tokenizer hashes run: | - cp convert_hf_to_gguf.py /tmp + cp conversion/base.py /tmp .venv/bin/python convert_hf_to_gguf_update.py --check-missing - name: Check if committed pre-tokenizer hashes matches generated version run: | - if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then - echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)." - echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes" + if ! diff -q conversion/base.py /tmp/base.py; then + echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)." + echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes" echo "Differences found:" - diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true + diff conversion/base.py /tmp/base.py || true exit 1 fi echo "Model pre-tokenizer hashes are up to date." diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 1219b874592..2c7fab40b44 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -20,7 +20,7 @@ concurrency: jobs: python-check-requirements: - runs-on: ubuntu-slim + runs-on: [self-hosted, CPU, fast] name: check-requirements steps: - name: Check out source repository diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 1e5d64c1aee..0424f372a14 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -21,7 +21,7 @@ concurrency: jobs: flake8-lint: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] name: Lint steps: - name: Check out source repository diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index cbeeb39d05b..14edb1a9d17 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -22,7 +22,7 @@ concurrency: jobs: python-type-check: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] name: python type-check steps: - name: Check out source repository diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8b01798ae78..a1642fc2229 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -27,34 +27,54 @@ on: '**/*.glsl' ] -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - env: + GH_TOKEN: ${{ github.token }} BRANCH_NAME: ${{ github.head_ref || github.ref_name }} CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" +# note: run this workflow one at a time for better cache reuse +concurrency: + group: release + queue: max + jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml + check-release: + runs-on: ubuntu-slim - macOS-cpu: - needs: - - webui-build + outputs: + should_release: ${{ steps.check.outputs.should_release }} + + steps: + - id: check + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + echo "should_release=true" >> $GITHUB_OUTPUT + elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then + if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then + echo "should_release=false" >> $GITHUB_OUTPUT + else + echo "should_release=true" >> $GITHUB_OUTPUT + fi + else + echo "should_release=false" >> $GITHUB_OUTPUT + fi + macos-cpu: + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} strategy: matrix: include: - build: 'arm64' arch: 'arm64' - os: macos-14 + os: macos-26 defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON" - - build: 'arm64-kleidiai' - arch: 'arm64' - os: macos-14 - defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON" + # TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23780) + # in order to enable it again, we have to provision dedicated runners to run it + #- build: 'arm64-kleidiai' + # arch: 'arm64' + # os: macos-14 + # defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON" - build: 'x64' arch: 'x64' os: macos-15-intel @@ -64,6 +84,9 @@ jobs: runs-on: ${{ matrix.os }} + permissions: + actions: write + steps: - name: Clone id: checkout @@ -71,17 +94,17 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: macOS-latest-${{ matrix.arch }} - evict-old-files: 1d + key: release-${{ matrix.os }}-${{ matrix.arch }} - name: Build id: cmake_build @@ -96,6 +119,11 @@ jobs: ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-${{ matrix.os }}-${{ matrix.arch }} + - name: Determine tag name id: tag uses: ./.github/actions/get-tag-name @@ -104,7 +132,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -113,9 +141,8 @@ jobs: name: llama-bin-macos-${{ matrix.build }}.tar.gz ubuntu-cpu: - needs: - - webui-build - + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} strategy: matrix: include: @@ -128,6 +155,9 @@ jobs: runs-on: ${{ matrix.os }} + permissions: + actions: write + steps: - name: Clone id: checkout @@ -135,18 +165,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 - with: - name: webui-build - path: tools/server/public/ - - - name: ccache - if: ${{ matrix.build != 's390x' }} - uses: ggml-org/ccache-action@v1.2.21 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - key: ubuntu-cpu-${{ matrix.build }} - evict-old-files: 1d + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Dependencies id: depends @@ -161,6 +185,12 @@ jobs: echo "CC=gcc-14" >> "$GITHUB_ENV" echo "CXX=g++-14" >> "$GITHUB_ENV" + - name: ccache + if: ${{ matrix.build != 's390x' }} + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-${{ matrix.os }}-cpu + - name: Build id: cmake_build run: | @@ -174,6 +204,12 @@ jobs: ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) + - name: ccache-clear + if: ${{ matrix.build != 's390x' }} + uses: ./.github/actions/ccache-clear + with: + key: release-${{ matrix.os }}-cpu + - name: Determine tag name id: tag uses: ./.github/actions/get-tag-name @@ -182,7 +218,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -191,8 +227,8 @@ jobs: name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz ubuntu-vulkan: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} strategy: matrix: @@ -204,6 +240,9 @@ jobs: runs-on: ${{ matrix.os }} + permissions: + actions: write + steps: - name: Clone id: checkout @@ -211,17 +250,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 - with: - name: webui-build - path: tools/server/public/ - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - key: ubuntu-vulkan-${{ matrix.build }} - evict-old-files: 1d + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Dependencies id: depends @@ -238,6 +272,11 @@ jobs: echo "CXX=g++-14" >> "$GITHUB_ENV" fi + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-${{ matrix.os }}-vulkan + - name: Build id: cmake_build run: | @@ -251,6 +290,11 @@ jobs: ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-${{ matrix.os }}-vulkan + - name: Determine tag name id: tag uses: ./.github/actions/get-tag-name @@ -259,7 +303,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -268,11 +312,14 @@ jobs: name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz android-arm64: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: ubuntu-latest + #permissions: + # actions: write + env: NDK_VERSION: "29.0.14206865" @@ -283,17 +330,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: android-arm64 - evict-old-files: 1d + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Set up JDK uses: actions/setup-java@v5 @@ -311,6 +353,17 @@ jobs: sdkmanager "ndk;${{ env.NDK_VERSION }}" echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV + # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789) + # for some reason, the ccache does not improve the build time in this case + # example: + # cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831 + # cache on: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394 + # + #- name: ccache + # uses: ggml-org/ccache-action@v1.2.21 + # with: + # key: release-android-arm64 + - name: Build id: cmake_build run: | @@ -329,6 +382,11 @@ jobs: ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) + #- name: ccache-clear + # uses: ./.github/actions/ccache-clear + # with: + # key: release-android-arm64 + - name: Determine tag name id: tag uses: ./.github/actions/get-tag-name @@ -337,7 +395,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -346,11 +404,14 @@ jobs: name: llama-bin-android-arm64.tar.gz ubuntu-24-openvino: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: ubuntu-24.04 + permissions: + actions: write + outputs: openvino_version: ${{ steps.openvino_version.outputs.value }} @@ -370,17 +431,17 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: ubuntu-24-openvino-release-no-preset-v1 - evict-old-files: 1d + key: release-ubuntu-24.04-openvino-release-no-preset-v1 - name: Dependencies run: | @@ -393,7 +454,7 @@ jobs: id: cache-openvino with: path: ./openvino_toolkit - key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} - name: Setup OpenVINO Toolkit if: steps.cache-openvino.outputs.cache-hit != 'true' @@ -418,6 +479,11 @@ jobs: -DGGML_OPENVINO=ON cmake --build build/ReleaseOV --config Release -j $(nproc) + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-ubuntu-24.04-openvino-release-no-preset-v1 + - name: Determine tag name id: tag uses: ./.github/actions/get-tag-name @@ -426,7 +492,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/ReleaseOV/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -435,11 +501,14 @@ jobs: name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz windows-cpu: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: windows-2025 + permissions: + actions: write + strategy: matrix: include: @@ -452,23 +521,22 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 - with: - name: webui-build - path: tools/server/public/ - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - key: windows-latest-cpu-${{ matrix.arch }} - variant: ccache - evict-old-files: 1d + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install Ninja run: | choco install ninja + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-windows-2025-${{ matrix.arch }}-cpu + - name: Build shell: cmd run: | @@ -483,6 +551,11 @@ jobs: ${{ env.CMAKE_ARGS }} cmake --build build --config Release + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-windows-2025-${{ matrix.arch }}-cpu + - name: Pack artifacts id: pack_artifacts run: | @@ -496,11 +569,14 @@ jobs: name: llama-bin-win-cpu-${{ matrix.arch }}.zip windows: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: windows-2025 + permissions: + actions: write + env: OPENBLAS_VERSION: 0.3.23 VULKAN_VERSION: 1.4.313.2 @@ -522,18 +598,12 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 - with: - name: webui-build - path: tools/server/public/ - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }} - variant: ccache - evict-old-files: 1d + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install Vulkan SDK id: get_vulkan @@ -549,6 +619,11 @@ jobs: run: | choco install ninja + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }} + - name: Install OpenCL Headers and Libs id: install_opencl if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }} @@ -575,6 +650,11 @@ jobs: cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --config Release --target ${{ matrix.target }} + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }} + - name: Pack artifacts id: pack_artifacts run: | @@ -587,32 +667,29 @@ jobs: name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip windows-cuda: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: windows-2022 + permissions: + actions: write + strategy: matrix: - cuda: ['12.4', '13.1'] + cuda: ['12.4', '13.3'] steps: - name: Clone id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 - with: - name: webui-build - path: tools/server/public/ - - - name: Install ccache - uses: ggml-org/ccache-action@v1.2.21 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - key: windows-cuda-${{ matrix.cuda }} - variant: ccache - evict-old-files: 1d + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install Cuda Toolkit uses: ./.github/actions/windows-setup-cuda @@ -624,6 +701,11 @@ jobs: run: | choco install ninja + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-windows-2022-x64-cuda-${{ matrix.cuda }} + - name: Build id: cmake_build shell: cmd @@ -640,6 +722,11 @@ jobs: set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-windows-2022-x64-cuda-${{ matrix.cuda }} + - name: Pack artifacts id: pack_artifacts run: | @@ -666,221 +753,220 @@ jobs: path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - windows-sycl: - needs: - - webui-build - - runs-on: windows-2022 - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - ONEAPI_INSTALLER_VERSION: "2025.3.3" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: Use oneAPI Installation Cache - uses: actions/cache@v5 - id: cache-sycl - with: - path: ${{ env.ONEAPI_ROOT }} - key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} - - - name: Download & Install oneAPI - shell: bash - if: steps.cache-sycl.outputs.cache-hit != 'true' - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - - name: Install Level Zero SDK - shell: pwsh - run: | - Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" - Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force - "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append - - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 - with: - name: webui-build - path: tools/server/public/ - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: windows-latest-sycl - variant: ccache - evict-old-files: 1d - - - name: Build - id: cmake_build - shell: cmd - run: | - call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force - cmake -G "Ninja" -B build ^ - -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ - -DCMAKE_BUILD_TYPE=Release ^ - -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ - -DGGML_CPU=OFF -DGGML_SYCL=ON ^ - -DLLAMA_BUILD_BORINGSSL=ON - cmake --build build --target ggml-sycl -j - - - name: Build the release package - id: pack_artifacts - run: | - echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true) - if [ -n "$ZE_LOADER_DLL" ]; then - echo "Using Level Zero loader: $ZE_LOADER_DLL" - cp "$ZE_LOADER_DLL" ./build/bin - else - echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime" - fi - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin - - echo "cp oneAPI running time dll files to ./build/bin done" - 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/* - - - name: Upload the release package - uses: actions/upload-artifact@v6 - with: - path: llama-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip - - ubuntu-24-sycl: - needs: - - webui-build - - strategy: - matrix: - build: [fp32, fp16] - include: - - build: fp32 - fp16: OFF - - build: fp16 - fp16: ON - - runs-on: ubuntu-24.04 - - env: - ONEAPI_ROOT: /opt/intel/oneapi/ - ONEAPI_INSTALLER_VERSION: "2025.3.3" - LEVEL_ZERO_VERSION: "1.28.2" - LEVEL_ZERO_UBUNTU_VERSION: "u24.04" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Use oneAPI Installation Cache - uses: actions/cache@v5 - id: cache-sycl - with: - path: ${{ env.ONEAPI_ROOT }} - key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} - - - name: Download & Install oneAPI - shell: bash - if: steps.cache-sycl.outputs.cache-hit != 'true' - run: | - cd /tmp - wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh - sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept - - - name: Install Level Zero SDK - shell: bash - run: | - cd /tmp - wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb - wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb - sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb - - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 - with: - name: webui-build - path: tools/server/public/ - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-24-sycl-${{ matrix.build }} - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -G "Ninja" \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DLLAMA_OPENSSL=OFF \ - -DGGML_NATIVE=OFF \ - -DGGML_SYCL_F16=${{ matrix.fp16 }} - time cmake --build build --config Release -j $(nproc) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . - - - name: Upload artifacts - uses: actions/upload-artifact@v6 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz - name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz +# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) +# in order to enable it again, we have to provision dedicated runners to run it +# windows-sycl: +# +# runs-on: windows-2022 +# +# defaults: +# run: +# shell: bash +# +# env: +# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe +# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel +# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip +# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" +# ONEAPI_INSTALLER_VERSION: "2025.3.3" +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# +# - name: Use oneAPI Installation Cache +# uses: actions/cache@v5 +# id: cache-sycl +# with: +# path: ${{ env.ONEAPI_ROOT }} +# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} +# +# - name: Download & Install oneAPI +# shell: bash +# if: steps.cache-sycl.outputs.cache-hit != 'true' +# run: | +# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL +# +# - name: Install Level Zero SDK +# shell: pwsh +# run: | +# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" +# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force +# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append +# +# - name: Setup Node.js +# uses: actions/setup-node@v6 +# with: +# node-version: "24" +# cache: "npm" +# cache-dependency-path: "tools/ui/package-lock.json" +# +# - name: ccache +# uses: ggml-org/ccache-action@v1.2.21 +# with: +# key: release-windows-2022-x64-sycl +# +# - name: Build +# id: cmake_build +# shell: cmd +# run: | +# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force +# cmake -G "Ninja" -B build ^ +# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ +# -DCMAKE_BUILD_TYPE=Release ^ +# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ +# -DGGML_CPU=OFF -DGGML_SYCL=ON ^ +# -DLLAMA_BUILD_BORINGSSL=ON +# cmake --build build --target ggml-sycl -j +# +# - name: Build the release package +# id: pack_artifacts +# run: | +# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" +# +# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin +# +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin +# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true) +# if [ -n "$ZE_LOADER_DLL" ]; then +# echo "Using Level Zero loader: $ZE_LOADER_DLL" +# cp "$ZE_LOADER_DLL" ./build/bin +# else +# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime" +# fi +# +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin +# +# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin +# +# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin +# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin +# +# echo "cp oneAPI running time dll files to ./build/bin done" +# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/* +# +# - name: Upload the release package +# uses: actions/upload-artifact@v6 +# with: +# path: llama-bin-win-sycl-x64.zip +# name: llama-bin-win-sycl-x64.zip + +# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) +# in order to enable it again, we have to provision dedicated runners to run it +# ubuntu-24-sycl: +# +# strategy: +# matrix: +# build: [fp32] +# include: +# - build: fp32 +# fp16: OFF +# +# runs-on: ubuntu-24.04 +# +# env: +# ONEAPI_ROOT: /opt/intel/oneapi/ +# ONEAPI_INSTALLER_VERSION: "2025.3.3" +# LEVEL_ZERO_VERSION: "1.28.2" +# LEVEL_ZERO_UBUNTU_VERSION: "u24.04" +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# with: +# fetch-depth: 0 +# +# - name: Use oneAPI Installation Cache +# uses: actions/cache@v5 +# id: cache-sycl +# with: +# path: ${{ env.ONEAPI_ROOT }} +# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} +# +# - name: Download & Install oneAPI +# shell: bash +# if: steps.cache-sycl.outputs.cache-hit != 'true' +# run: | +# cd /tmp +# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh +# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept +# +# - name: Install Level Zero SDK +# shell: bash +# run: | +# cd /tmp +# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb +# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb +# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb +# +# - name: Setup Node.js +# uses: actions/setup-node@v6 +# with: +# node-version: "24" +# cache: "npm" +# cache-dependency-path: "tools/ui/package-lock.json" +# +# - name: ccache +# uses: ggml-org/ccache-action@v1.2.21 +# with: +# key: release-ubuntu-24.04-sycl +# +# - name: Build +# id: cmake_build +# run: | +# source /opt/intel/oneapi/setvars.sh +# cmake -B build \ +# -G "Ninja" \ +# -DCMAKE_BUILD_TYPE=Release \ +# -DGGML_SYCL=ON \ +# -DCMAKE_C_COMPILER=icx \ +# -DCMAKE_CXX_COMPILER=icpx \ +# -DLLAMA_OPENSSL=OFF \ +# -DGGML_NATIVE=OFF \ +# -DGGML_SYCL_F16=${{ matrix.fp16 }} +# time cmake --build build --config Release -j $(nproc) +# +# - name: Determine tag name +# id: tag +# uses: ./.github/actions/get-tag-name +# +# - name: Pack artifacts +# id: pack_artifacts +# run: | +# cp LICENSE ./build/bin/ +# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . +# +# - name: Upload artifacts +# uses: actions/upload-artifact@v6 +# with: +# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz +# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz ubuntu-22-rocm: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: ubuntu-22.04 + permissions: + actions: write + strategy: matrix: include: @@ -895,11 +981,12 @@ jobs: with: fetch-depth: 0 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Free up disk space uses: ggml-org/free-disk-space@v1.3.1 @@ -909,8 +996,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }} - evict-old-files: 1d + key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }} - name: Dependencies id: depends @@ -968,6 +1054,11 @@ jobs: ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }} + - name: Determine tag name id: tag uses: ./.github/actions/get-tag-name @@ -979,7 +1070,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -988,11 +1079,14 @@ jobs: name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz windows-hip: - needs: - - webui-build + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: windows-2022 + permissions: + actions: write + env: HIPSDK_INSTALLER_VERSION: "26.Q1" @@ -1007,11 +1101,12 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Grab rocWMMA package id: grab_rocwmma @@ -1025,13 +1120,12 @@ jobs: uses: actions/cache@v5 with: path: C:\Program Files\AMD\ROCm - key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} + key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: - key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64 - evict-old-files: 1d + key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }} - name: Install ROCm if: steps.cache-rocm.outputs.cache-hit != 'true' @@ -1091,6 +1185,11 @@ jobs: cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }} + - name: Pack artifacts id: pack_artifacts run: | @@ -1102,8 +1201,10 @@ jobs: path: llama-bin-win-hip-${{ matrix.name }}-x64.zip name: llama-bin-win-hip-${{ matrix.name }}-x64.zip - ios-xcode-build: - runs-on: macos-15 + ios-xcode: + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} + runs-on: macos-26 steps: - name: Checkout code @@ -1113,7 +1214,7 @@ jobs: - name: Setup Xcode run: | - sudo xcode-select -s /Applications/Xcode_16.4.app + sudo xcode-select -s /Applications/Xcode_26.4.app - name: Build id: cmake_build @@ -1123,12 +1224,13 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO @@ -1157,96 +1259,102 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-xcframework.zip name: llama-${{ steps.tag.outputs.name }}-xcframework.zip - - openEuler-cann: - strategy: - matrix: - include: - # 910b with aclgraph (both architectures) - - arch: x86 - chip_type: '910b' - build: 'Release' - use_acl_graph: 'on' - - arch: aarch64 - chip_type: '910b' - build: 'Release' - use_acl_graph: 'on' - # 310p without aclgraph (both architectures) - - arch: x86 - chip_type: '310p' - build: 'Release' - use_acl_graph: 'off' - - arch: aarch64 - chip_type: '310p' - build: 'Release' - use_acl_graph: 'off' - runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - steps: - - name: Checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - - name: Free up disk space - uses: ggml-org/free-disk-space@v1.3.1 - with: - tool-cache: true - - - name: Set container image - id: cann-image - run: | - image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}" - echo "image=${image}" >> "${GITHUB_OUTPUT}" - - - name: Pull container image - run: docker pull "${{ steps.cann-image.outputs.image }}" - - - name: Build - env: - BUILD_TYPE: ${{ matrix.build }} - SOC_TYPE: ascend${{ matrix.chip_type }} - USE_ACL_GRAPH: ${{ matrix.use_acl_graph }} - run: | - HOST_UID=$(id -u) - HOST_GID=$(id -g) - - docker run --rm \ - -v "${PWD}:/workspace" \ - -w /workspace \ - -e SOC_TYPE=${SOC_TYPE} \ - -e BUILD_TYPE=${BUILD_TYPE} \ - -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \ - "${{ steps.cann-image.outputs.image }}" \ - bash -lc ' - set -e - yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel - yum clean all && rm -rf /var/cache/yum - git config --global --add safe.directory "/workspace" - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - cmake -S . -B build \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${SOC_TYPE} \ - -DUSE_ACL_GRAPH=${USE_ACL_GRAPH} - cmake --build build -j $(nproc) - - chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build - ' - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - run: | - cp LICENSE ./build/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . - - - name: Upload artifacts - uses: actions/upload-artifact@v6 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz - name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz +# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) +# in order to enable it again, we have to provision dedicated runners to run it +# openEuler-cann: +# strategy: +# matrix: +# include: +# # 910b with aclgraph (both architectures) +# - arch: x86 +# chip_type: '910b' +# build: 'Release' +# use_acl_graph: 'on' +# - arch: aarch64 +# chip_type: '910b' +# build: 'Release' +# use_acl_graph: 'on' +# # 310p without aclgraph (both architectures) +# - arch: x86 +# chip_type: '310p' +# build: 'Release' +# use_acl_graph: 'off' +# - arch: aarch64 +# chip_type: '310p' +# build: 'Release' +# use_acl_graph: 'off' +# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} +# steps: +# - name: Checkout +# uses: actions/checkout@v6 +# with: +# fetch-depth: 0 +# +# - name: Free up disk space +# uses: ggml-org/free-disk-space@v1.3.1 +# with: +# tool-cache: true +# +# - name: Set container image +# id: cann-image +# run: | +# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}" +# echo "image=${image}" >> "${GITHUB_OUTPUT}" +# +# - name: Pull container image +# run: docker pull "${{ steps.cann-image.outputs.image }}" +# +# - name: Build +# env: +# BUILD_TYPE: ${{ matrix.build }} +# SOC_TYPE: ascend${{ matrix.chip_type }} +# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }} +# run: | +# HOST_UID=$(id -u) +# HOST_GID=$(id -g) +# +# docker run --rm \ +# -v "${PWD}:/workspace" \ +# -w /workspace \ +# -e SOC_TYPE=${SOC_TYPE} \ +# -e BUILD_TYPE=${BUILD_TYPE} \ +# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \ +# "${{ steps.cann-image.outputs.image }}" \ +# bash -lc ' +# set -e +# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel +# yum clean all && rm -rf /var/cache/yum +# git config --global --add safe.directory "/workspace" +# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} +# cmake -S . -B build \ +# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ +# -DGGML_CANN=on \ +# -DSOC_TYPE=${SOC_TYPE} \ +# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH} +# cmake --build build -j $(nproc) +# +# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build +# ' +# +# - name: Determine tag name +# id: tag +# uses: ./.github/actions/get-tag-name +# +# - name: Pack artifacts +# run: | +# cp LICENSE ./build/bin/ +# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . +# +# - name: Upload artifacts +# uses: actions/upload-artifact@v6 +# with: +# path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz +# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz + + ui: + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} + uses: ./.github/workflows/ui-build.yml release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -1259,21 +1367,21 @@ jobs: runs-on: ubuntu-slim needs: - - webui-build - windows - windows-cpu - windows-cuda - - windows-sycl + #- windows-sycl - windows-hip - ubuntu-22-rocm - ubuntu-cpu - ubuntu-vulkan - ubuntu-24-openvino - - ubuntu-24-sycl + #- ubuntu-24-sycl - android-arm64 - - macOS-cpu - - ios-xcode-build - - openEuler-cann + - macos-cpu + - ios-xcode + #- openEuler-cann + - ui outputs: tag_name: ${{ steps.tag.outputs.name }} @@ -1333,6 +1441,18 @@ jobs: mv -v artifact/*.zip release mv -v artifact/*.tar.gz release + - name: Download UI build + id: download_ui + uses: actions/download-artifact@v7 + with: + name: ui-build + path: ./ui-dist + + - name: Package UI + id: package_ui + run: | + tar -czvf release/llama-${{ steps.tag.outputs.name }}-ui.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./ui-dist . + - name: Create release id: create_release uses: ggml-org/action-create-release@v1 @@ -1349,7 +1469,7 @@ jobs: **macOS/iOS:** - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz) - - [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz) + - macOS Apple Silicon (arm64, KleidiAI enabled) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23780) - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz) - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip) @@ -1361,8 +1481,7 @@ jobs: - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz) - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz) - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz) - - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz) - - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz) + - Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705) **Android:** - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz) @@ -1371,16 +1490,20 @@ jobs: - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip) - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip) - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip) - - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip) + - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip) - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip) - - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip) + - Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705) - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip) **openEuler:** - - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz) - - [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz) - - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz) - - [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz) + - [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705) + - openEuler x86 (310p) + - openEuler x86 (910b, ACL Graph) + - openEuler aarch64 (310p) + - openEuler aarch64 (910b, ACL Graph) + + **UI:** + - [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz) - name: Upload release id: upload_release @@ -1404,14 +1527,14 @@ jobs: } } - webui-publish: + ui-publish: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} needs: - release - uses: ./.github/workflows/webui-publish.yml + uses: ./.github/workflows/ui-publish.yml with: version_tag: ${{ needs.release.outputs.tag_name }} secrets: - hf_token: ${{ secrets.HF_TOKEN_WEBUI_STATIC_OUTPUT }} + hf_token: ${{ secrets.HF_TOKEN_UI_STATIC_OUTPUT }} diff --git a/.github/workflows/server-sanitize.yml b/.github/workflows/server-sanitize.yml index 4c9f447cf8b..c0817cbba87 100644 --- a/.github/workflows/server-sanitize.yml +++ b/.github/workflows/server-sanitize.yml @@ -26,10 +26,10 @@ on: ] env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_VERBOSITY: 10 concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} @@ -37,7 +37,7 @@ concurrency: jobs: server: - runs-on: ubuntu-latest + runs-on: [self-hosted, CPU, Linux, llama-server] strategy: matrix: @@ -46,19 +46,19 @@ jobs: fail-fast: false steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libssl-dev + #- name: Dependencies + # id: depends + # run: | + # sudo apt-get update + # sudo apt-get -y install \ + # build-essential \ + # xxd \ + # git \ + # cmake \ + # curl \ + # wget \ + # language-pack-en \ + # libssl-dev - name: Clone id: checkout @@ -67,6 +67,13 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" + - name: Build id: cmake_build run: | diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index 117c79c2f87..b9baede58b3 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -29,22 +29,17 @@ on: ] env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_VERBOSITY: 10 concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml - server-metal: - needs: webui-build runs-on: [self-hosted, llama-server, macOS, ARM64] name: server-metal (${{ matrix.wf_name }}) @@ -72,11 +67,12 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: Setup Node.js + uses: actions/setup-node@v6 with: - name: webui-build - path: tools/server/public/ + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" - name: Build id: cmake_build @@ -95,42 +91,106 @@ jobs: export ${{ matrix.extra_args }} pytest -v -x -m "not slow" - # TODO: provision CUDA runner - # server-cuda: - # runs-on: [self-hosted, llama-server, Linux, NVIDIA] - # - # name: server-cuda (${{ matrix.wf_name }}) - # strategy: - # matrix: - # build_type: [Release] - # wf_name: ["GPUx1"] - # include: - # - build_type: Release - # extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" - # wf_name: "GPUx1, backend-sampling" - # fail-fast: false - # - # steps: - # - name: Clone - # id: checkout - # uses: actions/checkout@v6 - # with: - # fetch-depth: 0 - # ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - # - # - name: Build - # id: cmake_build - # run: | - # cmake -B build -DGGML_SCHED_NO_REALLOC=ON - # cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server - # - # - name: Tests - # id: server_integration_tests - # if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} - # run: | - # cd tools/server/tests - # python3 -m venv venv - # source venv/bin/activate - # pip install -r requirements.txt - # export ${{ matrix.extra_args }} - # pytest -v -x -m "not slow" + server-cuda: + runs-on: [self-hosted, llama-server, Linux, NVIDIA] + + name: server-cuda (${{ matrix.wf_name }}) + strategy: + matrix: + build_type: [Release] + wf_name: ["GPUx1"] + include: + - build_type: Release + extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "GPUx1, backend-sampling" + fail-fast: false + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Build + id: cmake_build + run: | + cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + + - name: Tests + id: server_integration_tests + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} + run: | + cd tools/server/tests + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + export ${{ matrix.extra_args }} + pytest -v -x -m "not slow" + + server-kleidiai: + runs-on: ah-ubuntu_22_04-c8g_8x + + name: server-kleidiai (${{ matrix.wf_name }}) + strategy: + matrix: + include: + - build_type: Release + extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON" + extra_args: "" + wf_name: "CPUx1, kleidiai" + fail-fast: false + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Dependencies + id: depends + run: | + set -euxo pipefail + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ + apt-get install -y \ + build-essential \ + libssl-dev \ + python3-venv \ + gpg \ + wget \ + time \ + git-lfs + + git lfs install + + # install the latest cmake + sudo install -d /usr/share/keyrings + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor \ + | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \ + | sudo tee /etc/apt/sources.list.d/kitware.list + sudo apt-get update + sudo apt-get install -y cmake + + - name: Build + id: cmake_build + run: | + cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }} + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + + - name: Tests + id: server_integration_tests + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} + run: | + cd tools/server/tests + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + export ${{ matrix.extra_args }} + pytest -v -x -m "not slow" diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index f12e7b7366f..5a02cc15ad5 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -44,37 +44,18 @@ on: ] env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_VERBOSITY: 10 concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml - - server: - needs: webui-build - runs-on: ubuntu-latest - - name: server (${{ matrix.wf_name }}) - strategy: - matrix: - build_type: [Release] - wf_name: ["default"] - include: - - build_type: Release - extra_args: "" - wf_name: "default" - - build_type: Release - extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" - wf_name: "backend-sampling" - fail-fast: false + ubuntu: + runs-on: ubuntu-24.04-arm steps: - name: Dependencies @@ -98,19 +79,19 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 with: - name: webui-build - path: tools/server/public/ + key: server-ubuntu-24.04-arm + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build run: | cmake -B build \ - -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_SCHED_NO_REALLOC=ON - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + cmake --build build --config Release -j $(nproc) --target llama-server - name: Python setup id: setup_python @@ -121,23 +102,34 @@ jobs: - name: Tests id: server_integration_tests - if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} run: | cd tools/server/tests - export ${{ matrix.extra_args }} pytest -v -x -m "not slow" - name: Slow tests id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} + if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }} run: | cd tools/server/tests - export ${{ matrix.extra_args }} SLOW_TESTS=1 pytest -v -x - server-windows: - needs: webui-build - runs-on: windows-2022 + - name: Tests (Backend sampling) + id: server_integration_tests_backend_sampling + run: | + cd tools/server/tests + export LLAMA_ARG_BACKEND_SAMPLING=1 + pytest -v -x -m "not slow" + + - name: Slow tests (Backend sampling) + id: server_integration_tests_slow_backend_sampling + if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }} + run: | + cd tools/server/tests + export LLAMA_ARG_BACKEND_SAMPLING=1 + SLOW_TESTS=1 pytest -v -x + + windows: + runs-on: windows-2025 steps: - name: Clone @@ -147,17 +139,24 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - name: Download WebUI build artifact - uses: actions/download-artifact@v7 + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 with: - name: webui-build - path: tools/server/public/ + key: server-windows-2025-x64 + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build + shell: cmd run: | - cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server + cmake -B build -G "Ninja Multi-Config" ^ + -DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DLLAMA_BUILD_BORINGSSL=ON ^ + -DGGML_SCHED_NO_REALLOC=ON + set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 + cmake --build build --config Release -j %NINJA_JOBS% --target llama-server - name: Python setup id: setup_python @@ -168,7 +167,6 @@ jobs: - name: Tests id: server_integration_tests - if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd tools/server/tests $env:PYTHONIOENCODING = ":replace" @@ -176,7 +174,7 @@ jobs: - name: Slow tests id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} + if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }} run: | cd tools/server/tests $env:SLOW_TESTS = "1" diff --git a/.github/workflows/ui-build-self-hosted.yml b/.github/workflows/ui-build-self-hosted.yml new file mode 100644 index 00000000000..e5d576cda62 --- /dev/null +++ b/.github/workflows/ui-build-self-hosted.yml @@ -0,0 +1,43 @@ +name: UI Build (self-hosted) + +on: + workflow_call: + +jobs: + build: + runs-on: [self-hosted, fast] + env: + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" + + - name: Install dependencies + run: npm ci + working-directory: tools/ui + + - name: Build application + run: npm run build + working-directory: tools/ui + + - name: Generate checksums + run: | + cd tools/ui/dist + for f in *; do + sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt + done + + - name: Upload built UI + uses: actions/upload-artifact@v6 + with: + name: ui-build + path: tools/ui/dist/ + retention-days: 1 diff --git a/.github/workflows/webui-build.yml b/.github/workflows/ui-build.yml similarity index 67% rename from .github/workflows/webui-build.yml rename to .github/workflows/ui-build.yml index 7f512a69d3d..92b0573fb8d 100644 --- a/.github/workflows/webui-build.yml +++ b/.github/workflows/ui-build.yml @@ -1,11 +1,10 @@ -name: Build WebUI +name: UI Build on: workflow_call: jobs: build: - name: Build WebUI runs-on: ubuntu-slim env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -19,26 +18,26 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Build application run: npm run build - working-directory: tools/server/webui + working-directory: tools/ui - name: Generate checksums run: | - cd tools/server/public + cd tools/ui/dist for f in *; do sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt done - - name: Upload built webui + - name: Upload built UI uses: actions/upload-artifact@v6 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: tools/ui/dist/ retention-days: 1 diff --git a/.github/workflows/webui-publish.yml b/.github/workflows/ui-publish.yml similarity index 75% rename from .github/workflows/webui-publish.yml rename to .github/workflows/ui-publish.yml index bf0d707a23e..cec0fa52a12 100644 --- a/.github/workflows/webui-publish.yml +++ b/.github/workflows/ui-publish.yml @@ -1,4 +1,4 @@ -name: WebUI Publish +name: UI Publish on: workflow_call: @@ -13,15 +13,20 @@ on: required: true jobs: + build: + name: Build static output + uses: ./.github/workflows/ui-build.yml + publish: - name: Publish WebUI Static Output - runs-on: ubuntu-24.04-arm + name: Publish UI Static Output + needs: build + runs-on: ubuntu-slim permissions: contents: read env: - HF_BUCKET_NAME: ${{ vars.HF_BUCKET_WEBUI_STATIC_OUTPUT }} + HF_BUCKET_NAME: ${{ vars.HF_BUCKET_UI_STATIC_OUTPUT }} steps: - name: Checkout code @@ -29,11 +34,11 @@ jobs: with: fetch-depth: 1 - - name: Download WebUI build artifact + - name: Download UI build artifact uses: actions/download-artifact@v7 with: - name: webui-build - path: tools/server/public/ + name: ui-build + path: tools/ui/dist/ - name: Install Hugging Face Hub CLI run: pip install -U huggingface_hub @@ -44,12 +49,12 @@ jobs: - name: Sync built files to Hugging Face bucket (version tag) run: | # Upload the built files to the Hugging Face bucket under the release version - hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet + hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet - name: Sync built files to Hugging Face bucket (latest) run: | # Also upload to the 'latest' directory for fallback downloads - hf buckets sync tools/server/public hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet + hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet - name: Verify upload run: | diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml new file mode 100644 index 00000000000..5457d900c87 --- /dev/null +++ b/.github/workflows/ui-self-hosted.yml @@ -0,0 +1,118 @@ +name: UI (self-hosted) + +# these are the same as ui.yml, but with self-hosted runners +# the runners come with pre-installed Playwright browsers version: 1.56.1 +# the jobs are much lighter because they don't need to install node and playwright browsers + +on: + workflow_dispatch: + inputs: + sha: + description: 'Commit SHA1 to build' + required: false + type: string + push: + branches: + - master + paths: [ + '.github/workflows/ui-self-hosted.yml', + '.github/workflows/ui-build-self-hosted.yml', + 'tools/ui/**.*', + 'tools/server/tests/**.*' + ] + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/ui-self-hosted.yml', + '.github/workflows/ui-build-self-hosted.yml', + 'tools/ui/**.*', + 'tools/server/tests/**.*' + ] + +env: + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_VERBOSITY: 10 + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + ui-build: + name: Build static output + uses: ./.github/workflows/ui-build-self-hosted.yml + + ui-checks: + name: Checks + needs: ui-build + runs-on: [self-hosted, PLAYWRIGHT] + continue-on-error: true + steps: + - name: Checkout code + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Install dependencies + id: setup + run: npm ci + working-directory: tools/ui + + - name: Run type checking + if: ${{ always() && steps.setup.conclusion == 'success' }} + run: npm run check + working-directory: tools/ui + + - name: Run linting + if: ${{ always() && steps.setup.conclusion == 'success' }} + run: npm run lint + working-directory: tools/ui + + - name: Run Client tests + if: ${{ always() }} + run: npm run test:client + working-directory: tools/ui + + - name: Run Unit tests + if: ${{ always() }} + run: npm run test:unit + working-directory: tools/ui + + e2e-tests: + name: E2E Tests + needs: ui-build + runs-on: [self-hosted, PLAYWRIGHT] + steps: + - name: Checkout code + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Install dependencies + id: setup + run: npm ci + working-directory: tools/ui + + - name: Build application + if: ${{ always() && steps.setup.conclusion == 'success' }} + run: npm run build + working-directory: tools/ui + + - name: Build Storybook + if: ${{ always() }} + run: npm run build-storybook + working-directory: tools/ui + + - name: Run UI tests + if: ${{ always() }} + run: npm run test:ui -- --testTimeout=60000 + working-directory: tools/ui + + - name: Run E2E tests + if: ${{ always() }} + run: npm run test:e2e + working-directory: tools/ui diff --git a/.github/workflows/server-webui.yml b/.github/workflows/ui.yml similarity index 71% rename from .github/workflows/server-webui.yml rename to .github/workflows/ui.yml index 72c2016efe7..b3712e45059 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/ui.yml @@ -1,4 +1,4 @@ -name: Server WebUI +name: UI on: workflow_dispatch: @@ -11,37 +11,39 @@ on: branches: - master paths: [ - '.github/workflows/server-webui.yml', - 'tools/server/webui/**.*', + '.github/workflows/ui.yml', + '.github/workflows/ui-build.yml', + 'tools/ui/**.*', 'tools/server/tests/**.*' ] pull_request: types: [opened, synchronize, reopened] paths: [ - '.github/workflows/server-webui.yml', - 'tools/server/webui/**.*', + '.github/workflows/ui.yml', + '.github/workflows/ui-build.yml', + 'tools/ui/**.*', 'tools/server/tests/**.*' ] env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 + LLAMA_ARG_LOG_COLORS: 1 + LLAMA_ARG_LOG_PREFIX: 1 + LLAMA_ARG_LOG_TIMESTAMPS: 1 + LLAMA_ARG_LOG_VERBOSITY: 10 concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: - webui-build: - name: Build WebUI - uses: ./.github/workflows/webui-build.yml - - webui-checks: - name: WebUI Checks - needs: webui-build - runs-on: ubuntu-24.04-arm + ui-build: + name: Build static output + uses: ./.github/workflows/ui-build.yml + + ui-checks: + name: Checks + needs: ui-build + runs-on: ubuntu-latest continue-on-error: true steps: - name: Checkout code @@ -56,44 +58,44 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies id: setup if: ${{ steps.node.conclusion == 'success' }} run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Run type checking if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run check - working-directory: tools/server/webui + working-directory: tools/ui - name: Run linting if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run lint - working-directory: tools/server/webui + working-directory: tools/ui - name: Install Playwright browsers id: playwright if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps - working-directory: tools/server/webui + working-directory: tools/ui - name: Run Client tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:client - working-directory: tools/server/webui + working-directory: tools/ui - name: Run Unit tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:unit - working-directory: tools/server/webui + working-directory: tools/ui e2e-tests: name: E2E Tests - needs: webui-build - runs-on: ubuntu-24.04-arm + needs: ui-build + runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v6 @@ -107,36 +109,36 @@ jobs: with: node-version: "24" cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" + cache-dependency-path: "tools/ui/package-lock.json" - name: Install dependencies id: setup if: ${{ steps.node.conclusion == 'success' }} run: npm ci - working-directory: tools/server/webui + working-directory: tools/ui - name: Build application if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run build - working-directory: tools/server/webui + working-directory: tools/ui - name: Install Playwright browsers id: playwright if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps - working-directory: tools/server/webui + working-directory: tools/ui - name: Build Storybook if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run build-storybook - working-directory: tools/server/webui + working-directory: tools/ui - name: Run UI tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:ui -- --testTimeout=60000 - working-directory: tools/server/webui + working-directory: tools/ui - name: Run E2E tests if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e - working-directory: tools/server/webui + working-directory: tools/ui diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml index 2ab06eb9811..6e8bc1aa07c 100644 --- a/.github/workflows/update-ops-docs.yml +++ b/.github/workflows/update-ops-docs.yml @@ -3,18 +3,20 @@ name: Update Operations Documentation on: push: paths: + - '.github/workflows/update-ops-docs.yml' - 'docs/ops.md' - 'docs/ops/**' - 'scripts/create_ops_docs.py' pull_request: paths: + - '.github/workflows/update-ops-docs.yml' - 'docs/ops.md' - 'docs/ops/**' - 'scripts/create_ops_docs.py' jobs: update-ops-docs: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast, ARM64] steps: - name: Checkout repository diff --git a/.gitignore b/.gitignore index 5f53fbf7a61..8dc9d7d0b8d 100644 --- a/.gitignore +++ b/.gitignore @@ -54,7 +54,6 @@ /tmp/ /autogen-*.md /common/build-info.cpp -/tools/server/public # Deprecated @@ -93,10 +92,12 @@ !/examples/sycl/*.bat !/examples/sycl/*.sh -# Server Web UI temporary files +# Server Web UI temporary files (+ legacy directory) /tools/server/webui/node_modules /tools/server/webui/dist +/tools/ui/node_modules +/tools/ui/dist # Python diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md index 727a850b183..06d97ae78ee 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -1,7 +1,7 @@ You are a coding agent. Here are some very important rules that you must follow: General: -- By very precise and concise when writing code, comments, explanations, etc. +- Be very precise and concise when writing code, comments, explanations, etc. - PR and commit titles format: ` : `. Lookup recents for examples - Don't try to build or run the code unless you are explicitly asked to do so - Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources @@ -16,12 +16,15 @@ Pull requests (PRs): - New branch names are prefixed with "gg/" - Before opening a pull request, ask the user to confirm the description - When creating a pull request, look for the repository's PR template and follow it -- For the AI usage disclosure section, write "YES. llama.cpp + pi" +- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]" +- Ask the user to tell you what model was used and write it in place of [MODEL] - Always create the pull requests in draft mode Commits: - On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag - Do not explicitly set the git author in commits - rely on the default git config +- Always use `--no-gpg-sign` when committing +- Never `git push` without explicit confirmation from the user Resources (read on demand): - [CONTRIBUTING.md](CONTRIBUTING.md) diff --git a/CMakeLists.txt b/CMakeLists.txt index 36761bf9835..eabff576cc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,14 +104,16 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE}) # extra artifacts -option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON) -option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket when available (requires LLAMA_BUILD_WEBUI=ON)" ON) -option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) -option(LLAMA_TESTS_INSTALL "llama: install tests" ON) +option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_APP "llama: build the unified binary" ${LLAMA_STANDALONE}) +option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON) +option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON) + +option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) +option(LLAMA_TESTS_INSTALL "llama: install tests" ON) # 3rd party libs option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON) @@ -221,17 +223,8 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS) add_subdirectory(tools) endif() -# Automatically add all files from the 'licenses' directory -file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*") - -foreach(FILE_PATH ${EXTRA_LICENSES}) - get_filename_component(FILE_NAME "${FILE_PATH}" NAME) - string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}") - license_add_file("${NAME}" "${FILE_PATH}") -endforeach() - -if (LLAMA_BUILD_COMMON) - license_generate(llama-common) +if (LLAMA_BUILD_APP) + add_subdirectory(app) endif() # @@ -276,18 +269,6 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama) -install( - FILES convert_hf_to_gguf.py - PERMISSIONS - OWNER_READ - OWNER_WRITE - OWNER_EXECUTE - GROUP_READ - GROUP_EXECUTE - WORLD_READ - WORLD_EXECUTE - DESTINATION ${CMAKE_INSTALL_BINDIR}) - configure_file(cmake/llama.pc.in "${CMAKE_CURRENT_BINARY_DIR}/llama.pc" @ONLY) diff --git a/CODEOWNERS b/CODEOWNERS index a4395969fe3..4b9d9017715 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -15,7 +15,7 @@ # ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin # ggml-org/llama-mtmd : ngxson # ggml-org/llama-server : ggerganov, ngxson, allozaur, angt, ServeurpersoCom -# ggml-org/llama-webui : allozaur +# ggml-org/llama-ui : allozaur /.devops/*.Dockerfile @ngxson /.github/actions/ @ggml-org/ci @@ -26,6 +26,7 @@ /common/fit.* @JohannesGaessler /common/jinja/ @CISC /common/ngram-map.* @srogmann +/conversion/ @CISC /convert_*.py @CISC /docs/backend/snapdragon/ @ggml-org/ggml-hexagon /examples/batched.swift/ @ggerganov @@ -48,7 +49,6 @@ /examples/parallel/ @ggerganov /examples/passkey/ @ggerganov /examples/retrieval/ @ggerganov -/examples/save-load-state/ @ggerganov /examples/speculative-simple/ @ggerganov /examples/speculative/ @ggerganov /ggml/cmake/ @ggerganov @@ -107,7 +107,7 @@ /tools/rpc/ @ggml-org/ggml-rpc /tools/server/* @ggml-org/llama-server # no subdir /tools/server/tests/ @ggml-org/llama-server -/tools/server/webui/ @ggml-org/llama-webui +/tools/ui/ @ggml-org/llama-ui /tools/tokenize/ @ggerganov /tools/tts/ @ggerganov /vendor/ @ggerganov diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 99504f14f31..6881a4d3ab3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -63,6 +63,7 @@ After submitting your PR: - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules - Let other maintainers merge their own PRs - When merging a PR, make sure you have a good understanding of the changes +- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you) Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions: diff --git a/README.md b/README.md index a0c14b9d7f0..dbe2c363a56 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ LLM inference in C/C++ - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/). ---- @@ -280,7 +281,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Metal](docs/build.md#metal-build) | Apple Silicon | | [BLAS](docs/build.md#blas-build) | All | | [BLIS](docs/backend/BLIS.md) | All | -| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [SYCL](docs/backend/SYCL.md) | Intel GPU | | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs | | [MUSA](docs/build.md#musa) | Moore Threads GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU | @@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [CANN](docs/build.md#cann) | Ascend NPU | | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU | | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE | -| [WebGPU [In Progress]](docs/build.md#webgpu) | All | +| [WebGPU](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | | [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon | | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR | diff --git a/SECURITY.md b/SECURITY.md index 3a8d07f6443..a98b8e70bd8 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -12,16 +12,16 @@ ## Reporting a vulnerability +> [!IMPORTANT] +> The private security disclosure program is disabled until further notice. Please submit patches with fixes directly to the repo as public PRs. Emails will be ignored. + If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new). A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. -> [!IMPORTANT] -> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080 - -## Requirements +### Requirements Before submitting your report, ensure you meet the following requirements: @@ -31,7 +31,7 @@ Before submitting your report, ensure you meet the following requirements: Maintainers reserve the right to close the report if these requirements are not fulfilled. -## Covered Topics +### Covered Topics Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues. diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 17e51c385ea..d917d3e26ad 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -0.1.1 +0.1.2 diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt new file mode 100644 index 00000000000..3ce503955b3 --- /dev/null +++ b/app/CMakeLists.txt @@ -0,0 +1,31 @@ +set(TARGET llama-app) + +add_executable(${TARGET} llama.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama) + +target_link_libraries(${TARGET} PRIVATE + llama-server-impl + llama-cli-impl + llama-completion-impl + llama-bench-impl + llama-batched-bench-impl + llama-fit-params-impl + llama-quantize-impl + llama-perplexity-impl +) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +# Automatically add all files from the 'licenses' directory +file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*") + +foreach(FILE_PATH ${EXTRA_LICENSES}) + get_filename_component(FILE_NAME "${FILE_PATH}" NAME) + string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}") + license_add_file("${NAME}" "${FILE_PATH}") +endforeach() + +license_generate(${TARGET}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/app/llama.cpp b/app/llama.cpp new file mode 100644 index 00000000000..30b09f9ef7e --- /dev/null +++ b/app/llama.cpp @@ -0,0 +1,127 @@ +#include "build-info.h" + +#include <cstdio> +#include <cstdlib> +#include <string> +#include <vector> + +// embedded data generated by cmake +extern const char * LICENSES[]; + +// visible +int llama_server(int argc, char ** argv); +int llama_cli(int argc, char ** argv); + +// hidden +int llama_completion(int argc, char ** argv); +int llama_bench(int argc, char ** argv); +int llama_batched_bench(int argc, char ** argv); +int llama_fit_params(int argc, char ** argv); +int llama_quantize(int argc, char ** argv); +int llama_perplexity(int argc, char ** argv); + +// hands the update over to the install script, which downloads and swaps the binary +static int llama_update(int argc, char ** argv) { + (void) argc; + (void) argv; + +#if defined(_WIN32) + return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\""); +#else + return system("curl -fsSL https://llama.app/install.sh | sh"); +#endif +} + +static const char * progname; + +static int help(int argc, char ** argv); +static int version(int argc, char ** argv); +static int licenses(int argc, char ** argv); + +struct command { + const char * name; + const char * desc; + std::vector<std::string> aliases; + bool hidden; + int (*func)(int, char **); +}; + +static const command cmds[] = { + {"serve", "HTTP API server", {"server"}, false, llama_server }, + {"cli", "Command-line interactive interface", {"client"}, false, llama_cli }, + {"update", "Update llama to the latest release", {}, false, llama_update }, + {"completion", "Text completion", {"complete"}, true, llama_completion }, + {"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench }, + {"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench}, + {"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params }, + {"quantize", "Quantize a model", {}, true, llama_quantize }, + {"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity }, + {"version", "Show version", {}, false, version }, + {"licenses", "Show third-party licenses", {"credits"}, false, licenses }, + {"help", "Show available commands", {}, false, help }, +}; + +static int version(int argc, char ** argv) { + printf("%s\n", llama_build_info()); + return 0; +} + +static int licenses(int argc, char ** argv) { + for (int i = 0; LICENSES[i]; ++i) { + printf("%s\n", LICENSES[i]); + } + return 0; +} + +static int help(int argc, char ** argv) { + const bool show_all = argc >= 2 && std::string(argv[1]) == "all"; + + printf("Usage: %s <command> [options]\n\nAvailable commands:\n", progname); + + for (const auto & cmd : cmds) { + if (show_all || !cmd.hidden) { + printf(" %-15s %s\n", cmd.name, cmd.desc); + } + } + printf("\n"); + + if (!show_all) { + printf("Run '%s help all' to show additional commands.\n", progname); + } + printf("Run '%s <command> --help' for command-specific usage.\n", progname); + + return 0; +} + +static bool matches(const std::string & arg, const command & cmd) { + if (arg == cmd.name) { + return true; + } + for (const auto & alias : cmd.aliases) { + if (arg == alias) { + return true; + } + } + return false; +} + +int main(int argc, char ** argv) { + progname = argv[0]; + + const std::string arg = argc >= 2 ? argv[1] : "help"; + + for (const auto & cmd : cmds) { + if (matches(arg, cmd)) { + // keep cmd.name so the router's child processes re-invoke correctly +#ifdef _WIN32 + _putenv_s("LLAMA_APP_CMD", cmd.name); +#else + setenv("LLAMA_APP_CMD", cmd.name, 1); +#endif + return cmd.func(argc - 1, argv + 1); + } + } + + fprintf(stderr, "error: unknown command '%s'\n", arg.c_str()); + return 1; +} diff --git a/build-xcframework.sh b/build-xcframework.sh index c25a1ef28c1..5d289922a84 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -7,6 +7,8 @@ VISIONOS_MIN_OS_VERSION=1.0 TVOS_MIN_OS_VERSION=16.4 BUILD_SHARED_LIBS=OFF +LLAMA_BUILD_APP=OFF +LLAMA_BUILD_COMMON=OFF LLAMA_BUILD_EXAMPLES=OFF LLAMA_BUILD_TOOLS=OFF LLAMA_BUILD_TESTS=OFF @@ -31,6 +33,8 @@ COMMON_CMAKE_ARGS=( -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} + -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP} + -DLLAMA_BUILD_COMMON=${LLAMA_BUILD_COMMON} -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS} -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} @@ -414,7 +418,7 @@ cmake -B build-ios-sim -G Xcode \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ -S . -cmake --build build-ios-sim --config Release -- -quiet +cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet echo "Building for iOS devices..." cmake -B build-ios-device -G Xcode \ @@ -428,7 +432,7 @@ cmake -B build-ios-device -G Xcode \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ -S . -cmake --build build-ios-device --config Release -- -quiet +cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet echo "Building for macOS..." cmake -B build-macos -G Xcode \ @@ -439,7 +443,7 @@ cmake -B build-macos -G Xcode \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ -S . -cmake --build build-macos --config Release -- -quiet +cmake --build build-macos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet echo "Building for visionOS..." cmake -B build-visionos -G Xcode \ @@ -454,7 +458,7 @@ cmake -B build-visionos -G Xcode \ -DLLAMA_OPENSSL=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -S . -cmake --build build-visionos --config Release -- -quiet +cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet echo "Building for visionOS simulator..." cmake -B build-visionos-sim -G Xcode \ @@ -469,7 +473,7 @@ cmake -B build-visionos-sim -G Xcode \ -DLLAMA_OPENSSL=OFF \ -DLLAMA_BUILD_SERVER=OFF \ -S . -cmake --build build-visionos-sim --config Release -- -quiet +cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet # Add tvOS builds (might need the same u_int definitions as watchOS and visionOS) echo "Building for tvOS simulator..." @@ -485,7 +489,7 @@ cmake -B build-tvos-sim -G Xcode \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ -S . -cmake --build build-tvos-sim --config Release -- -quiet +cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet echo "Building for tvOS devices..." cmake -B build-tvos-device -G Xcode \ @@ -500,7 +504,7 @@ cmake -B build-tvos-device -G Xcode \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ -S . -cmake --build build-tvos-device --config Release -- -quiet +cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet # Setup frameworks and copy binaries and headers echo "Setting up framework structures..." diff --git a/ci/run.sh b/ci/run.sh index 529da07779f..e4a34ff0acd 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -66,6 +66,8 @@ fi if [ ! -z ${GG_BUILD_METAL} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" +else + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF" fi if [ ! -z ${GG_BUILD_CUDA} ]; then @@ -114,9 +116,12 @@ fi if [ ! -z ${GG_BUILD_VULKAN} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1" - # if on Mac, disable METAL if [[ "$OSTYPE" == "darwin"* ]]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF" + MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan" + MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake" + if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then + CMAKE_EXTRA="${CMAKE_EXTRA} -DSPIRV-Headers_DIR=${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers" + fi fi # Build shared libs on Windows @@ -127,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then fi if [ ! -z ${GG_BUILD_WEBGPU} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1" if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then if [ -z "${CMAKE_PREFIX_PATH}" ]; then @@ -161,6 +166,8 @@ fi if [ ! -z ${GG_BUILD_BLAS} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}" +else + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF" fi if [ ! -z ${GG_BUILD_OPENVINO} ]; then @@ -232,7 +239,7 @@ function gg_run_ctest_debug { (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -455,10 +462,10 @@ function gg_run_qwen3_0_6b { (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -694,8 +701,8 @@ function gg_sum_test_backend_ops_cpu { ## main -export LLAMA_LOG_PREFIX=1 -export LLAMA_LOG_TIMESTAMPS=1 +export LLAMA_ARG_LOG_PREFIX=1 +export LLAMA_ARG_LOG_TIMESTAMPS=1 if [ -z ${GG_BUILD_LOW_PERF} ]; then # Create symlink: ./llama.cpp/models-mnt -> $MNT/models diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index 90cbec5b6f1..b4defc76ff0 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -7,7 +7,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") -set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") +set(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake) diff --git a/common/arg.cpp b/common/arg.cpp index 0a551349f36..d7c481b8f57 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -4,7 +4,6 @@ #include "chat.h" #include "common.h" #include "download.h" -#include "hf-cache.h" #include "json-schema-to-grammar.h" #include "log.h" #include "sampling.h" @@ -51,8 +50,6 @@ #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 -extern const char * LICENSES[]; - using json = nlohmann::ordered_json; using namespace common_arg_utils; @@ -337,11 +334,13 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa struct handle_model_result { bool found_mmproj = false; common_params_model mmproj; + + bool found_mtp = false; + common_params_model mtp; }; static handle_model_result common_params_handle_model(struct common_params_model & model, - const std::string & bearer_token, - bool offline) { + const common_download_opts & opts) { handle_model_result result; if (!model.docker_repo.empty()) { @@ -353,10 +352,9 @@ static handle_model_result common_params_handle_model(struct common_params_model model.hf_file = model.path; model.path = ""; } - common_download_opts opts; - opts.bearer_token = bearer_token; - opts.offline = offline; - auto download_result = common_download_model(model, opts, true); + common_download_opts hf_opts = opts; + hf_opts.download_mmproj = true; // also look for mmproj when downloading hf model + auto download_result = common_download_model(model, hf_opts); if (download_result.model_path.empty()) { throw std::runtime_error("failed to download model from Hugging Face"); @@ -369,6 +367,11 @@ static handle_model_result common_params_handle_model(struct common_params_model result.found_mmproj = true; result.mmproj.path = download_result.mmproj_path; } + + if (!download_result.mtp_path.empty()) { + result.found_mtp = true; + result.mtp.path = download_result.mtp_path; + } } else if (!model.url.empty()) { if (model.path.empty()) { auto f = string_split<std::string>(model.url, '#').front(); @@ -376,9 +379,6 @@ static handle_model_result common_params_handle_model(struct common_params_model model.path = fs_get_cache_file(string_split<std::string>(f, '/').back()); } - common_download_opts opts; - opts.bearer_token = bearer_token; - opts.offline = offline; auto download_result = common_download_model(model, opts); if (download_result.model_path.empty()) { throw std::runtime_error("failed to download model from " + model.url); @@ -435,23 +435,49 @@ static bool parse_bool_value(const std::string & value) { // CLI argument parsing functions // -void common_params_handle_models(common_params & params, llama_example curr_ex) { - auto res = common_params_handle_model(params.model, params.hf_token, params.offline); - if (params.no_mmproj) { - params.mmproj = {}; - } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { - // optionally, handle mmproj model when -hf is specified - params.mmproj = res.mmproj; - } - // only download mmproj if the current example is using it - for (const auto & ex : mmproj_examples) { - if (curr_ex == ex) { - common_params_handle_model(params.mmproj, params.hf_token, params.offline); - break; +bool common_params_handle_models(common_params & params, llama_example curr_ex) { + const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(), + params.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end(); + + common_download_opts opts; + opts.bearer_token = params.hf_token; + opts.offline = params.offline; + opts.skip_download = params.skip_download; + opts.download_mtp = spec_type_draft_mtp; + + try { + auto res = common_params_handle_model(params.model, opts); + if (params.no_mmproj) { + params.mmproj = {}; + } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + // optionally, handle mmproj model when -hf is specified + params.mmproj = res.mmproj; + } + // only download mmproj if the current example is using it + for (const auto & ex : mmproj_examples) { + if (curr_ex == ex) { + common_params_handle_model(params.mmproj, opts); + break; + } + } + + // when --spec-type mtp is set and no draft model was provided explicitly, + // fall back to the MTP head discovered alongside the -hf model + if (spec_type_draft_mtp && res.found_mtp && + params.speculative.draft.mparams.path.empty() && + params.speculative.draft.mparams.hf_repo.empty() && + params.speculative.draft.mparams.url.empty()) { + params.speculative.draft.mparams.path = res.mtp.path; } + common_params_handle_model(params.speculative.draft.mparams, opts); + common_params_handle_model(params.vocoder.model, opts); + return true; + } catch (const common_skip_download_exception &) { + return false; + } catch (const std::exception &) { + throw; } - common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline); - common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); } static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { @@ -516,7 +542,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } if (!seen_args.insert(arg).second) { - LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + const bool skip = (arg == "--spec-type"); + + if (!skip) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } } auto & tmp = arg_to_options[arg]; auto opt = *tmp.first; @@ -565,12 +595,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // parse the first time to get -hf option (used for remote preset) parse_cli_args(); - // TODO: Remove later - try { - hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, params.offline); - } catch (const std::exception & e) { - LOG_WRN("HF cache migration failed: %s\n", e.what()); - } // export_graph_ops loads only metadata const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; @@ -879,7 +903,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } if (!seen_args.insert(arg).second) { - LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + const bool skip = (arg == "--spec-type"); + + if (!skip) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } } auto opt = *arg_to_options[arg]; std::string val; @@ -1013,11 +1041,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex // we define here to make sure it's included in llama-gen-docs if (ex == LLAMA_EXAMPLE_COMPLETION) { params.use_jinja = false; // disable jinja by default - } else if (ex == LLAMA_EXAMPLE_MTMD) { params.use_jinja = false; // disable jinja by default params.sampling.temp = 0.2; // lower temp by default for better quality - } else if (ex == LLAMA_EXAMPLE_SERVER) { params.n_parallel = -1; // auto by default } @@ -1038,7 +1064,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex sampler_type_names.pop_back(); // remove last semicolon } - /** * filter options by example * rules: @@ -1052,7 +1077,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } }; - add_opt(common_arg( {"-h", "--help", "--usage"}, "print usage and exit", @@ -1069,16 +1093,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex exit(0); } )); - add_opt(common_arg( - {"--license"}, - "show source code license and dependencies", - [](common_params &) { - for (int i = 0; LICENSES[i]; ++i) { - printf("%s\n", LICENSES[i]); - } - exit(0); - } - )); add_opt(common_arg( {"-cl", "--cache-list"}, "show list of models in cache", @@ -1312,12 +1326,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-cpent", "--checkpoint-every-n-tokens"}, "N", - string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt), + {"-cms", "--checkpoint-min-step"}, "N", + string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step), [](common_params & params, int value) { - params.checkpoint_every_nt = value; + if (value < 0) { + throw std::invalid_argument("checkpoint-min-step must be non-negative"); + } + params.checkpoint_min_step = value; } - ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + ).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cram", "--cache-ram"}, "N", string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)" @@ -2806,7 +2823,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, int value) { params.embd_normalize = value; } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG})); + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG})); add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)", @@ -2863,28 +2880,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.api_prefix = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); + // Deprecated: use --ui-config instead (kept for backward compat) add_opt(common_arg( {"--webui-config"}, "JSON", - "JSON that provides default WebUI settings (overrides WebUI defaults)", + "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)", [](common_params & params, const std::string & value) { + params.ui_config_json = value; params.webui_config_json = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG")); + + add_opt(common_arg( + {"--ui-config"}, "JSON", + "JSON that provides default UI settings (overrides UI defaults)", + [](common_params & params, const std::string & value) { + params.ui_config_json = value; + params.webui_config_json = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG")); + + // Deprecated: use --ui-config-file instead (kept for backward compat) add_opt(common_arg( {"--webui-config-file"}, "PATH", - "JSON file that provides default WebUI settings (overrides WebUI defaults)", + "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)", [](common_params & params, const std::string & value) { - params.webui_config_json = read_file(value); + params.ui_config_json = read_file(value); + params.webui_config_json = params.ui_config_json; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE")); + + add_opt(common_arg( + {"--ui-config-file"}, "PATH", + "JSON file that provides default UI settings (overrides UI defaults)", + [](common_params & params, const std::string & value) { + params.ui_config_json = read_file(value); + params.webui_config_json = params.ui_config_json; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE")); + + // Deprecated: use --ui-mcp-proxy instead (kept for backward compat) add_opt(common_arg( {"--webui-mcp-proxy"}, {"--no-webui-mcp-proxy"}, - string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"), + "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy", [](common_params & params, bool value) { + params.ui_mcp_proxy = value; params.webui_mcp_proxy = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY")); + + add_opt(common_arg( + {"--ui-mcp-proxy"}, + {"--no-ui-mcp-proxy"}, + "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)", + [](common_params & params, bool value) { + params.ui_mcp_proxy = value; + params.webui_mcp_proxy = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY")); add_opt(common_arg( {"--tools"}, "TOOL1,TOOL2,...", "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n" @@ -2894,14 +2947,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.server_tools = parse_csv_row(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS")); + // Deprecated: use --ui/--no-ui instead (kept for backward compat) add_opt(common_arg( {"--webui"}, {"--no-webui"}, - string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), + "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI", [](common_params & params, bool value) { + params.ui = value; params.webui = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI")); + + add_opt(common_arg( + {"--ui"}, + {"--no-ui"}, + string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.ui = value; + params.webui = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), @@ -2944,7 +3009,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } key_file.close(); } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE")); add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", @@ -2972,7 +3037,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.default_template_kwargs[item.key()] = item.value().dump(); } } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), @@ -3273,7 +3338,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params &, const std::string & value) { common_log_set_file(common_log_main(), value.c_str()); } - ).set_env("LLAMA_LOG_FILE")); + ).set_env("LLAMA_ARG_LOG_FILE")); add_opt(common_arg( {"--log-colors"}, "[on|off|auto]", "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n" @@ -3290,7 +3355,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("error: unknown value for --log-colors: '%s'\n", value.c_str())); } } - ).set_env("LLAMA_LOG_COLORS")); + ).set_env("LLAMA_ARG_LOG_COLORS")); add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", @@ -3305,7 +3370,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.offline = true; } - ).set_env("LLAMA_OFFLINE")); + ).set_env("LLAMA_ARG_OFFLINE")); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n" @@ -3313,13 +3378,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex " - 1: error\n" " - 2: warning\n" " - 3: info\n" - " - 4: debug\n" + " - 4: trace (more info)\n" + " - 5: debug\n" "(default: %d)\n", params.verbosity), [](common_params & params, int value) { params.verbosity = value; common_log_set_verbosity_thold(value); } - ).set_env("LLAMA_LOG_VERBOSITY")); + ).set_env("LLAMA_ARG_LOG_VERBOSITY")); add_opt(common_arg( {"--log-prefix"}, {"--no-log-prefix"}, @@ -3539,6 +3605,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.draft.p_min = std::stof(value); } ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN")); + add_opt(common_arg( + {"--spec-draft-backend-sampling"}, + {"--no-spec-draft-backend-sampling"}, + string_format("offload draft sampling to the backend (default: %s)", + params.speculative.draft.backend_sampling ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.speculative.draft.backend_sampling = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING")); add_opt(common_arg( {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" @@ -3579,8 +3654,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("comma-separated list of types of speculative decoding to use (default: %s)\n", common_speculative_type_name_str(params.speculative.types).c_str()), [](common_params & params, const std::string & value) { - const auto enabled_types = string_split<std::string>(value, ','); - params.speculative.types = common_speculative_types_from_names(enabled_types); + const auto types_str = string_split<std::string>(value, ','); + auto types = common_speculative_types_from_names(types_str); + params.speculative.types.insert(params.speculative.types.end(), types.begin(), types.end()); } ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE")); add_opt(common_arg( @@ -4069,10 +4145,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--spec-default"}, string_format("enable default speculative decoding config"), [](common_params & params) { - params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD }; + params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MOD); params.speculative.ngram_mod.n_match = 24; params.speculative.ngram_mod.n_min = 48; params.speculative.ngram_mod.n_max = 64; + + // TODO: not sure if this is a good config - explore more settings and potentially enable it + //params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); + //params.speculative.ngram_map_k4v.size_n = 8; + //params.speculative.ngram_map_k4v.size_m = 24; + //params.speculative.ngram_map_k4v.min_hits = 2; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); diff --git a/common/arg.h b/common/arg.h index 2a85f09f3eb..0010f2a9ac9 100644 --- a/common/arg.h +++ b/common/arg.h @@ -129,8 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com // see: https://github.com/ggml-org/llama.cpp/issues/18163 void common_params_add_preset_options(std::vector<common_arg> & args); -// Populate model paths (main model, mmproj, etc) from -hf if necessary -void common_params_handle_models(common_params & params, llama_example curr_ex); +// populate model paths (main model, mmproj, etc) from -hf if necessary +// return true if the model is ready to use +// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc) +// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed) +bool common_params_handle_models(common_params & params, llama_example curr_ex); // initialize argument parser context - used by test-arg-parser and preset common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index 6021fc4ede5..db3a6cc6fe3 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -43,11 +43,33 @@ common_chat_params peg_generator::generate_parser(const common_chat_template & const autoparser & autoparser) { // Create the result structure common_chat_params data; - data.prompt = common_chat_template_direct_apply(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.preserved_tokens = autoparser.preserved_tokens; + data.prompt = common_chat_template_direct_apply(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.preserved_tokens = autoparser.preserved_tokens; + + std::string parser_generation_prompt = data.generation_prompt; + + if (inputs.continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !inputs.continue_msg.empty()) { + // Build up generation prompt manually + const auto & msg = inputs.continue_msg; + + if (!autoparser.reasoning.start.empty()) { + data.generation_prompt = data.generation_prompt.substr(0, data.generation_prompt.find(autoparser.reasoning.start)); + data.generation_prompt += autoparser.reasoning.start + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += autoparser.reasoning.end; + } + } + + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += msg.render_content(); + } + + data.prompt += data.generation_prompt; + } - auto parser = autoparser.build_parser(inputs); + auto parser = autoparser.build_parser(inputs, parser_generation_prompt); data.parser = parser.save(); // Build grammar if tools are present @@ -87,7 +109,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template & return data; } -common_peg_arena autoparser::build_parser(const generation_params & inputs) const { +common_peg_arena autoparser::build_parser(const generation_params & inputs, const std::string & generation_prompt) const { if (!analysis_complete) { throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)"); } @@ -121,7 +143,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons } else { parser = content.build_parser(ctx); } - return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser; + return pure_content ? p.prefix(generation_prompt, reasoning.start) + parser : p.prefix(generation_prompt, reasoning.start) << parser; }); } diff --git a/common/chat-auto-parser-helpers.cpp b/common/chat-auto-parser-helpers.cpp index 2499464cd82..81b17e5e1d2 100644 --- a/common/chat-auto-parser-helpers.cpp +++ b/common/chat-auto-parser-helpers.cpp @@ -310,6 +310,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm namespace autoparser { +static const std::string ERR_TMPL = "#**ERROR**#"; + std::string apply_template(const common_chat_template & tmpl, const template_params & params) { generation_params tmpl_params; tmpl_params.messages = params.messages; @@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par return common_chat_template_direct_apply(tmpl, tmpl_params); } catch (const std::exception & e) { LOG_DBG("Template application failed: %s\n", e.what()); - return ""; + return ERR_TMPL; } } @@ -347,7 +349,7 @@ std::optional<compare_variants_result> compare_variants( std::string output_B = apply_template(tmpl, params_B); // Check for template application failures - if (output_A.empty() || output_B.empty()) { + if (output_A == ERR_TMPL || output_B == ERR_TMPL) { return std::nullopt; } diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h index 6c547409760..7858f6572f2 100644 --- a/common/chat-auto-parser.h +++ b/common/chat-auto-parser.h @@ -60,16 +60,21 @@ struct generation_params { common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO; bool stream = true; std::string grammar; - bool add_generation_prompt = false; - bool enable_thinking = true; - std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); - std::string generation_prompt; + bool add_generation_prompt = false; + common_chat_continuation continue_final_message = COMMON_CHAT_CONTINUATION_NONE; + common_chat_msg continue_msg; + bool enable_thinking = true; + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); json extra_context; bool add_bos = false; bool add_eos = false; bool is_inference = true; bool add_inference = false; bool mark_input = true; // whether to mark input strings in the jinja context + + bool has_continuation() const { + return continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !continue_msg.empty(); + } }; // ============================================================================ @@ -372,6 +377,8 @@ struct analyze_tools : analyze_base { struct autoparser { jinja::caps jinja_caps; + std::string user_start; + std::string assistant_start; analyze_reasoning reasoning; analyze_content content; analyze_tools tools; @@ -382,11 +389,15 @@ struct autoparser { autoparser() = default; + // Find the starting marker for the user message and assistant message + std::string detect_user_start_marker(const common_chat_template & tmpl); + std::string detect_assistant_start_marker(const common_chat_template & tmpl); + // Run full differential analysis on a template void analyze_template(const common_chat_template & tmpl); // Build the PEG parser for this template - common_peg_arena build_parser(const generation_params & inputs) const; + common_peg_arena build_parser(const generation_params & inputs, const std::string & generation_prompt) const; private: // Collect tokens from entire analysis to preserve diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 9c7c9678acd..0875c5347f4 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -8,6 +8,9 @@ #include "peg-parser.h" #include <algorithm> +#include <cctype> +#include <ostream> +#include <sstream> #define ANSI_RESET "\033[0m" #define ANSI_PURPLE "\033[1m\x1b[38;5;126m" @@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S"; static const std::string ARG_FIRST = "AA_ARG_FST_AA"; static const std::string ARG_SECOND = "BB_ARG_SND_BB"; static const std::string USER_MSG = "U_USER_MSG Hello END_U"; +static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V"; static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A"; static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R"; static const std::string CALL_ID_001 = "call00001"; @@ -71,6 +75,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar analysis.content.end = "<|END_OF_TURN_TOKEN|>"; analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>"); analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>"); + analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>"; LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET); } }, @@ -108,7 +113,59 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar analysis.tools.function.close = "```"; LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET); } - } + }, + // Nemotron Nano v2 + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos && + tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) { + + analysis.tools.format.mode = tool_format::JSON_NATIVE; + analysis.tools.format.section_start = ""; + analysis.tools.format.section_end = ""; + analysis.tools.format.per_call_start = "<TOOLCALL>"; + analysis.tools.format.per_call_end = "</TOOLCALL>"; + analysis.content.mode = content_mode::PLAIN; + analysis.content.start = ""; + analysis.content.end = ""; + analysis.reasoning.mode = reasoning_mode::TAG_BASED; + analysis.reasoning.start = "<think>\n\n"; + analysis.reasoning.end = "</think>"; + analysis.assistant_start = "<SPECIAL_11>Assistant"; + analysis.user_start = "<SPECIAL_11>User"; + analysis.preserved_tokens.clear(); + analysis.preserved_tokens.push_back("<SPECIAL_12>"); + analysis.preserved_tokens.push_back("<SPECIAL_11>"); + analysis.preserved_tokens.push_back("</think>"); + analysis.preserved_tokens.push_back("<TOOLCALL>"); + analysis.preserved_tokens.push_back("</TOOLCALL>"); + LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET); + } + }, + // Fireworks + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'" + " + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) { + analysis.assistant_start = "<|start_header_id|>assistant<|end_header_id|>"; + analysis.user_start = "<|start_header_id|>user<|end_header_id|>"; + LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET); + } + }, + // Solar Open + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) { + analysis.assistant_start = "<|begin|>assistant"; + LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET); + } + }, + // Apriel 1.6 + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) { + analysis.user_start = "<|begin_user|>"; + analysis.assistant_start = "<|begin_assistant|>"; + LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET); + } + }, + }); // Common JSON structures @@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) { reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls); content = analyze_content(tmpl, reasoning); tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools()); + assistant_start = detect_assistant_start_marker(tmpl); + user_start = detect_user_start_marker(tmpl); collect_preserved_tokens(); for (auto & workaround : workarounds) { @@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) { } LOG_DBG("\n--- Reasoning & Content Structure ---\n"); + LOG_DBG("user_msg_start: %s\n", user_start.c_str()); + LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str()); LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str()); LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str()); LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str()); @@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() { add_token(tools.call_id.suffix); } +std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) { + json user_msg = json{ + { "role", "user" }, + { "content", USER_MSG } + }; + + json assistant_no_reasoning = json{ + { "role", "assistant" }, + { "content", ASSISTANT_MSG } + }; + + template_params params; + params.messages = json::array({ user_msg }); + params.add_generation_prompt = false; + params.enable_thinking = true; + + auto comparison = compare_variants( + tmpl, params, [&](template_params & p) { + p.messages = json::array({ user_msg, assistant_no_reasoning }); + } + ); + + if (!comparison) { + LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__); + return ""; + } + + auto usermsg = comparison->diff.right; + if (usermsg.find(ASSISTANT_MSG) == std::string::npos) { + LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__); + } + + auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG)); + if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) { + ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start))); + } + if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) { + ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end))); + } + return trim_whitespace(ast_prefix); +} + +std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) { + json user_msg = json{ + { "role", "user" }, + { "content", USER_MSG } + }; + + json assistant = json{ + { "role", "assistant" }, + { "content", ASSISTANT_MSG } + }; + + json user_msg_two = json{ + { "role", "user" }, + { "content", USER_MSG_TWO } + }; + + template_params params; + params.messages = json::array({}); + params.add_generation_prompt = false; + params.enable_thinking = true; + + auto comparison = compare_variants( + tmpl, params, [&](template_params & p) { + p.messages = json::array({ user_msg }); + } + ); + + if (!comparison) { + LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__); + params.messages = json::array({ user_msg_two, assistant }); + comparison = compare_variants( + tmpl, params, [&](template_params & p) { + p.messages = json::array({ user_msg_two, assistant, user_msg }); + } + ); + if (!comparison) { + LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__); + return ""; + } + } + + auto usermsg = comparison->diff.right; + if (usermsg.find(USER_MSG) == std::string::npos) { + LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__); + } + + if (usermsg.find(ASSISTANT_MSG) != std::string::npos) { + usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size()); + } + + auto candidate = usermsg.substr(0, usermsg.find(USER_MSG)); + auto candidate_split = segmentize_markers(candidate); + std::stringstream result; + bool encountered_marker = false; + for (const auto & mrk : candidate_split) { + std::string lower_mrk = std::string(mrk.value); + std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(), + [](unsigned char c) { return std::tolower(c); }); + // heuristic to weed out potential end markers, but only at the start + if (mrk.type == segment_type::MARKER && !encountered_marker && + (lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) { + continue; + } + if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) { + continue; + } + encountered_marker |= mrk.type == segment_type::MARKER; + result << mrk.value; + } + return trim_whitespace(result.str()); +} + analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools) : analyze_base(tmpl) { LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET); diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index a4818859a6d..12e747d1ca1 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -358,35 +358,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) { if (is_potential_container) { value_content = normalize_container_value(value_content); } - - // Try to parse as JSON value (number, bool, null, object, array) - try { - ordered_json parsed = ordered_json::parse(value_content); - if (parsed.is_string()) { - // Don't add closing quote yet (added by arg_close) for monotonic streaming - std::string escaped = parsed.dump(); - if (!escaped.empty() && escaped.back() == '"') { - escaped.pop_back(); - } - value_to_add = escaped; - closing_quote_pending = true; - } else { - // Non-string values: use raw content to preserve whitespace for monotonicity - value_to_add = value_content; - } - } catch (...) { - if (node.is_partial && is_potential_container) { - // Partial container: pass through the already-normalized content - value_to_add = value_content; - } else { - // Not valid JSON - treat as string value - if (!closing_quote_pending) { - value_to_add = "\""; - closing_quote_pending = true; - } - value_to_add += escape_json_string_inner(value_content); - } - } + value_to_add += value_content; } args_target() += value_to_add; @@ -813,7 +785,7 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s if (delimiter.empty()) { return literal(s); } - return literal(s.substr(0, s.rfind(delimiter))); + return literal(s.substr(0, s.find(delimiter))); } common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) { diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h index c684d773564..be92f17d909 100644 --- a/common/chat-peg-parser.h +++ b/common/chat-peg-parser.h @@ -90,7 +90,7 @@ class common_chat_peg_builder : public common_peg_parser_builder { // Use for schema-declared string types - won't be treated as potential JSON container common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); } - common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); } + common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); } // Return a parser that parses the prefix of a string, up to a given delimiter. diff --git a/common/chat.cpp b/common/chat.cpp index 70b9f5dc2c5..ef151691c38 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -70,6 +70,65 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) { return !msg.content.empty() || !msg.tool_calls.empty(); } +std::string common_chat_msg::render_content(const std::string & delimiter) const { + if (!content.empty() && !content_parts.empty()) { + throw std::runtime_error("Cannot specify both content and content_parts"); + } + if (!content.empty()) { + return content; + } + + std::string text; + for (const auto & part : content_parts) { + if (part.type == "text") { + if (!text.empty()) { + text += delimiter; + } + text += part.text; + } + } + return text; +} + +std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) { + if (delims.empty() || prompt.empty()) { + return {}; + } + + auto parser = build_peg_parser([&](common_peg_parser_builder & p) { + std::vector<std::string> all_delims; + std::vector<common_peg_parser> tagged_messages; + + all_delims.reserve(delims.size()); + tagged_messages.reserve(delims.size()); + for (const auto & d : delims) { + all_delims.push_back(d.delimiter); + } + + auto any_delim = p.until_one_of(all_delims); + for (const auto & d : delims) { + tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim)); + } + + return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end(); + }); + + common_peg_parse_context ctx(prompt); + const auto result = parser.parse(ctx); + if (!result.success()) { + return {}; + } + + std::vector<common_chat_msg_span> spans; + ctx.ast.visit(result, [&](const common_peg_ast_node & node) { + if (!node.tag.empty()) { + spans.push_back({ node.tag, node.start, node.end - node.start }); + } + }); + + return spans; +} + json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const { if (!content.empty() && !content_parts.empty()) { throw std::runtime_error("Cannot specify both content and content_parts"); @@ -451,6 +510,22 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too return result; } +common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value) { + if (value.is_boolean() && value.get<bool>()) { + return COMMON_CHAT_CONTINUATION_AUTO; + } + if (value.is_string()) { + auto value_str = value.get<std::string>(); + if (value_str == "reasoning_content") { + return COMMON_CHAT_CONTINUATION_REASONING; + } + if (value_str == "content") { + return COMMON_CHAT_CONTINUATION_CONTENT; + } + } + return COMMON_CHAT_CONTINUATION_NONE; +} + bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { if (use_jinja) { try { @@ -811,6 +886,36 @@ std::string common_chat_template_direct_apply( return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt); } +static std::string common_chat_template_generation_prompt_impl( + const common_chat_template & tmpl, + const autoparser::generation_params & inputs, + const std::optional<json> & messages_override = std::nullopt, + const std::optional<json> & tools_override = std::nullopt, + const std::optional<json> & additional_context = std::nullopt) { + + auto adjusted_messages = messages_override ? *messages_override : inputs.messages; + + autoparser::generation_params params = inputs; + params.add_generation_prompt = false; + params.continue_final_message = COMMON_CHAT_CONTINUATION_NONE; + std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context); + params.add_generation_prompt = true; + std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context); + + size_t prefix_len = 0; + size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size()); + while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) { + prefix_len++; + } + return gen_prompt.substr(prefix_len); +} + +std::string common_chat_template_generation_prompt( + const common_chat_template & tmpl, + const autoparser::generation_params & inputs) { + return common_chat_template_generation_prompt_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt); +} + static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const autoparser::generation_params & inputs) { common_chat_params data; @@ -863,6 +968,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_ data.thinking_start_tag = "[THINK]"; data.thinking_end_tag = "[/THINK]"; data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override = */ adjusted_messages); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.preserved_tokens = { "[THINK]", @@ -871,8 +977,19 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_ "[ARGS]", }; + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = "[THINK]" + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += "[/THINK]" + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } + auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { - auto generation_prompt = p.prefix(inputs.generation_prompt, "[THINK]"); + auto generation_prompt = p.eps(); auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps(); @@ -963,6 +1080,15 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp } data.prompt = prompt; + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages); + data.message_spans = common_chat_split_by_role(prompt, { + { "assistant", "<|start|>assistant" }, + { "user", "<|start|>user" }, + { "system", "<|start|>developer" }, + { "system", "<|start|>system" }, + { "tool", "<|start|>functions" }, + }); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = true; @@ -972,6 +1098,18 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp "<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>", }; + // Adjust prompt for continuation + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = "<|start|>assistant<|channel|>analysis<|message|>" + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += "<|end|><|start|>assistant<|channel|>final<|message|>" + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object(); auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE); @@ -1080,14 +1218,21 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ common_chat_params data; data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); if (inputs.add_generation_prompt && string_ends_with(data.prompt, "<turn|>\n")) { // This may happen if the model generates content + tool_call, the // template does not add the model's next turn and confuses the model // from emitting its proper reasoning token sequence. - data.prompt += "<|turn>model\n"; + data.generation_prompt = "<|turn>model\n"; + data.prompt += data.generation_prompt; } + data.message_spans = common_chat_split_by_role(data.prompt, { + { "user", "<|turn>user\n" }, + { "assistant", "<|turn>model\n" }, + }); + data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4; data.supports_thinking = true; data.thinking_start_tag = "<|channel>thought"; @@ -1101,13 +1246,25 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ "<|turn>", }; + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = string_ends_with(data.prompt, "<turn|>\n") ? "<|turn>model\n" : ""; + data.generation_prompt += "<|channel>thought\n" + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += "<channel|>" + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object(); auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE); auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { - auto start = p.rule("start", p.prefix(inputs.generation_prompt, "<|channel>")); + auto start = p.rule("start", p.optional(p.literal("<|turn>model\n"))); if (extract_reasoning) { p.rule("thought", p.literal("<|channel>thought") + p.space() + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>")); @@ -1224,15 +1381,22 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ const autoparser::generation_params & inputs) { common_chat_params data; - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.preserved_tokens = { + data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.preserved_tokens = { ">>>all", }; auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + data.generation_prompt = "<|start_header_id|>assistant<|end_header_id|>\n\n>>>all\n" + msg.render_content(); + data.prompt += data.generation_prompt; + } + auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { // Functionary v3.2 format: // - Normal content: >>>all\n{content} @@ -1244,7 +1408,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ // When no tools, content goes until end auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>")); auto content_until_end = p.literal("all\n") + p.content(p.rest()); - auto generation_prompt = p.literal(inputs.generation_prompt); + auto generation_prompt = p.literal("<|start_header_id|>assistant<|end_header_id|>\n\n>>>"); // If no tools or tool_choice is NONE, just parse content if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { @@ -1318,9 +1482,10 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp const autoparser::generation_params & inputs) { common_chat_params data; - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.supports_thinking = true; + data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.supports_thinking = true; data.preserved_tokens = { "<|tool_calls_section_begin|>", "<|tool_calls_section_end|>", @@ -1343,10 +1508,22 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp const std::string THINK_START = "<think>"; const std::string THINK_END = "</think>"; + const std::string GEN_PROMPT = "<|im_assistant|>assistant<|im_middle|>"; data.thinking_start_tag = THINK_START; data.thinking_end_tag = THINK_END; + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += THINK_END + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } + auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { // Kimi K2 Thinking format: // - Reasoning: <think>{reasoning}</think> @@ -1366,7 +1543,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp auto reasoning = extract_reasoning ? p.optional(THINK_START + p.reasoning( p.until_one_of({ THINK_END, "<|tool_calls_section_begin|>", "<|tool_call_begin|>" })) + p.optional(p.literal(THINK_END))) : p.eps(); - auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START); + auto generation_prompt = p.literal(GEN_PROMPT); // Content only parser (no tools) @@ -1442,6 +1619,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat common_chat_params data; data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = true; data.preserved_tokens = { @@ -1461,12 +1639,24 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat const std::string TOOL_CALL_END = "<|tool_call_end|>"; const std::string THINK_START = "<think>"; const std::string THINK_END = "</think>"; + const std::string GEN_PROMPT = "<|im_start|>assistant\n"; data.thinking_start_tag = THINK_START; data.thinking_end_tag = THINK_END; + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += THINK_END + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } + auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { - auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START); + auto generation_prompt = p.literal(GEN_PROMPT); auto end = p.end(); auto reasoning = p.eps(); @@ -1521,6 +1711,7 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ common_chat_params data; data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = true; data.preserved_tokens = { @@ -1536,12 +1727,24 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ const std::string THINK_START = "<think>"; const std::string THINK_END = "</think>"; + const std::string GEN_PROMPT = "<|im_start|>assistant\n"; data.thinking_start_tag = THINK_START; data.thinking_end_tag = THINK_END; + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += THINK_END + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } + auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { - auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START); + auto generation_prompt = p.literal(GEN_PROMPT); auto end = p.end(); auto reasoning = p.eps(); @@ -1592,6 +1795,7 @@ static common_chat_params common_chat_params_init_gigachat_v3( common_chat_params data; data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = false; data.preserved_tokens = { @@ -1599,6 +1803,12 @@ static common_chat_params common_chat_params_init_gigachat_v3( "<|role_sep|>\n", }; + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + data.generation_prompt = "assistant<|role_sep|>\n" + msg.render_content(); + data.prompt += data.generation_prompt; + } + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; const auto *tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n"; @@ -1634,7 +1844,7 @@ static common_chat_params common_chat_params_init_gigachat_v3( ret = p.content(p.rest()); } - return p.literal(inputs.generation_prompt) + ret; + return p.literal("assistant<|role_sep|>\n") + ret; }); data.parser = parser.save(); @@ -1662,12 +1872,13 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha const autoparser::generation_params & inputs) { common_chat_params data; - data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); - data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.supports_thinking = true; + data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.supports_thinking = true; data.thinking_start_tag = "<think>"; data.thinking_end_tag = "</think>"; - data.preserved_tokens = { + data.preserved_tokens = { "๏ฝœDSML๏ฝœ", "<think>", "</think>", @@ -1687,9 +1898,21 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha const std::string INVOKE_END = "</" + DSML + "invoke>"; const std::string PARAM_START = "<" + DSML + "parameter"; const std::string PARAM_END = "</" + DSML + "parameter>"; + const std::string GEN_PROMPT = "<๏ฝœAssistant๏ฝœ>"; + + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += THINK_END + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { - auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START); + auto generation_prompt = p.literal(GEN_PROMPT); auto end = p.end(); auto reasoning = p.eps(); @@ -2116,21 +2339,6 @@ std::optional<common_chat_params> common_chat_try_specialized_template( return std::nullopt; } -static std::string common_chat_templates_generation_prompt(const common_chat_template & tmpl, const autoparser::generation_params & inputs) { - autoparser::generation_params params = inputs; - params.add_generation_prompt = false; - std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params); - params.add_generation_prompt = true; - std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params); - - size_t prefix_len = 0; - size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size()); - while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) { - prefix_len++; - } - return gen_prompt.substr(prefix_len); -} - static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates * tmpls, const struct common_chat_templates_inputs & inputs) { autoparser::generation_params params; @@ -2149,6 +2357,27 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ params.add_bos = tmpls->add_bos; params.add_eos = tmpls->add_eos; + params.continue_final_message = inputs.continue_final_message; + if (params.continue_final_message != COMMON_CHAT_CONTINUATION_NONE) { + params.add_generation_prompt = false; + + if (!inputs.messages.empty()) { + // Render messages[:-1] and store continuation message separately + params.continue_msg = inputs.messages.back(); + params.messages.erase(params.messages.size() - 1); + } + + if (params.continue_final_message == COMMON_CHAT_CONTINUATION_AUTO && !inputs.messages.empty()) { + // Resolve based on message content + params.continue_final_message = COMMON_CHAT_CONTINUATION_CONTENT; + if (!params.continue_msg.reasoning_content.empty() && + params.continue_msg.content.empty() && + params.continue_msg.content_parts.empty()) { + params.continue_final_message = COMMON_CHAT_CONTINUATION_REASONING; + } + } + } + if (src.find("<|channel|>") == std::string::npos) { // map developer to system for all models except for GPT-OSS workaround::map_developer_role_to_system(params.messages); @@ -2169,8 +2398,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ workaround::func_args_not_string(params.messages); } - params.generation_prompt = common_chat_templates_generation_prompt(tmpl, params); - params.extra_context = common_chat_extra_context(); for (auto el : inputs.chat_template_kwargs) { params.extra_context[el.first] = json::parse(el.second); @@ -2200,17 +2427,16 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ auto params_copy = params; params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE; data.prompt = common_chat_template_direct_apply_impl(tmpl, params_copy); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, params); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; - data.generation_prompt = params.generation_prompt; - auto parser = build_chat_peg_parser([¶ms](common_chat_peg_builder &p) { - return p.prefix(params.generation_prompt) << p.content(p.rest()); + auto parser = build_chat_peg_parser([&data](common_chat_peg_builder &p) { + return p.literal(data.generation_prompt) << p.content(p.rest()); }); data.parser = parser.save(); return data; } if (auto result = common_chat_try_specialized_template(tmpl, src, params)) { - result->generation_prompt = params.generation_prompt; return *result; } @@ -2219,12 +2445,24 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ struct autoparser::autoparser autoparser; autoparser.analyze_template(tmpl); auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser); + + std::vector<common_chat_msg_delimiter> delimiters; + if (!autoparser.assistant_start.empty()) { + delimiters.push_back({ "assistant", autoparser.assistant_start }); + } + if (!autoparser.user_start.empty()) { + delimiters.push_back({ "user", autoparser.user_start }); + } + + if (!delimiters.empty()) { + auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters); + } + auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE; if (auto_params.supports_thinking) { auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start); auto_params.thinking_end_tag = trim_whitespace(autoparser.reasoning.end); } - auto_params.generation_prompt = params.generation_prompt; common_peg_arena arena; arena.load(auto_params.parser); LOG_DBG("%s: generated parser:\n%s\n\nparser generation prompt: %s\n", __func__, arena.dump(arena.root()).c_str(), auto_params.generation_prompt.c_str()); diff --git a/common/chat.h b/common/chat.h index 054f5ffe777..437cd7ca3b1 100644 --- a/common/chat.h +++ b/common/chat.h @@ -89,6 +89,8 @@ struct common_chat_msg { nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const; + std::string render_content(const std::string & delimiter = "\n\n") const; + bool empty() const { return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty(); @@ -141,6 +143,17 @@ struct common_chat_msg_diff { } }; +struct common_chat_msg_span { + std::string role; + std::size_t pos = 0; + std::size_t len = 0; +}; + +struct common_chat_msg_delimiter { + std::string role; + std::string delimiter; +}; + struct common_chat_tool { std::string name; std::string description; @@ -164,12 +177,22 @@ enum common_chat_format { COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats }; + +// Continuation method provided via `continue_final_message` +enum common_chat_continuation { + COMMON_CHAT_CONTINUATION_NONE, + COMMON_CHAT_CONTINUATION_AUTO, + COMMON_CHAT_CONTINUATION_REASONING, + COMMON_CHAT_CONTINUATION_CONTENT, +}; + struct common_chat_templates_inputs { std::vector<common_chat_msg> messages; std::string grammar; std::string json_schema; - bool add_generation_prompt = true; - bool use_jinja = true; + bool add_generation_prompt = true; + common_chat_continuation continue_final_message = COMMON_CHAT_CONTINUATION_NONE; + bool use_jinja = true; // Parameters below only supported when use_jinja is true std::vector<common_chat_tool> tools; common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; @@ -196,6 +219,7 @@ struct common_chat_params { std::vector<std::string> preserved_tokens; std::vector<std::string> additional_stops; std::string parser; + std::vector<common_chat_msg_span> message_spans; }; // per-message parsing syntax @@ -207,6 +231,8 @@ struct common_chat_parser_params { bool reasoning_in_content = false; std::string generation_prompt; bool parse_tool_calls = true; + bool is_continuation = false; + bool echo = false; // Include assistant prefilled msg in output bool debug = false; // Enable debug output for PEG parser common_peg_arena parser = {}; common_chat_parser_params() = default; @@ -267,6 +293,8 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::or std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools); +common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value); + // DEPRECATED: only used in tests nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false); @@ -279,11 +307,16 @@ std::string common_chat_template_direct_apply( const common_chat_template & tmpl, const autoparser::generation_params & inputs); +std::string common_chat_template_generation_prompt( + const common_chat_template & tmpl, + const autoparser::generation_params & inputs); + std::optional<common_chat_params> common_chat_try_specialized_template( const common_chat_template & tmpl, const std::string & src, autoparser::generation_params & params); + // specialized per-task preset struct common_chat_prompt_preset { std::string system; @@ -291,3 +324,5 @@ struct common_chat_prompt_preset { }; common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates); + +std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims); diff --git a/common/common.cpp b/common/common.cpp index b701edddb3f..81b8b750020 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,6 +7,7 @@ #include "log.h" #include "llama.h" #include "sampling.h" +#include "speculative.h" #include "unicode.h" #include <algorithm> @@ -372,7 +373,7 @@ void common_init() { llama_log_set(common_log_default_callback, NULL); } -void common_params_print_info(const common_params & params) { +void common_params_print_info(const common_params & params, bool print_devices) { #ifdef NDEBUG const char * build_type = ""; #else @@ -381,12 +382,16 @@ void common_params_print_info(const common_params & params) { LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type); LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold()); - LOG_INF("device_info:\n"); - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - auto * dev = ggml_backend_dev_get(i); - size_t free, total; - ggml_backend_dev_memory(dev, &free, &total); - LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + + // device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device + if (print_devices) { + LOG_INF("device_info:\n"); + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } } LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } @@ -440,6 +445,27 @@ std::string string_strip(const std::string & str) { return str.substr(start, end - start); } +std::string string_lcs(std::string_view a, std::string_view b) { + if (a.empty() || b.empty()) return {}; + + std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0)); + size_t best_len = 0; + size_t best_end_a = 0; + + for (size_t i = 1; i <= a.size(); ++i) { + for (size_t j = 1; j <= b.size(); ++j) { + if (a[i - 1] == b[j - 1]) { + dp[i][j] = dp[i - 1][j - 1] + 1; + if (dp[i][j] > best_len) { + best_len = dp[i][j]; + best_end_a = i; + } + } + } + } + return std::string(a.substr(best_end_a - best_len, best_len)); +} + std::string string_get_sortable_timestamp() { using clock = std::chrono::system_clock; @@ -1155,7 +1181,7 @@ struct common_init_result::impl { std::vector<llama_sampler_seq_config> samplers_seq_config; }; -common_init_result::common_init_result(common_params & params) : +common_init_result::common_init_result(common_params & params, bool model_only) : pimpl(new impl{}) { auto mparams = common_model_params_to_llama(params); auto cparams = common_context_params_to_llama(params); @@ -1168,7 +1194,7 @@ common_init_result::common_init_result(common_params & params) : params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, - params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); } llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); @@ -1178,6 +1204,10 @@ common_init_result::common_init_result(common_params & params) : pimpl->model.reset(model); + if (model_only) { + return; + } + const llama_vocab * vocab = llama_model_get_vocab(model); // load and optionally apply lora adapters @@ -1281,8 +1311,8 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() { return pimpl->lora; } -common_init_result_ptr common_init_from_params(common_params & params) { - common_init_result_ptr res(new common_init_result(params)); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only) { + common_init_result_ptr res(new common_init_result(params, model_only)); llama_model * model = res->model(); if (model == NULL) { @@ -1290,6 +1320,10 @@ common_init_result_ptr common_init_from_params(common_params & params) { return res; } + if (model_only) { + return res; + } + llama_context * lctx = res->context(); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); @@ -1353,7 +1387,7 @@ common_init_result_ptr common_init_from_params(common_params & params) { } if (params.warmup) { - LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); + LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); llama_set_warmup(lctx, true); @@ -1435,6 +1469,12 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) { goto done; } + if (llama_n_rs_seq(ctx) > 0) { + LOG_INF("%s: the context supports bounded partial sequence removal\n", __func__); + res = COMMON_CONTEXT_SEQ_RM_TYPE_RS; + goto done; + } + // try to remove the last tokens if (!llama_memory_seq_rm(mem, 0, 1, -1)) { LOG_TRC("%s: the context does not support partial sequence removal\n", __func__); @@ -1449,6 +1489,23 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) { return res; } +void common_context_seq_rm(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + auto * mem = llama_get_memory(ctx); + if (!llama_memory_seq_rm(mem, seq_id, p0, p1)) { + GGML_ABORT("%s", string_format("failed to remove sequence %d with p0=%d, p1=%d\n", seq_id, p0, p1).c_str()); + } +} + +void common_context_seq_cp(llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + auto * mem = llama_get_memory(ctx); + llama_memory_seq_cp(mem, seq_id_src, seq_id_dst, p0, p1); +} + +void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + auto * mem = llama_get_memory(ctx); + llama_memory_seq_add(mem, seq_id, p0, p1, delta); +} + void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) { std::vector<llama_adapter_lora *> loras; std::vector<float> scales; @@ -1505,6 +1562,8 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.n_ctx = params.n_ctx; cparams.n_seq_max = params.n_parallel; + cparams.n_rs_seq = params.speculative.need_n_rs_seq(); + cparams.n_outputs_max = std::max(params.n_outputs_max, 0); cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; cparams.n_threads = params.cpuparams.n_threads; @@ -2074,3 +2133,11 @@ void common_prompt_checkpoint::load_dft( GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n); } } + +void common_prompt_checkpoint::clear_tgt() { + data_tgt.clear(); +} + +void common_prompt_checkpoint::clear_dft() { + data_dft.clear(); +} diff --git a/common/common.h b/common/common.h index 552ae68266e..4381b13b0c7 100644 --- a/common/common.h +++ b/common/common.h @@ -13,6 +13,7 @@ #include <string_view> #include <vector> #include <map> +#include <algorithm> #if defined(_WIN32) && !defined(_WIN32_WINNT) #define _WIN32_WINNT 0x0A00 @@ -159,6 +160,7 @@ enum common_speculative_type { COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, // standalone draft model speculative decoding COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding + COMMON_SPECULATIVE_TYPE_DRAFT_MTP, // Multi-token prediction COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values @@ -275,6 +277,7 @@ struct common_params_sampling { std::vector<llama_token> reasoning_budget_end; // end tag token sequence std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag) std::string reasoning_budget_message; // message injected before end tag when budget exhausted + bool reasoning_control = false; // create the budget sampler on demand so reasoning can be ended at runtime bool backend_sampling = false; @@ -297,11 +300,13 @@ struct common_params_model { // draft-model-based speculative decoding parameters struct common_params_speculative_draft { - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding + int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.75f; // minimum speculative decoding probability (greedy) + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.0f; // minimum speculative decoding probability (greedy) + + bool backend_sampling = true; // offload draft sampling to the backend (default: on) common_params_model mparams; @@ -355,6 +360,14 @@ struct common_params_speculative { bool has_dft() const { return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty(); } + + uint32_t need_n_rs_seq() const { + bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) { + return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP; + }); + + return needs_rs_seq ? draft.n_max : 0u; + } }; struct common_params_vocoder { @@ -419,6 +432,7 @@ struct common_params { int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode + int32_t n_outputs_max = 0; // max outputs in a batch (0 = n_batch) int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width int32_t n_print = -1; // print token count every n tokens (-1 = disabled) @@ -467,7 +481,7 @@ struct common_params { std::set<std::string> model_alias; // model aliases // NOLINT std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT - std::string hf_token = ""; // HF token // NOLINT + std::string hf_token = ""; // HF token (aka bearer token) // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT std::string prompt_file = ""; // store the external prompt file name // NOLINT @@ -495,6 +509,7 @@ struct common_params { int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector bool offline = false; + bool skip_download = false; // skip model file downloading int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line @@ -579,14 +594,14 @@ struct common_params { // server params int32_t port = 8080; // server listens on this network port bool reuse_port = false; // allow multiple sockets to bind to the same port - int32_t timeout_read = 600; // http read timeout in seconds + int32_t timeout_read = 3600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting bool cache_prompt = true; // whether to enable prompt caching bool cache_idle_slots = true; // save and clear idle slots upon starting a new task int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot - int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill + int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc. std::string hostname = "127.0.0.1"; @@ -608,15 +623,17 @@ struct common_params { std::map<std::string, std::string> default_template_kwargs; - // webui configs -#ifdef LLAMA_WEBUI_DEFAULT_ENABLED - bool webui = LLAMA_WEBUI_DEFAULT_ENABLED != 0; -#else - bool webui = true; // default to enabled when not set -#endif + // UI configs + bool ui = true; + + // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead + bool webui = ui; bool webui_mcp_proxy = false; std::string webui_config_json; + bool ui_mcp_proxy = false; + std::string ui_config_json; + // "advanced" endpoints are disabled by default for better security bool endpoint_slots = true; bool endpoint_props = false; // only control POST requests, not GET @@ -694,7 +711,7 @@ struct common_params { // initializes the logging system and prints info about the build void common_init(); -void common_params_print_info(const common_params & params); +void common_params_print_info(const common_params & params, bool print_devices = true); std::string common_params_get_system_info(const common_params & params); bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); @@ -721,6 +738,7 @@ std::string string_format(const char * fmt, ...); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); +std::string string_lcs(std::string_view a, std::string_view b); std::string string_join(const std::vector<std::string> & values, const std::string & separator); std::vector<std::string> string_split(const std::string & str, const std::string & delimiter); @@ -845,7 +863,7 @@ struct common_sampler; // note: defines the model, context, samplers, ets. lifetimes struct common_init_result { - common_init_result(common_params & params); + common_init_result(common_params & params, bool model_only = false); ~common_init_result(); llama_model * model(); @@ -863,7 +881,7 @@ struct common_init_result { using common_init_result_ptr = std::unique_ptr<common_init_result>; -common_init_result_ptr common_init_from_params(common_params & params); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); @@ -880,15 +898,20 @@ std::string common_get_model_endpoint(); // enum common_context_seq_rm_type { - COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module) - COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences - COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only + COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. no memory module) + COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences + COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only + COMMON_CONTEXT_SEQ_RM_TYPE_RS = 3, // can seq_rm partial sequences, bounded by n_rs_seq }; // check if the llama_context can remove sequences // note: clears the memory of the context common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx); +// aborts execution on failure +void common_context_seq_rm (llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); +void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); +void common_context_seq_cp (llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // // Batch utils @@ -1070,4 +1093,7 @@ struct common_prompt_checkpoint { llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) const; + + void clear_tgt(); + void clear_dft(); }; diff --git a/common/download.cpp b/common/download.cpp index 0bf12ad4a3b..40f6eb780f4 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -292,6 +292,10 @@ static int common_download_file_single_online(const std::string & url, const bool file_exists = std::filesystem::exists(path); + if (!file_exists && opts.skip_download) { + return -2; // file is missing and download is disabled + } + if (file_exists && skip_etag) { LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str()); return 304; // 304 Not Modified - fake cached response @@ -357,6 +361,10 @@ static int common_download_file_single_online(const std::string & url, LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str()); return 304; // 304 Not Modified - fake cached response } + // pass this point, the file exists but is different from the server version, so we need to redownload it + if (opts.skip_download) { + return -2; // special code to indicate that the download was skipped due to etag mismatch + } if (remove(path.c_str()) != 0) { LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); return -1; @@ -566,8 +574,11 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files, return result; } -static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, - const std::string & model) { +// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "mtp"), +// preferring deeper shared directory prefix with the model, then closest quantization +static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files, + const std::string & model, + const std::string & keyword) { hf_cache::hf_file best; size_t best_depth = 0; int best_diff = 0; @@ -579,20 +590,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, for (const auto & f : files) { if (!string_ends_with(f.path, ".gguf") || - f.path.find("mmproj") == std::string::npos) { + f.path.find(keyword) == std::string::npos) { continue; } - auto mmproj_parts = string_split<std::string>(f.path, '/'); - auto mmproj_dir = mmproj_parts.end() - 1; + auto sib_parts = string_split<std::string>(f.path, '/'); + auto sib_dir = sib_parts.end() - 1; auto [_, dir] = std::mismatch(model_parts.begin(), model_dir, - mmproj_parts.begin(), mmproj_dir); - if (dir != mmproj_dir) { + sib_parts.begin(), sib_dir); + if (dir != sib_dir) { continue; } - size_t depth = dir - mmproj_parts.begin(); + size_t depth = dir - sib_parts.begin(); auto bits = extract_quant_bits(f.path); auto diff = std::abs(bits - model_bits); @@ -606,6 +617,16 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, return best; } +static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files, + const std::string & model) { + return find_best_sibling(files, model, "mmproj"); +} + +static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files, + const std::string & model) { + return find_best_sibling(files, model, "mtp-"); +} + static bool gguf_filename_is_model(const std::string & filepath) { if (!string_ends_with(filepath, ".gguf")) { return false; @@ -617,7 +638,8 @@ static bool gguf_filename_is_model(const std::string & filepath) { } return filename.find("mmproj") == std::string::npos && - filename.find("imatrix") == std::string::npos; + filename.find("imatrix") == std::string::npos && + filename.find("mtp-") == std::string::npos; } static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files, @@ -673,11 +695,13 @@ struct hf_plan { hf_cache::hf_file primary; hf_cache::hf_files model_files; hf_cache::hf_file mmproj; + hf_cache::hf_file mtp; }; static hf_plan get_hf_plan(const common_params_model & model, const common_download_opts & opts, - bool download_mmproj) { + bool download_mmproj, + bool download_mtp) { hf_plan plan; hf_cache::hf_files all; @@ -723,6 +747,10 @@ static hf_plan get_hf_plan(const common_params_model & model, plan.mmproj = find_best_mmproj(all, primary.path); } + if (download_mtp) { + plan.mtp = find_best_mtp(all, primary.path); + } + return plan; } @@ -755,22 +783,26 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode } common_download_model_result common_download_model(const common_params_model & model, - const common_download_opts & opts, - bool download_mmproj) { + const common_download_opts & opts) { common_download_model_result result; std::vector<download_task> tasks; hf_plan hf; + bool download_mmproj = opts.download_mmproj; + bool download_mtp = opts.download_mtp; bool is_hf = !model.hf_repo.empty(); if (is_hf) { - hf = get_hf_plan(model, opts, download_mmproj); + hf = get_hf_plan(model, opts, download_mmproj, download_mtp); for (const auto & f : hf.model_files) { tasks.push_back({f.url, f.local_path}); } if (!hf.mmproj.path.empty()) { tasks.push_back({hf.mmproj.url, hf.mmproj.local_path}); } + if (!hf.mtp.path.empty()) { + tasks.push_back({hf.mtp.url, hf.mtp.local_path}); + } } else if (!model.url.empty()) { tasks = get_url_tasks(model); } else { @@ -782,18 +814,22 @@ common_download_model_result common_download_model(const common_params_model & return result; } - std::vector<std::future<bool>> futures; + std::vector<std::future<int>> futures; for (const auto & task : tasks) { futures.push_back(std::async(std::launch::async, [&task, &opts, is_hf]() { - int status = common_download_file_single(task.url, task.path, opts, is_hf); - return is_http_status_ok(status); + return common_download_file_single(task.url, task.path, opts, is_hf); } )); } for (auto & f : futures) { - if (!f.get()) { + int status = f.get(); + if (status == -2 && opts.skip_download) { + throw common_skip_download_exception(); + } + bool is_ok = is_http_status_ok(status); + if (!is_ok) { return {}; } } @@ -807,6 +843,10 @@ common_download_model_result common_download_model(const common_params_model & if (!hf.mmproj.path.empty()) { result.mmproj_path = hf_cache::finalize_file(hf.mmproj); } + + if (!hf.mtp.path.empty()) { + result.mtp_path = hf_cache::finalize_file(hf.mtp); + } } else { result.model_path = model.path; } @@ -946,7 +986,8 @@ std::vector<common_cached_model_info> common_list_cached_models() { for (const auto & f : files) { auto split = get_gguf_split_info(f.path); if (split.index != 1 || split.tag.empty() || - split.prefix.find("mmproj") != std::string::npos) { + split.prefix.find("mmproj") != std::string::npos || + split.prefix.find("mtp-") != std::string::npos) { continue; } if (seen.insert(f.repo_id + ":" + split.tag).second) { diff --git a/common/download.h b/common/download.h index edc3e9f1a71..ebeedd6058c 100644 --- a/common/download.h +++ b/common/download.h @@ -52,6 +52,9 @@ struct common_download_opts { std::string bearer_token; common_header_list headers; bool offline = false; + bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid + bool download_mmproj = false; + bool download_mtp = false; common_download_callback * callback = nullptr; }; @@ -59,6 +62,12 @@ struct common_download_opts { struct common_download_model_result { std::string model_path; std::string mmproj_path; + std::string mtp_path; +}; + +// throw if the file is missing or invalid (e.g. ETag check failed) +struct common_skip_download_exception : public std::runtime_error { + common_skip_download_exception() : std::runtime_error("skip download") {} }; // Download model from HuggingFace repo or URL @@ -83,12 +92,12 @@ struct common_download_model_result { // when opts.offline=true, no network requests are made // when download_mmproj=true, searches for mmproj in same directory as model or any parent directory // then with the closest quantization bits +// when download_mtp=true, applies the same sibling search for an MTP-head GGUF // -// returns result with model_path and mmproj_path (empty on failure) +// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure) common_download_model_result common_download_model( const common_params_model & model, - const common_download_opts & opts = {}, - bool download_mmproj = false + const common_download_opts & opts = {} ); // returns list of cached models @@ -96,6 +105,7 @@ std::vector<common_cached_model_info> common_list_cached_models(); // download single file from url to local path // returns status code or -1 on error +// returns -2 if the download was skipped due to ETag mismatch (file outdated, skip_download=true) // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash) int common_download_file_single(const std::string & url, const std::string & path, diff --git a/common/fit.cpp b/common/fit.cpp index c10cb7f08b4..668d892e908 100644 --- a/common/fit.cpp +++ b/common/fit.cpp @@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error { using std::runtime_error::runtime_error; }; -static std::vector<llama_device_memory_data> common_get_device_memory_data( +std::vector<llama_device_memory_data> common_get_device_memory_data( const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams, diff --git a/common/fit.h b/common/fit.h index e066092ec6c..643d3420095 100644 --- a/common/fit.h +++ b/common/fit.h @@ -1,6 +1,11 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" +#include "llama.h" +#include "../src/llama-ext.h" + +#include <vector> enum common_params_fit_status { COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit @@ -30,3 +35,14 @@ void common_fit_print( struct llama_context_params * cparams); void common_memory_breakdown_print(const struct llama_context * ctx); + +// Load a model + context with no_alloc and return the per-device memory breakdown. +std::vector<llama_device_memory_data> common_get_device_memory_data( + const char * path_model, + const struct llama_model_params * mparams, + const struct llama_context_params * cparams, + std::vector<ggml_backend_dev_t> & devs, + uint32_t & hp_ngl, + uint32_t & hp_n_ctx_train, + uint32_t & hp_n_expert, + enum ggml_log_level log_level); diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index 20f33e4c7f4..ba7417a12bb 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -11,7 +11,6 @@ #include <filesystem> #include <fstream> #include <atomic> -#include <regex> // migration only #include <string> #include <string_view> #include <stdexcept> @@ -336,15 +335,9 @@ hf_files get_repo_files(const std::string & repo_id, if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) { file.oid = item["lfs"]["oid"].get<std::string>(); } - if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) { - file.size = item["lfs"]["size"].get<size_t>(); - } } else if (item.contains("oid") && item["oid"].is_string()) { file.oid = item["oid"].get<std::string>(); } - if (file.size == 0 && item.contains("size") && item["size"].is_number()) { - file.size = item["size"].get<size_t>(); - } if (!file.oid.empty() && !is_valid_oid(file.oid)) { LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str()); @@ -502,271 +495,4 @@ std::string finalize_file(const hf_file & file) { return file.final_path; } -// delete everything after this line, one day - -// copied from download.cpp without the tag part -struct gguf_split_info { - std::string prefix; // tag included - int index; - int count; -}; - -static gguf_split_info get_gguf_split_info(const std::string & path) { - static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase); - std::smatch m; - - std::string prefix = path; - if (!string_remove_suffix(prefix, ".gguf")) { - return {}; - } - - int index = 1; - int count = 1; - - if (std::regex_match(prefix, m, re_split)) { - index = std::stoi(m[2].str()); - count = std::stoi(m[3].str()); - prefix = m[1].str(); - } - - return {std::move(prefix), index, count}; -} - -static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) { - static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)"); - std::smatch match; - if (std::regex_match(filename, match, re)) { - return {match[1].str(), match[2].str()}; - } - return {}; -} - -static std::string make_old_cache_filename(const std::string & owner, - const std::string & repo, - const std::string & filename) { - auto result = owner + "_" + repo + "_" + filename; - string_replace_all(result, "/", "_"); - return result; -} - -struct migrate_file { - std::string path; - std::string sha256; - size_t size; - fs::path old_path; - fs::path etag_path; - const hf_file * file; -}; - -using migrate_files = std::vector<migrate_file>; - -static bool collect_file(const fs::path & old_cache, - const std::string & owner, - const std::string & repo, - const std::string & path, - const std::string & sha256, - const hf_files & files, - migrate_files & to_migrate) { - - const hf_file * file = nullptr; - - for (const auto & f : files) { - if (f.path == path) { - file = &f; - break; - } - } - - std::string old_filename = make_old_cache_filename(owner, repo, path); - fs::path old_path = old_cache / old_filename; - fs::path etag_path = old_path.string() + ".etag"; - - if (!fs::exists(old_path)) { - if (file && fs::exists(file->final_path)) { - return true; - } - LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str()); - return false; - } - - if (!file) { - LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str()); - return false; - } - - if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) { - LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str()); - return false; - } - - if (file->size > 0) { - size_t size = fs::file_size(old_path); - if (size != file->size) { - LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size); - return false; - } - } - - to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file}); - return true; -} - -static bool collect_files(const fs::path & old_cache, - const std::string & owner, - const std::string & repo, - const nl::json & node, - const hf_files & files, - migrate_files & to_migrate) { - - if (!node.contains("rfilename") || - !node.contains("lfs") || - !node["lfs"].contains("sha256")) { - return true; - } - - std::string path = node["rfilename"]; - std::string sha256 = node["lfs"]["sha256"]; - - auto split = get_gguf_split_info(path); - - if (split.count <= 1) { - return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate); - } - - std::vector<std::pair<std::string, std::string>> splits; - - for (const auto & f : files) { - auto split_f = get_gguf_split_info(f.path); - if (split_f.count == split.count && split_f.prefix == split.prefix) { - // sadly the manifest only provides the sha256 of the first file (index == 1) - // the rest will be verified using the size... - std::string f_sha256 = (split_f.index == 1) ? sha256 : ""; - splits.emplace_back(f.path, f_sha256); - } - } - - if ((int)splits.size() != split.count) { - LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size()); - return false; - } - - for (const auto & [f_path, f_sha256] : splits) { - if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) { - return false; - } - } - - return true; -} - -static bool migrate_file(const migrate_file & file) { - std::error_code ec; - - fs::path new_path(file.file->local_path); - fs::create_directories(new_path.parent_path(), ec); - - if (!fs::exists(new_path, ec)) { - fs::rename(file.old_path, new_path, ec); - if (ec) { - fs::copy_file(file.old_path, new_path, ec); - if (ec) { - LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str()); - return false; - } - } - fs::remove(file.old_path, ec); - } - fs::remove(file.etag_path, ec); - - std::string filename = finalize_file(*file.file); - LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str()); - return true; -} - -void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { - fs::path old_cache = fs_get_cache_directory(); - if (!fs::exists(old_cache)) { - return; - } - - if (offline) { - LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__); - return; // -hf is not going to work - } - - bool warned = false; - - for (const auto & entry : fs::directory_iterator(old_cache)) { - if (!entry.is_regular_file()) { - continue; - } - auto filename = entry.path().filename().string(); - auto [owner, repo] = parse_manifest_name(filename); - - if (owner.empty() || repo.empty()) { - continue; - } - - if (!warned) { - warned = true; - LOG_WRN("================================================================================\n" - "WARNING: Migrating cache to HuggingFace cache directory\n" - " Old cache: %s\n" - " New cache: %s\n" - "This one-time migration moves models previously downloaded with -hf\n" - "from the legacy llama.cpp cache to the standard HuggingFace cache.\n" - "Models downloaded with --model-url are not affected.\n" - "================================================================================\n", - old_cache.string().c_str(), get_cache_directory().string().c_str()); - } - - auto repo_id = owner + "/" + repo; - auto files = get_repo_files(repo_id, token); - - if (files.empty()) { - LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str()); - continue; - } - - migrate_files to_migrate; - bool ok = true; - - try { - std::ifstream manifest(entry.path()); - auto json = nl::json::parse(manifest); - for (const char * key : {"ggufFile", "mmprojFile"}) { - if (json.contains(key)) { - if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) { - ok = false; - break; - } - } - } - } catch (const std::exception & e) { - LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what()); - continue; - } - - if (!ok) { - LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__); - continue; - } - - for (const auto & file : to_migrate) { - if (!migrate_file(file)) { - ok = false; - break; - } - } - - if (!ok) { - LOG_WRN("%s: migration failed: could not migrate all files\n", __func__); - continue; - } - - LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str()); - fs::remove(entry.path()); - } -} - } // namespace hf_cache diff --git a/common/hf-cache.h b/common/hf-cache.h index 9e46f977437..23fa0adb729 100644 --- a/common/hf-cache.h +++ b/common/hf-cache.h @@ -14,7 +14,6 @@ struct hf_file { std::string final_path; std::string oid; std::string repo_id; - size_t size = 0; // only for the migration }; using hf_files = std::vector<hf_file>; @@ -30,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {}); // Create snapshot path (link or move/copy) and return it std::string finalize_file(const hf_file & file); -// TODO: Remove later -void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false); - } // namespace hf_cache diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 8e3978f7ed0..9364159767e 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -471,7 +471,7 @@ void common_ngram_map_draft(common_ngram_map & map, sum_occur += curr_occur; } - LOG_INF("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__, + LOG_DBG("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__, key_offset, max_occur, sum_occur, slot_max, curr_key.values[0].value_idx, curr_key.values[0].value_num, @@ -482,7 +482,7 @@ void common_ngram_map_draft(common_ngram_map & map, // Print the tokens of the four values (if idx != 0), use LOG_INF for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) { if (curr_key.values[v].value_idx != 0) { - LOG_INF("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str()); + LOG_DBG("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str()); } } @@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map, draft.push_back(inp[match_pos + n + i]); } - LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, + LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, key_offset, slot_max, curr_key.key_num, draft.size()); diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp index 76f7257f611..1b5a09a5eb6 100644 --- a/common/ngram-mod.cpp +++ b/common/ngram-mod.cpp @@ -1,5 +1,7 @@ #include "ngram-mod.h" +#include <algorithm> + // // common_ngram_mod // diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp index c6e1f86c91e..ce41d029b05 100644 --- a/common/reasoning-budget.cpp +++ b/common/reasoning-budget.cpp @@ -171,22 +171,12 @@ static void common_reasoning_budget_reset(struct llama_sampler * smpl) { ctx->force_pos = 0; } -// forward declaration for use in clone static struct llama_sampler * common_reasoning_budget_init_state( const struct llama_vocab * vocab, const std::vector<llama_token> & start_tokens, const std::vector<llama_token> & end_tokens, const std::vector<llama_token> & forced_tokens, int32_t budget, common_reasoning_budget_state initial_state); -static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx; - return common_reasoning_budget_init_state( - ctx->vocab, - ctx->start_matcher.tokens, - ctx->end_matcher.tokens, - ctx->forced_tokens, - ctx->budget, - ctx->state); -} +static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl); static void common_reasoning_budget_free(struct llama_sampler * smpl) { delete (common_reasoning_budget_ctx *) smpl->ctx; @@ -205,6 +195,15 @@ static struct llama_sampler_i common_reasoning_budget_i = { /* .backend_set_input = */ nullptr, }; +static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx; + + return llama_sampler_init( + /* .iface = */ &common_reasoning_budget_i, + /* .ctx = */ new common_reasoning_budget_ctx(*ctx) + ); +} + static struct llama_sampler * common_reasoning_budget_init_state( const struct llama_vocab * vocab, const std::vector<llama_token> & start_tokens, @@ -248,3 +247,24 @@ common_reasoning_budget_state common_reasoning_budget_get_state(const struct lla } return ((const common_reasoning_budget_ctx *)smpl->ctx)->state; } + +bool common_reasoning_budget_force(struct llama_sampler * smpl) { + if (!smpl) { + return false; + } + + auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx; + + // only a sampler that is actively counting down the budget may be forced; + // any other state (idle, already forcing/waiting, or done) is left untouched + if (ctx->state != REASONING_BUDGET_COUNTING) { + return false; + } + + ctx->state = REASONING_BUDGET_FORCING; + ctx->force_pos = 0; + ctx->end_matcher.reset(); + LOG_INF("reasoning-budget: forced into forcing state (manual transition)\n"); + + return true; +} diff --git a/common/reasoning-budget.h b/common/reasoning-budget.h index ef37f46ee4d..0cf689a5663 100644 --- a/common/reasoning-budget.h +++ b/common/reasoning-budget.h @@ -40,3 +40,7 @@ struct llama_sampler * common_reasoning_budget_init( common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE); common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl); + +// Manually transition the reasoning budget sampler into the FORCING state. +// Returns true if the transition occurred. +bool common_reasoning_budget_force(struct llama_sampler * smpl); diff --git a/common/sampling.cpp b/common/sampling.cpp index 5665d0a706c..85f8ed50b35 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -293,7 +293,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st } // reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression) - if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) { + if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0 || params.reasoning_control)) { rbudget = common_reasoning_budget_init( vocab, params.reasoning_budget_start, @@ -661,6 +661,14 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } +bool common_sampler_reasoning_budget_force(struct common_sampler * gsmpl) { + if (!gsmpl) { + return false; + } + + return common_reasoning_budget_force(gsmpl->rbudget); +} + // helpers llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) { diff --git a/common/sampling.h b/common/sampling.h index 49506a00cd8..19cbbbaba36 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -87,6 +87,9 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); +// force the reasoning budget sampler (if any) to begin forcing its end sequence now. +bool common_sampler_reasoning_budget_force(struct common_sampler * gsmpl); + // helpers // access the internal list of current candidate tokens diff --git a/common/speculative.cpp b/common/speculative.cpp index 476e1398ed8..73830fda6c9 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -3,6 +3,7 @@ #include "common.h" #include "ggml.h" #include "llama.h" +#include "../src/llama-ext.h" // staging API: llama_set_embeddings_pre_norm / llama_get_embeddings_pre_norm_ith (used by MTP) #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" @@ -23,6 +24,7 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro {"none", COMMON_SPECULATIVE_TYPE_NONE}, {"draft-simple", COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE}, {"draft-eagle3", COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3}, + {"draft-mtp", COMMON_SPECULATIVE_TYPE_DRAFT_MTP}, {"ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram-map-k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, @@ -30,6 +32,18 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro {"ngram-cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; +static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) { + std::string result; + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i] == nullptr) { + continue; + } + if (!result.empty()) result += ", "; + result += ggml_backend_dev_name(devices[i]); + } + return result.empty() ? "default" : result; +} + struct common_speculative_config { common_speculative_type type; common_params_speculative params; @@ -142,7 +156,13 @@ struct common_speculative_impl { virtual void draft(common_speculative_draft_params_vec & dparams) = 0; - virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0; + virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; + + // true if this implementation requires the target context to extract post-norm embeddings + virtual bool need_embd() const = 0; + + // true if this implementation requires the target context to extract pre-norm embeddings + virtual bool need_embd_pre_norm() const { return false; } }; struct common_speculative_impl_draft_simple : public common_speculative_impl { @@ -159,6 +179,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; auto * ctx_tgt = this->params.ctx_tgt; + LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); // TODO: optimize or pass from outside? @@ -335,16 +365,24 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } + + bool need_embd() const override { + return false; + } }; struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { //common_params_speculative_eagle3 params; - common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {} + common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) + { + LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -359,9 +397,384 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // TODO: implement } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } + + bool need_embd() const override { + return false; + } +}; + +struct common_speculative_impl_draft_mtp : public common_speculative_impl { + common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft) + + llama_batch batch; + + std::vector<common_sampler_ptr> smpls; + + // backend sampler chain per seq, attached to ctx_dft + std::vector<llama_sampler *> backend_chains; + + int32_t n_embd = 0; + + // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. + // The last h-row of one process() call needs the first token of the NEXT + // call to pair with, so it's stashed here until that next call fires. + std::vector<std::vector<float>> pending_h; // [n_seq][n_embd] + + std::vector<int32_t> i_batch_beg; + std::vector<int32_t> i_batch_end; + + // Hidden rows from the most recent target verification batch, grouped by seq. + // Row 0 corresponds to the sampled token, row N to the Nth accepted draft token. + std::vector<std::vector<float>> verify_h; + std::vector<int32_t> verify_h_rows; + + // Per-seq draft length from the last draft() call, used in accept() to + // roll back ctx_dft's recurrent state past the AR draft's redundant + // pre-advancement before process() mirrored the verify batch. + std::vector<uint16_t> last_n_drafted; + + common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq) + , params(params.draft) + { + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set"); + + n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + + LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + + const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); + batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1); + // llama_batch_init allocates only one of token/embd; MTP needs both. + // TODO: fix, how to call without malloc + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b); + + smpls.resize(n_seq); + for (auto & s : smpls) { + common_params_sampling sparams; + sparams.no_perf = false; + sparams.top_k = 10; + sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; + s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); + } + + // offload draft sampling to the backend + backend_chains.assign(n_seq, nullptr); + if (this->params.backend_sampling) { + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params()); + llama_sampler_chain_add(chain, llama_sampler_init_top_k(10)); + + if (!llama_set_sampler(ctx_dft, seq_id, chain)) { + LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id); + llama_sampler_free(chain); + chain = nullptr; + } + backend_chains[seq_id] = chain; + } + } + + llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false); + llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true); + + pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f)); + + i_batch_beg.assign(n_seq, -1); + i_batch_end.assign(n_seq, -1); + + verify_h.assign(n_seq, {}); + verify_h_rows.assign(n_seq, 0); + + last_n_drafted.assign(n_seq, 0); + } + + ~common_speculative_impl_draft_mtp() override { + auto * ctx_dft = this->params.ctx_dft; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) { + if (backend_chains[seq_id] == nullptr) { + continue; + } + if (ctx_dft) { + llama_set_sampler(ctx_dft, seq_id, nullptr); + } + llama_sampler_free(backend_chains[seq_id]); + } + backend_chains.clear(); + + if (batch.token != nullptr) { + free(batch.token); + batch.token = nullptr; + } + llama_batch_free(batch); + } + + void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { + const int32_t N = (int32_t) prompt.size(); + if (N <= 0) { + return; + } + auto * ctx_dft = this->params.ctx_dft; + const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); + if (pos_max < N - 1) { + LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " + "process() hook may not have run on every prefill ubatch " + "(need_embd / logits=1 on every prompt position?). " + "Drafts may degrade.\n", + __func__, (int) pos_max, N - 1); + } + } + + bool process(const llama_batch & batch_in) override { + if (batch_in.n_tokens <= 0) { + return true; + } + + // TODO: how to make it work with vision tokens? + if (batch_in.token == nullptr || batch_in.embd != nullptr) { + return true; + } + + const int32_t n_tokens = batch_in.n_tokens; + + // remember the frist and last batch index for each sequence + std::fill(i_batch_beg.begin(), i_batch_beg.end(), -1); + std::fill(i_batch_end.begin(), i_batch_end.end(), -1); + + for (int k = 0; k < n_tokens; ++k) { + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + GGML_ASSERT(batch_in.n_seq_id[k] == 1); + + if (batch_in.seq_id[k][0] == seq_id) { + i_batch_end[seq_id] = k; + if (i_batch_beg[seq_id] < 0) { + i_batch_beg[seq_id] = k; + } + } + } + } + + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + + const size_t row_bytes = (size_t) n_embd * sizeof(float); + + common_batch_clear(batch); + + for (int k = 0; k < n_tokens; ++k) { + common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); + } + + // shift the tgt embeddings to the right by one position + // assumes that the tokens in the batch are sequential for each sequence + // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] + // ^--- this is a problem + // TODO:this is generally true, but would be nice to assert it + { + const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt); + std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + + //{ + // // string with seq_ids in the batch + // std::stringstream ss; + // for (int i = 0; i < n_tokens; ++i) { + // ss << batch_in.seq_id[i][0] << ","; + // } + // LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str()); + //} + } + + // fill the pending embeddings from a previous run + auto set_h = [&](int idx, const float * h_row) { + std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); + }; + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } + + set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); + } + + const int32_t rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); + return false; + } + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_end[seq_id] < 0) { + continue; + } + + const int32_t n_rows = i_batch_end[seq_id] - i_batch_beg[seq_id] + 1; + verify_h_rows[seq_id] = n_rows; + verify_h[seq_id].resize((size_t) n_rows * n_embd); + + for (int32_t i = 0; i < n_rows; ++i) { + const float * h = llama_get_embeddings_pre_norm_ith(ctx_tgt, i_batch_beg[seq_id] + i); + std::memcpy(verify_h[seq_id].data() + (size_t) i * n_embd, h, row_bytes); + } + + std::memcpy(pending_h[seq_id].data(), + verify_h[seq_id].data() + (size_t) (n_rows - 1) * n_embd, row_bytes); + } + + return true; + } + + void draft(common_speculative_draft_params_vec & dparams) override { + auto & ctx_dft = params.ctx_dft; + + common_batch_clear(batch); + + // keep track of which sequences are still drafting + int n_drafting = 0; + std::vector<bool> drafting(n_seq); + + const float * h_row = nullptr; + const size_t row_bytes = (size_t) n_embd * sizeof(float); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + + if (!dp.drafting) { + continue; + } + + n_drafting++; + drafting[seq_id] = true; + common_sampler_reset(smpls[seq_id].get()); + + common_batch_add(batch, dp.id_last, dp.n_past, { seq_id }, true); + + h_row = pending_h[seq_id].data(); + std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); + } + + int ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode returned %d\n", __func__, ret); + return; + } + + int i = 0; + + while (n_drafting > 0) { + int i_batch = 0; + + common_batch_clear(batch); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (!drafting[seq_id]) { + continue; + } + + auto * smpl = smpls[seq_id].get(); + + common_sampler_sample(smpl, ctx_dft, i_batch, true); + h_row = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch); + ++i_batch; + + const auto * cur_p = common_sampler_get_candidates(smpl, true); + + for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p, + common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); + } + + // add drafted token for each sequence + const llama_token id = cur_p->data[0].id; + + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + drafting[seq_id] = false; + n_drafting--; + + continue; + } + + common_sampler_accept(smpl, id, true); + + auto & dp = dparams.at(seq_id); + auto & result = *dp.result; + + result.push_back(id); + + if (params.n_max <= (int) result.size()) { + drafting[seq_id] = false; + n_drafting--; + continue; + } + + common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); + } + + if (batch.n_tokens == 0) { + break; + } + + // evaluate the drafted tokens on the draft model + ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret); + break; + } + + ++i; + } + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + if (!dp.drafting) { + continue; + } + + if (dp.result->size() < (size_t) params.n_min) { + dp.result->clear(); + } + + last_n_drafted[seq_id] = (uint16_t) dp.result->size(); + } + } + + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + return; + } + + const int32_t n_rows = verify_h_rows[seq_id]; + if (n_rows <= 0) { + return; + } + + const int32_t i_h = std::min<int32_t>(n_accepted, n_rows - 1); + const size_t row_bytes = (size_t) n_embd * sizeof(float); + std::memcpy(pending_h[seq_id].data(), verify_h[seq_id].data() + (size_t) i_h * n_embd, row_bytes); + } + + bool need_embd() const override { + return false; + } + + bool need_embd_pre_norm() const override { + return true; + } }; // state of self-speculation (simple implementation, not ngram-map) @@ -376,7 +789,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { common_ngram_simple_config config) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq) , params(params.ngram_simple) - , config(config) {} + , config(config) + { + LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__); + LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__, + this->params.size_n, this->params.size_m, this->params.min_hits); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -400,26 +818,31 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } + + bool need_embd() const override { + return false; + } }; struct common_speculative_impl_ngram_map_k : public common_speculative_impl { - common_params_speculative_ngram_map params; - // n_seq configs std::vector<common_ngram_map> config; common_speculative_impl_ngram_map_k( - const common_params_speculative & params, const common_ngram_map & config, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq) - , params(params.ngram_map_k) { + { for (uint32_t i = 0; i < n_seq; i++) { this->config.push_back(config); } + + LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str()); + LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__, + config.size_key, config.size_value, config.key_only, config.min_hits); } void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { @@ -446,11 +869,19 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { GGML_ASSERT((seq_id < (llama_seq_id) config.size())); + if (is_other) { + return; + } + common_ngram_map_accept(config[seq_id], n_accepted); } + + bool need_embd() const override { + return false; + } }; struct common_speculative_impl_ngram_mod : public common_speculative_impl { @@ -466,7 +897,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { // the last position in the prompt that was added to the ngram container size_t i_last = 0; - // length of the last drafted nโ€‘gram (number of tokens returned by draft) + // length of the last drafted n-gram (number of tokens returned by draft) size_t n_draft_last = 0; // consecutive accept rounds with low acceptance fraction (< 0.5) @@ -484,8 +915,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { , verbose(std::getenv("LLAMA_TRACE") != nullptr) { static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); - LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__, - this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024); + LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__); + LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__, + this->params.n_match, this->params.n_max, this->params.n_min); + LOG_INF("%s: - mod size=%zu (%.3f MB)\n", __func__, + mod.size(), (float)(mod.size_bytes())/1024/1024); if (this->params.n_match < 16) { LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, " @@ -575,7 +1009,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } result.resize(result.size() - n); - // store length of drafted nโ€‘gram for later acceptance analysis + // store length of drafted n-gram for later acceptance analysis sinfo.n_draft_last = result.size(); } @@ -597,17 +1031,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { + if (is_other) { + return; + } + auto & sinfo = sinfos[seq_id]; // compute acceptance fraction if we have a recorded draft length if (sinfo.n_draft_last > 0) { const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last; - if (f_acc < 0.5) { + if (f_acc < 0.25) { sinfo.n_low++; - if (sinfo.n_low >= 3) { + if (sinfo.n_low >= 5) { if (verbose) { - LOG_WRN("%s: low acceptance streak (%d) โ€“ resetting ngram_mod\n", __func__, sinfo.n_low); + LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low); } mod.reset(); @@ -619,6 +1057,10 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } } } + + bool need_embd() const override { + return false; + } }; struct common_speculative_impl_ngram_cache : public common_speculative_impl { @@ -653,6 +1095,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { , save_dynamic(save_dynamic) , save_static(save_static) { + LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__); + LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__, + n_draft, + path_static.empty() ? "none" : path_static.c_str(), + path_dynamic.empty() ? "none" : path_dynamic.c_str()); + sinfos.resize(n_seq); if (!path_static.empty()) { @@ -749,9 +1197,13 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } + + bool need_embd() const override { + return false; + } }; struct common_speculative { @@ -820,6 +1272,7 @@ std::string common_speculative_type_to_str(common_speculative_type type) { case COMMON_SPECULATIVE_TYPE_NONE: return "none"; case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: return "draft-simple"; case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: return "draft-eagle3"; + case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: return "draft-mtp"; case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram-simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram-map-k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v"; @@ -864,6 +1317,40 @@ static uint32_t common_get_enabled_speculative_configs(const std::vector<common_ return result; } +int32_t common_speculative_n_max(const common_params_speculative * spec) { + int32_t n_max = 0; + + for (const auto type : spec->types) { + switch (type) { + case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: + case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: + case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: + n_max = std::max(n_max, std::max(0, spec->draft.n_max)); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: + n_max = std::max(n_max, (int32_t) spec->ngram_simple.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + n_max = std::max(n_max, (int32_t) spec->ngram_map_k.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: + n_max = std::max(n_max, (int32_t) spec->ngram_map_k4v.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: + n_max = std::max(n_max, std::max(0, spec->ngram_mod.n_max)); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: + n_max = std::max(n_max, (int32_t) 8); + break; + case COMMON_SPECULATIVE_TYPE_NONE: + case COMMON_SPECULATIVE_TYPE_COUNT: + break; + } + } + + return n_max; +} + // initialization of the speculative decoding system // common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq) { @@ -872,11 +1359,9 @@ common_speculative * common_speculative_init(common_params_speculative & params, { uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types); - bool has_draft_model_path = !params.draft.mparams.path.empty(); - bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); - // bool has_mtp = false; // TODO: add MTP here bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 + bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; bool has_ngram_cache = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE)); bool has_ngram_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE)); @@ -885,7 +1370,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, bool has_ngram_mod = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MOD)); // when adding a new type - update here the logic above - static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 8); + static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 9); // this list here defines the priority of the speculators // the one with highest priority are listed first @@ -906,29 +1391,20 @@ common_speculative * common_speculative_init(common_params_speculative & params, if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); } - if (has_draft_simple) { - if (!has_draft_model_path) { - LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__); - has_draft_simple = false; - } - } else if (has_draft_model_path) { - LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__); - has_draft_simple = true; - } - if (has_draft_simple) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params)); } - // TODO: add MTP here if (has_draft_eagle3) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params)); } + if (has_mtp) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params)); + } } std::vector<std::unique_ptr<common_speculative_impl>> impls = {}; for (const common_speculative_config & config : configs) { - LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str()); switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: break; @@ -940,6 +1416,10 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::make_unique<common_speculative_impl_draft_eagle3>(config.params, n_seq)); break; } + case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: { + impls.push_back(std::make_unique<common_speculative_impl_draft_mtp>(config.params, n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple); @@ -958,11 +1438,16 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::move(state)); break; } - case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: { + impls.push_back( + std::make_unique<common_speculative_impl_ngram_map_k>( + get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { impls.push_back( std::make_unique<common_speculative_impl_ngram_map_k>( - config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { @@ -1040,6 +1525,34 @@ bool common_speculative_process(common_speculative * spec, const llama_batch & b return result; } +bool common_speculative_need_embd(common_speculative * spec) { + if (spec == nullptr) { + return false; + } + + for (auto & impl : spec->impls) { + if (impl->need_embd()) { + return true; + } + } + + return false; +} + +bool common_speculative_need_embd_pre_norm(common_speculative * spec) { + if (spec == nullptr) { + return false; + } + + for (auto & impl : spec->impls) { + if (impl->need_embd_pre_norm()) { + return true; + } + } + + return false; +} + void common_speculative_draft(common_speculative * spec) { if (spec == nullptr) { return; @@ -1122,10 +1635,6 @@ void common_speculative_draft(common_speculative * spec) { } void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, uint16_t n_accepted) { - if (n_accepted == 0) { - return; - } - common_speculative_impl * impl = spec->impl_last[seq_id]; GGML_ASSERT(impl); @@ -1137,9 +1646,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u impl->n_acc_tokens += n_accepted; } - impl->accept(seq_id, n_accepted); + impl->accept(seq_id, n_accepted, false); impl->n_call_accept++; } + + // accept with the rest of the implementations, using is_other == true + for (auto & impl_other : spec->impls) { + if (impl_other.get() != impl) { + impl_other->accept(seq_id, n_accepted, true); + } + } } void common_speculative_print_stats(const common_speculative * spec) { @@ -1159,7 +1675,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, impl->n_gen_drafts, diff --git a/common/speculative.h b/common/speculative.h index 51f0b059fa4..deba7dac720 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -20,6 +20,9 @@ enum common_speculative_type common_speculative_type_from_name(const std::string // convert type to string std::string common_speculative_type_to_str(enum common_speculative_type type); +// return the max number of draft tokens based on the speculative parameters +int32_t common_speculative_n_max(const common_params_speculative * spec); + common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq); void common_speculative_free(common_speculative * spec); @@ -53,6 +56,12 @@ void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, co // process the batch and update the internal state of the speculative context bool common_speculative_process(common_speculative * spec, const llama_batch & batch); +// true if any implementation requires target post-norm embeddings to be extracted +bool common_speculative_need_embd(common_speculative * spec); + +// true if any implementation requires target pre-norm embeddings to be extracted +bool common_speculative_need_embd_pre_norm(common_speculative * spec); + // generate drafts for the sequences specified with `common_speculative_get_draft_params` void common_speculative_draft(common_speculative * spec); diff --git a/conversion/__init__.py b/conversion/__init__.py new file mode 100644 index 00000000000..22200574028 --- /dev/null +++ b/conversion/__init__.py @@ -0,0 +1,339 @@ +from __future__ import annotations + +from .base import ( + ModelBase, TextModel, MmprojModel, ModelType, SentencePieceTokenTypes, + logger, _mistral_common_installed, _mistral_import_error_msg, + get_model_architecture, LazyTorchTensor, +) +from typing import Type + + +__all__ = [ + "ModelBase", "TextModel", "MmprojModel", "ModelType", "SentencePieceTokenTypes", + "get_model_architecture", "LazyTorchTensor", "logger", + "_mistral_common_installed", "_mistral_import_error_msg", + "get_model_class", "print_registered_models", "load_all_models", +] + + +TEXT_MODEL_MAP: dict[str, str] = { + "AfmoeForCausalLM": "afmoe", + "ApertusForCausalLM": "llama", + "ArceeForCausalLM": "llama", + "ArcticForCausalLM": "arctic", + "AudioFlamingo3ForConditionalGeneration": "qwen", + "BaiChuanForCausalLM": "baichuan", + "BaichuanForCausalLM": "baichuan", + "BailingMoeForCausalLM": "bailingmoe", + "BailingMoeV2ForCausalLM": "bailingmoe", + "BambaForCausalLM": "granite", + "BertForMaskedLM": "bert", + "BertForSequenceClassification": "bert", + "BertModel": "bert", + "BitnetForCausalLM": "bitnet", + "BloomForCausalLM": "bloom", + "BloomModel": "bloom", + "CamembertModel": "bert", + "ChameleonForCausalLM": "chameleon", + "ChameleonForConditionalGeneration": "chameleon", + "ChatGLMForConditionalGeneration": "chatglm", + "ChatGLMModel": "chatglm", + "CodeShellForCausalLM": "codeshell", + "CogVLMForCausalLM": "cogvlm", + "Cohere2ForCausalLM": "command_r", + "CohereForCausalLM": "command_r", + "DbrxForCausalLM": "dbrx", + "DeciLMForCausalLM": "deci", + "DeepseekForCausalLM": "deepseek", + "DeepseekV2ForCausalLM": "deepseek", + "DeepseekV3ForCausalLM": "deepseek", + "DeepseekV32ForCausalLM": "deepseek", + "DistilBertForMaskedLM": "bert", + "DistilBertForSequenceClassification": "bert", + "DistilBertModel": "bert", + "Dots1ForCausalLM": "dots1", + "DotsOCRForCausalLM": "qwen", + "DreamModel": "dream", + "Ernie4_5ForCausalLM": "ernie", + "Ernie4_5_ForCausalLM": "ernie", + "Ernie4_5_MoeForCausalLM": "ernie", + "EuroBertModel": "bert", + "Exaone4_5_ForConditionalGeneration": "exaone", + "Exaone4ForCausalLM": "exaone", + "ExaoneForCausalLM": "exaone", + "ExaoneMoEForCausalLM": "exaone", + "FalconForCausalLM": "falcon", + "FalconH1ForCausalLM": "falcon_h1", + "FalconMambaForCausalLM": "mamba", + "GPT2LMHeadModel": "gpt2", + "GPTBigCodeForCausalLM": "starcoder", + "GPTNeoXForCausalLM": "gptneox", + "GPTRefactForCausalLM": "refact", + "Gemma2ForCausalLM": "gemma", + "Gemma3ForCausalLM": "gemma", + "Gemma3ForConditionalGeneration": "gemma", + "Gemma3TextModel": "gemma", + "Gemma3nForCausalLM": "gemma", + "Gemma3nForConditionalGeneration": "gemma", + "Gemma4ForConditionalGeneration": "gemma", + "Gemma4ForCausalLM": "gemma", + "GemmaForCausalLM": "gemma", + "Glm4ForCausalLM": "glm", + "Glm4MoeForCausalLM": "glm", + "Glm4MoeLiteForCausalLM": "glm", + "Glm4vForConditionalGeneration": "glm", + "Glm4vMoeForConditionalGeneration": "glm", + "GlmForCausalLM": "chatglm", + "GlmMoeDsaForCausalLM": "glm", + "GlmOcrForConditionalGeneration": "glm", + "GptOssForCausalLM": "gpt_oss", + "GraniteForCausalLM": "granite", + "GraniteMoeForCausalLM": "granite", + "GraniteMoeHybridForCausalLM": "granite", + "GraniteMoeSharedForCausalLM": "granite", + "GraniteSpeechForConditionalGeneration": "granite", + "Grok1ForCausalLM": "grok", + "GrokForCausalLM": "grok", + "GroveMoeForCausalLM": "grovemoe", + "HunYuanDenseV1ForCausalLM": "hunyuan", + "HunYuanMoEV1ForCausalLM": "hunyuan", + "HunYuanVLForConditionalGeneration": "hunyuan", + "IQuestCoderForCausalLM": "llama", + "InternLM2ForCausalLM": "internlm", + "InternLM3ForCausalLM": "internlm", + "JAISLMHeadModel": "jais", + "Jais2ForCausalLM": "jais", + "JambaForCausalLM": "jamba", + "JanusForConditionalGeneration": "januspro", + "JinaBertForMaskedLM": "bert", + "JinaBertModel": "bert", + "JinaEmbeddingsV5Model": "bert", + "KORMoForCausalLM": "qwen", + "KimiK25ForConditionalGeneration": "deepseek", + "KimiLinearForCausalLM": "kimi_linear", + "KimiLinearModel": "kimi_linear", + "KimiVLForConditionalGeneration": "deepseek", + "LFM2ForCausalLM": "lfm2", + "LLaDAMoEModel": "llada", + "LLaDAMoEModelLM": "llada", + "LLaDAModelLM": "llada", + "LLaMAForCausalLM": "llama", + "Lfm25AudioTokenizer": "lfm2", + "Lfm2ForCausalLM": "lfm2", + "Lfm2Model": "lfm2", + "Lfm2MoeForCausalLM": "lfm2", + "Llama4ForCausalLM": "llama", + "Llama4ForConditionalGeneration": "llama", + "LlamaBidirectionalModel": "llama", + "LlamaForCausalLM": "llama", + "LlamaModel": "llama", + "LlavaForConditionalGeneration": "llama", + "LlavaStableLMEpochForCausalLM": "stablelm", + "MPTForCausalLM": "mpt", + "MT5ForConditionalGeneration": "t5", + "MaincoderForCausalLM": "maincoder", + "Mamba2ForCausalLM": "mamba", + "MambaForCausalLM": "mamba", + "MambaLMHeadModel": "mamba", + "MiMoV2FlashForCausalLM": "mimo", + "MiMoV2ForCausalLM": "mimo", + "MiniCPM3ForCausalLM": "minicpm", + "MiniCPMForCausalLM": "minicpm", + "MiniCPMV4_6ForConditionalGeneration": "minicpm", + "MiniMaxM2ForCausalLM": "minimax", + "Ministral3ForCausalLM": "mistral3", + "Mistral3ForConditionalGeneration": "mistral3", + "MistralForCausalLM": "llama", + "MixtralForCausalLM": "llama", + "ModernBertForMaskedLM": "bert", + "ModernBertForSequenceClassification": "bert", + "ModernBertModel": "bert", + "NemotronForCausalLM": "nemotron", + "NemotronHForCausalLM": "nemotron", + "NeoBERT": "bert", + "NeoBERTForSequenceClassification": "bert", + "NeoBERTLMHead": "bert", + "NomicBertModel": "bert", + "OLMoForCausalLM": "olmo", + "Olmo2ForCausalLM": "olmo", + "Olmo3ForCausalLM": "olmo", + "OlmoForCausalLM": "olmo", + "OlmoeForCausalLM": "olmo", + "OpenELMForCausalLM": "openelm", + "OrionForCausalLM": "orion", + "PLMForCausalLM": "plm", + "PLaMo2ForCausalLM": "plamo", + "PLaMo3ForCausalLM": "plamo", + "PaddleOCRVLForConditionalGeneration": "ernie", + "PanguEmbeddedForCausalLM": "pangu", + "Phi3ForCausalLM": "phi", + "Phi4ForCausalLMV": "phi", + "PhiForCausalLM": "phi", + "PhiMoEForCausalLM": "phi", + "Plamo2ForCausalLM": "plamo", + "Plamo3ForCausalLM": "plamo", + "PlamoForCausalLM": "plamo", + "QWenLMHeadModel": "qwen", + "Qwen2AudioForConditionalGeneration": "qwen", + "Qwen2ForCausalLM": "qwen", + "Qwen2Model": "qwen", + "Qwen2MoeForCausalLM": "qwen", + "Qwen2VLForConditionalGeneration": "qwenvl", + "Qwen2VLModel": "qwenvl", + "Qwen2_5OmniModel": "qwenvl", + "Qwen2_5_VLForConditionalGeneration": "qwenvl", + "Qwen3ASRForConditionalGeneration": "qwen3vl", + "Qwen3ForCausalLM": "qwen", + "Qwen3Model": "qwen", + "Qwen3MoeForCausalLM": "qwen", + "Qwen3NextForCausalLM": "qwen", + "Qwen3OmniMoeForConditionalGeneration": "qwen3vl", + "Qwen3VLForConditionalGeneration": "qwen3vl", + "Qwen3VLMoeForConditionalGeneration": "qwen3vl", + "Qwen3_5ForCausalLM": "qwen", + "Qwen3_5ForConditionalGeneration": "qwen", + "Qwen3_5MoeForCausalLM": "qwen", + "Qwen3_5MoeForConditionalGeneration": "qwen", + "RND1": "qwen", + "RWForCausalLM": "falcon", + "RWKV6Qwen2ForCausalLM": "rwkv", + "RWKV7ForCausalLM": "rwkv", + "RobertaForSequenceClassification": "bert", + "RobertaModel": "bert", + "RuGPT3XLForCausalLM": "gpt2", + "Rwkv6ForCausalLM": "rwkv", + "Rwkv7ForCausalLM": "rwkv", + "RwkvHybridForCausalLM": "rwkv", + "Sarashina2VisionForCausalLM": "sarashina2", + "SarvamMoEForCausalLM": "bailingmoe", + "SeedOssForCausalLM": "olmo", + "SmallThinkerForCausalLM": "smallthinker", + "SmolLM3ForCausalLM": "llama", + "SolarOpenForCausalLM": "glm", + "StableLMEpochForCausalLM": "stablelm", + "StableLmForCausalLM": "stablelm", + "Starcoder2ForCausalLM": "starcoder", + "Step3p5ForCausalLM": "step3", + "StepVLForConditionalGeneration": "step3", + "T5EncoderModel": "t5", + "T5ForConditionalGeneration": "t5", + "T5WithLMHeadModel": "t5", + "TalkieForCausalLM": "talkie", + "UMT5ForConditionalGeneration": "t5", + "UMT5Model": "t5", + "UltravoxModel": "ultravox", + "VLlama3ForCausalLM": "llama", + "VoxtralForConditionalGeneration": "llama", + "WavTokenizerDec": "wavtokenizer", + "XLMRobertaForSequenceClassification": "bert", + "XLMRobertaModel": "bert", + "XverseForCausalLM": "xverse", + "YoutuForCausalLM": "deepseek", + "YoutuVLForConditionalGeneration": "deepseek", + "modeling_grove_moe.GroveMoeForCausalLM": "grovemoe", + "modeling_sarvam_moe.SarvamMoEForCausalLM": "bailingmoe", +} + + +MMPROJ_MODEL_MAP: dict[str, str] = { + "AudioFlamingo3ForConditionalGeneration": "ultravox", + "CogVLMForCausalLM": "cogvlm", + "DeepseekOCR2ForCausalLM": "deepseek", + "DeepseekOCRForCausalLM": "deepseek", + "DotsOCRForCausalLM": "dotsocr", + "Exaone4_5_ForConditionalGeneration": "exaone", + "Gemma3ForConditionalGeneration": "gemma", + "Gemma3nForConditionalGeneration": "gemma", + "Gemma4ForConditionalGeneration": "gemma", + "Glm4vForConditionalGeneration": "qwen3vl", + "Glm4vMoeForConditionalGeneration": "qwen3vl", + "GlmOcrForConditionalGeneration": "qwen3vl", + "GlmasrModel": "ultravox", + "GraniteSpeechForConditionalGeneration": "granite", + "HunYuanVLForConditionalGeneration": "hunyuan", + "Idefics3ForConditionalGeneration": "smolvlm", + "InternVisionModel": "internvl", + "JanusForConditionalGeneration": "januspro", + "KimiK25ForConditionalGeneration": "kimivl", + "KimiVLForConditionalGeneration": "kimivl", + "Lfm2AudioForConditionalGeneration": "lfm2", + "Lfm2VlForConditionalGeneration": "lfm2", + "LightOnOCRForConditionalGeneration": "lighton_ocr", + "Llama4ForConditionalGeneration": "llama4", + "LlavaForConditionalGeneration": "llava", + "MERaLiON2ForConditionalGeneration": "ultravox", + "MiMoV2ForCausalLM": "mimo", + "MiniCPMV4_6ForConditionalGeneration": "minicpm", + "Mistral3ForConditionalGeneration": "llava", + "NemotronH_Nano_VL_V2": "nemotron", + "PaddleOCRVisionModel": "ernie", + "Phi4ForCausalLMV": "phi", + "Qwen2AudioForConditionalGeneration": "ultravox", + "Qwen2VLForConditionalGeneration": "qwenvl", + "Qwen2VLModel": "qwenvl", + "Qwen2_5OmniModel": "qwenvl", + "Qwen2_5_VLForConditionalGeneration": "qwenvl", + "Qwen3ASRForConditionalGeneration": "qwen3vl", + "Qwen3OmniMoeForConditionalGeneration": "qwen3vl", + "Qwen3VLForConditionalGeneration": "qwen3vl", + "Qwen3VLMoeForConditionalGeneration": "qwen3vl", + "Qwen3_5ForConditionalGeneration": "qwen3vl", + "Qwen3_5MoeForConditionalGeneration": "qwen3vl", + "RADIOModel": "nemotron", + "Sarashina2VisionForCausalLM": "sarashina2", + "SmolVLMForConditionalGeneration": "smolvlm", + "StepVLForConditionalGeneration": "step3", + "UltravoxModel": "ultravox", + "VoxtralForConditionalGeneration": "ultravox", + "YoutuVLForConditionalGeneration": "youtuvl", +} + + +_TEXT_MODEL_MODULES = sorted(set(TEXT_MODEL_MAP.values())) +_MMPROJ_MODEL_MODULES = sorted(set(MMPROJ_MODEL_MAP.values())) + + +_loaded_text_modules: set[str] = set() +_loaded_mmproj_modules: set[str] = set() + + +def load_all_models() -> None: + """Import all model modules to trigger @ModelBase.register() decorators.""" + if len(_loaded_text_modules) != len(_TEXT_MODEL_MODULES): + for module_name in _TEXT_MODEL_MODULES: + if module_name not in _loaded_text_modules: + try: + __import__(f"conversion.{module_name}") + _loaded_text_modules.add(module_name) + except Exception as e: + logger.warning(f"Failed to load model module {module_name}: {e}") + + if len(_loaded_mmproj_modules) != len(_MMPROJ_MODEL_MODULES): + for module_name in _MMPROJ_MODEL_MODULES: + if module_name not in _loaded_mmproj_modules: + try: + __import__(f"conversion.{module_name}") + _loaded_mmproj_modules.add(module_name) + except Exception as e: + logger.warning(f"Failed to load model module {module_name}: {e}") + + +def get_model_class(name: str, mmproj: bool = False) -> Type[ModelBase]: + """Dynamically import and return a model class by its HuggingFace architecture name.""" + relevant_map = MMPROJ_MODEL_MAP if mmproj else TEXT_MODEL_MAP + if name not in relevant_map: + raise NotImplementedError(f"Architecture {name!r} not supported!") + module_name = relevant_map[name] + __import__(f"conversion.{module_name}") + model_type = ModelType.MMPROJ if mmproj else ModelType.TEXT + return ModelBase._model_classes[model_type][name] + + +def print_registered_models() -> None: + load_all_models() + logger.error("TEXT models:") + for name in sorted(TEXT_MODEL_MAP.keys()): + logger.error(f" - {name}") + logger.error("MMPROJ models:") + for name in sorted(MMPROJ_MODEL_MAP.keys()): + logger.error(f" - {name}") diff --git a/conversion/afmoe.py b/conversion/afmoe.py new file mode 100644 index 00000000000..5e66a51da61 --- /dev/null +++ b/conversion/afmoe.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llama import LlamaModel + + +@ModelBase.register("AfmoeForCausalLM") +class AfmoeModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.AFMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # MoE parameters + if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None: + self.gguf_writer.add_expert_shared_count(n_shared_experts) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None: + self.gguf_writer.add_leading_dense_block_count(n_dense_layers) + + # Route normalization and scaling + if (route_norm := self.hparams.get("route_norm")) is not None: + self.gguf_writer.add_expert_weights_norm(route_norm) + if (route_scale := self.hparams.get("route_scale")) is not None: + self.gguf_writer.add_expert_weights_scale(route_scale) + + # Sliding window attention + if (sliding_window := self.hparams.get("sliding_window")) is not None: + self.gguf_writer.add_sliding_window(sliding_window) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle expert weights - they're already merged in the HF format + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["gate_proj", "up_proj", "down_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename_to_retrieve]) + del self._experts[bid][ename_to_retrieve] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) + + return + else: + return + + yield from ModelBase.modify_tensors(self, data_torch, name, bid) diff --git a/conversion/arctic.py b/conversion/arctic.py new file mode 100644 index 00000000000..775cacaab9f --- /dev/null +++ b/conversion/arctic.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +import json +import sys + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + +from .llama import LlamaModel + + +@ModelBase.register("ArcticForCausalLM") +class ArcticModel(TextModel): + model_arch = gguf.MODEL_ARCH.ARCTIC + + def set_vocab(self): + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + # Read the whole vocabulary from the tokenizer.model file + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + + if "added_tokens_decoder" in tokenizer_config_json: + added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] + for token_id, token_json in added_tokens_decoder.items(): + token_id = int(token_id) + if token_id >= vocab_size: + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + token_content = token_json["content"] + token_type = SentencePieceTokenTypes.USER_DEFINED + token_score = -10000.0 + + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model + if ("special" in token_json) and token_json["special"]: + if token_content == tokenizer_config_json["unk_token"]: + token_type = SentencePieceTokenTypes.UNKNOWN + else: + token_type = SentencePieceTokenTypes.CONTROL + token_score = 0.0 + + logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") + tokens[token_id] = token_content.encode("utf-8") + toktypes[token_id] = token_type + scores[token_id] = token_score + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/baichuan.py b/conversion/baichuan.py new file mode 100644 index 00000000000..4cf34057cd9 --- /dev/null +++ b/conversion/baichuan.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(TextModel): + model_arch = gguf.MODEL_ARCH.BAICHUAN + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": + logger.info(f"Unpacking and permuting layer {bid}") + yield from [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2)), + ] + else: + yield from self.modify_tensors(data_torch, self.map_tensor_name(name), bid) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] diff --git a/conversion/bailingmoe.py b/conversion/bailingmoe.py new file mode 100644 index 00000000000..319ff6dabee --- /dev/null +++ b/conversion/bailingmoe.py @@ -0,0 +1,216 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("BailingMoeForCausalLM") +class BailingMoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.BAILINGMOE + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + n_embd = self.hparams["hidden_size"] + if (head_dim := self.hparams.get("head_dim")) is None: + head_dim = n_embd // n_head + + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + + if name.endswith("attention.dense.weight"): + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid) + return + elif name.endswith("query_key_value.weight"): + q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) + + yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return + elif name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield from super().modify_tensors(data_torch, new_name, bid) + + return + + new_name = self.map_tensor_name(name) + + if new_name == output_name and self.hparams.get("norm_head"): + data_torch = data_torch.float() + data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 + + yield from super().modify_tensors(data_torch, new_name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("BailingMoeV2ForCausalLM") +class BailingMoeV2Model(TextModel): + model_arch = gguf.MODEL_ARCH.BAILINGMOE2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0): + self.block_count = self.hparams["num_hidden_layers"] + nextn_layers + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"])) + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(nextn_layers) + + _experts: list[dict[str, Tensor]] | None = None + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "mlp.experts" in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM") +class SarvamMoEModel(BailingMoeV2Model): + model_arch = gguf.MODEL_ARCH.BAILINGMOE2 + # Sarvam-MoE shares the BailingMoeV2 architecture; only differences: + # - full rotary (no partial_rotary_factor) + # - expert bias is zero-mean normalized at load time + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if name.endswith(".expert_bias"): + # Sarvam normalizes expert bias to zero mean + inner = gen + + def gen(): + t = inner() + return t - t.mean() + return super().filter_tensors((name, gen)) diff --git a/conversion/base.py b/conversion/base.py new file mode 100644 index 00000000000..55682c827d8 --- /dev/null +++ b/conversion/base.py @@ -0,0 +1,2606 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import ast +import logging +import contextlib +import json +import os +import re +import sys +from enum import IntEnum +from pathlib import Path +from hashlib import sha256 +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from itertools import chain +from transformers import AutoConfig + +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py')) +import gguf +from gguf.vocab import MistralTokenizerType, MistralVocab + +try: + from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import] + SentencePieceTokenizer, + ) + + _mistral_common_installed = True + _mistral_import_error_msg = "" +except ImportError: + _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) + _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + + _mistral_common_installed = False + TokenizerVersion: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None + _mistral_import_error_msg = ( + "Mistral format requires `mistral-common` to be installed. Please run " + "`pip install mistral-common[image,audio]` to install it." + ) + + +logger = logging.getLogger("hf-to-gguf") + + +AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") + + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class ModelType(IntEnum): + TEXT = 1 + MMPROJ = 2 + + +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { + ModelType.TEXT: {}, + ModelType.MMPROJ: {}, + } + + dir_model: Path + ftype: gguf.LlamaFileType + fname_out: Path + is_big_endian: bool + endianess: gguf.GGUFEndian + use_temp_file: bool + lazy: bool + dry_run: bool + hparams: dict[str, Any] + model_tensors: dict[str, Callable[[], Tensor]] + gguf_writer: gguf.GGUFWriter + model_name: str | None + metadata_override: Path | None + metadata: gguf.Metadata + dir_model_card: Path + remote_hf_model_id: str | None + + # subclasses should define this! + model_arch: gguf.MODEL_ARCH + + # subclasses should initialize this! + block_count: int + tensor_map: gguf.TensorNameMap + + # Mistral format specifics + is_mistral_format: bool = False + disable_mistral_community_chat_template: bool = False + sentence_transformers_dense_modules: bool = False + + # MTP (multi-token prediction) export modes; set by main() before instantiation. + # Architectures opt in by overriding the handling (see _Qwen35MtpMixin). + mtp_only: bool = False + no_mtp: bool = False + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, + use_temp_file: bool = False, eager: bool = False, + metadata_override: Path | None = None, model_name: str | None = None, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, + disable_mistral_community_chat_template: bool = False, + sentence_transformers_dense_modules: bool = False, + fuse_gate_up_exps: bool = False, + fp8_as_q8: bool = False): + if type(self) is ModelBase or \ + type(self) is TextModel or \ + type(self) is MmprojModel: + raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + + if self.is_mistral_format and not _mistral_common_installed: + raise ImportError(_mistral_import_error_msg) + + self.dir_model = dir_model + self.ftype = ftype + self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.use_temp_file = use_temp_file + self.lazy = not eager or (remote_hf_model_id is not None) + self.dry_run = dry_run + self.remote_hf_model_id = remote_hf_model_id + self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.fuse_gate_up_exps = fuse_gate_up_exps + self._gate_exp_buffer: dict[int, Tensor] = {} + self._up_exp_buffer: dict[int, Tensor] = {} + self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams + self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) + self.metadata_override = metadata_override + self.model_name = model_name + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + self._is_nvfp4 = False + self._is_mxfp4 = False + self._fp8_as_q8 = fp8_as_q8 + self._fp8_dequantized: set[str] = set() + + # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype + # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. + if self.ftype == gguf.LlamaFileType.GUESSED: + for _, tensor in self.get_tensors(): + if tensor.dim() < 2: + continue + + if tensor.dtype == torch.bfloat16: + self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16") + break + elif tensor.dtype == torch.float16: + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + logger.info("heuristics detected float16 tensor dtype, setting --outtype f16") + break + else: + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16") + + # Configure GGUF Writer + self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) + + # Mistral specific + self.disable_mistral_community_chat_template = disable_mistral_community_chat_template + + @classmethod + def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors: dict[str, Callable[[], Tensor]] = {} + + if remote_hf_model_id is not None: + is_safetensors = True + + logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) + for name, remote_tensor in remote_tensors.items(): + data_gen = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r) # noqa: E731 + if titem := self.filter_tensors((name, data_gen)): + tname, tgen = titem + tensors[tname] = tgen + + return tensors + + prefix = "model" if not self.is_mistral_format else "consolidated" + part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") + is_safetensors: bool = len(part_names) > 0 + if not is_safetensors: + part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + + tensor_names_from_index: set[str] = set() + tensor_names_from_parts: set[str] = set() + + if not self.is_mistral_format: + index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): + logger.info(f"gguf: loading model weight map from '{index_name}'") + with open(index_file, "r", encoding="utf-8") as f: + index: dict[str, Any] = json.load(f) + weight_map = index.get("weight_map") + if weight_map is None or not isinstance(weight_map, dict): + raise ValueError(f"Can't load 'weight_map' from {index_name!r}") + tensor_names_from_index.update(weight_map.keys()) + part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment] + part_names = sorted(part_dict.keys()) + else: + weight_map = {} + else: + weight_map = {} + + for part_name in part_names: + logger.info(f"gguf: indexing model part '{part_name}'") + ctx: ContextManager[Any] + if is_safetensors: + ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name)) + else: + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + + with ctx as model_part: + assert model_part is not None + + for name in model_part.keys(): + tensor_names_from_parts.add(name) + if is_safetensors: + data: gguf.utility.LocalTensor = model_part[name] + if self.lazy: + data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731 + else: + dtype = LazyTorchTensor._dtype_str_map[data.dtype] + data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731 + else: + data_torch: Tensor = model_part[name] + if self.lazy: + data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731 + else: + data_gen = lambda data=data_torch: data # noqa: E731 + if titem := self.filter_tensors((name, data_gen)): + tname, tgen = titem + tensors[tname] = tgen + + # verify tensor name presence and identify potentially missing files + if len(tensor_names_from_index) > 0: + if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0: + missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index)) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError(f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}") + else: + raise ValueError("Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}") + + return tensors + + @staticmethod + def _scale_is_trivial(scale: Tensor) -> bool: + return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6 + + def _write_scale_tensor(self, scale_name: str, scale: Tensor): + if not self._scale_is_trivial(scale): + scale_f32 = scale.float().numpy().flatten() + logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])") + self.gguf_writer.add_tensor(scale_name, scale_f32) + + def _write_scales_tensor(self, scale_name: str, scales: list[float]): + if not np.allclose(scales, 1.0, atol=1e-6): + scale_vals = np.array(scales, dtype=np.float32) + logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])") + self.gguf_writer.add_tensor(scale_name, scale_vals) + + def dequant_model(self): + # If all quantized tensors were already handled (e.g. pure NVFP4), skip + if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors): + return + + tensors_to_remove: list[str] = [] + new_tensors: dict[str, Callable[[], Tensor]] = {} + + if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict): + quant_method = quant_config.get("quant_method") + + def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor: + weight = weight.view(torch.uint8) + orig_shape = weight.shape + + shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape))))) + data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift + data = data & 3 + data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:])) + + # The scale is inverted + return data / scale.float() + + def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor: + scale = scale.float() + + if block_size is not None: + dim_offset = scale.ndim - len(block_size) + for i, size in enumerate(block_size): + scale = scale.repeat_interleave(size, dim_offset + i) + # unpad the scale (e.g. when the tensor size isn't a multiple of the block size) + scale = scale[tuple(slice(0, size) for size in weight.shape)] + + # align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1]) + while scale.ndim < weight.ndim: + scale = scale.unsqueeze(-1) + + return weight.float() * scale + + # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476 + def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor: + bits = quant_config["bits"] + assert bits in (2, 3, 4, 8) + assert qweight.dtype == qzeros.dtype + maxq = (2 ** bits) - 1 + weight = None + zeros = None + pack_dtype_bits = qweight.dtype.itemsize * 8 + + if bits in [2, 4, 8]: + pack_factor = pack_dtype_bits // bits + wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0) + if self.lazy: + wf = LazyTorchTensor.from_eager(wf) + + zeros = torch.bitwise_right_shift( + qzeros.unsqueeze(2).expand(-1, -1, pack_factor), + wf.unsqueeze(0) + ).to(torch.int16 if bits == 8 else torch.int8) + zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape) + + weight = torch.bitwise_and( + torch.bitwise_right_shift( + qweight.unsqueeze(1).expand(-1, pack_factor, -1), + wf.unsqueeze(-1) + ).to(torch.int16 if bits == 8 else torch.int8), + maxq + ) + elif bits == 3: + raise NotImplementedError("3-bit gptq dequantization is not yet implemented") + + assert weight is not None + assert zeros is not None + + weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) + + # gptq_v2 doesn't need to offset zeros + if quant_config.get("checkpoint_format", "gptq") == "gptq": + zeros += 1 + + return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T + + def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int): + assert w.dtype == torch.int32 + shape = tuple(shape_tensor.tolist()) + assert len(shape) == 2 + mask = (1 << num_bits) - 1 + + shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32) + if self.lazy: + shifts = LazyTorchTensor.from_eager(shifts) + + if zero_point is None: + offset = 1 << (num_bits - 1) + else: + assert len(zero_point.shape) == 2 + offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask + offset = offset.reshape(-1, zero_point.shape[1]) + # trim padding, and prepare for broadcast + # NOTE: the zero-point is packed along dim 0 + offset = offset[:shape[0], :].unsqueeze(-1) + + # extract values + # NOTE: the weights are packed along dim 1 + unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask + unpacked = unpacked.reshape(shape[0], -1) + + # trim padding + unpacked = unpacked[:, :shape[1]] + + # prepare for broadcast of the scale + unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size) + unpacked = unpacked - offset + + return (unpacked * scale.unsqueeze(-1).float()).reshape(shape) + + if quant_method == "bitnet": + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s()) + tensors_to_remove.append(name) + elif quant_method == "fp8": + block_size = quant_config.get("weight_block_size") + for name in self.model_tensors.keys(): + if name.endswith("_scale_inv"): + weight_name = name.removesuffix("_scale_inv") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) + tensors_to_remove.append(name) + if self._fp8_as_q8: + self._fp8_dequantized.add(weight_name) + if name.endswith(".activation_scale"): # unused + tensors_to_remove.append(name) + if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused + tensors_to_remove.append(name) + # mistral format + if name.endswith(".qscale_weight"): + weight_name = name.removesuffix("qscale_weight") + "weight" + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) + tensors_to_remove.append(name) + if self._fp8_as_q8: + self._fp8_dequantized.add(weight_name) + if name.endswith(".qscale_act"): + tensors_to_remove.append(name) + elif quant_method == "gptq": + for name in self.model_tensors.keys(): + if name.endswith(".qweight"): + base_name = name.removesuffix(".qweight") + g_idx = self.model_tensors[base_name + ".g_idx"] + qweight = self.model_tensors[base_name + ".qweight"] + qzeros = self.model_tensors[base_name + ".qzeros"] + scales = self.model_tensors[base_name + ".scales"] + new_tensors[base_name + ".weight"] = ( + lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq( + g(), w(), z(), s() + ) + ) + tensors_to_remove += [ + base_name + n + for n in ( + ".g_idx", + ".qzeros", + ".qweight", + ".scales", + ) + ] + elif quant_method == "compressed-tensors": + quant_format = quant_config["format"] + groups = quant_config["config_groups"] + nvfp4_compressed_tensors = ( + quant_format == "nvfp4-pack-quantized" + or quant_format == "mixed-precision" + and bool(groups) + and all(g.get("format") == "nvfp4-pack-quantized" for g in groups.values() if isinstance(g, dict)) + ) + + if len(groups) > 1 and not nvfp4_compressed_tensors: + raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet") + weight_config = tuple(groups.values())[0]["weights"] + + if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized": + block_size = weight_config.get("block_structure", None) + strategy = weight_config.get("strategy") + assert strategy == "channel" or strategy == "block" + assert weight_config.get("group_size") is None # didn't find a model using this yet + is_fp8 = ( + quant_format == "float-quantized" + and weight_config.get("type") == "float" + and weight_config.get("num_bits") == 8 + ) + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size) + tensors_to_remove.append(name) + if self._fp8_as_q8 and is_fp8: + self._fp8_dequantized.add(weight_name) + elif quant_format == "pack-quantized": + assert weight_config.get("strategy") == "group" + assert weight_config.get("type", "int") == "int" + num_bits = weight_config.get("num_bits") + group_size = weight_config.get("group_size") + assert isinstance(num_bits, int) + assert isinstance(group_size, int) + for name in self.model_tensors.keys(): + if name.endswith(".weight_packed"): + base_name = name.removesuffix("_packed") + w = self.model_tensors[name] + scale = self.model_tensors[base_name + "_scale"] + shape = self.model_tensors[base_name + "_shape"] + zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None) + new_tensors[base_name] = ( + lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed( + w(), scale(), shape(), zero_point(), num_bits, group_size, + ) + ) + tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")] + if (base_name + "_zero_point") in self.model_tensors: + tensors_to_remove.append(base_name + "_zero_point") + elif nvfp4_compressed_tensors: + # Don't error from compressed-tensors, we'll handle them in _generate_nvfp4_tensors + pass + else: + raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported") + elif quant_method == "modelopt": + # Mixed-precision ModelOpt models: NVFP4 tensors are handled by + # _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and + # are dequantized here. k/v scale tensors are unused. + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + if weight_name not in self.model_tensors: + tensors_to_remove.append(name) + continue + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + is_fp8_weight = False + if self._fp8_as_q8: + is_fp8_weight = w().dtype in (torch.float8_e4m3fn, torch.float8_e5m2) + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None) + tensors_to_remove.append(name) + if is_fp8_weight: + self._fp8_dequantized.add(weight_name) + if name.endswith((".input_scale", ".k_scale", ".v_scale")): + tensors_to_remove.append(name) + elif quant_method is not None: + raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") + + for name in tensors_to_remove: + if name in self.model_tensors: + del self.model_tensors[name] + + for name, value in new_tensors.items(): + self.model_tensors[name] = value + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + if "language_model." in name: + name = name.replace("language_model.", "") + + return name, gen + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, gen in self.model_tensors.items(): + yield name, gen() + + def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") + name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in name: + assert bid is not None + name = name.format(bid=bid) + return name + suffix + + def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + return False + key_name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in key_name: + if bid is None: + return False + key_name = key_name.format(bid=bid) + else: + if bid is not None: + return False + return name == (key_name + suffix) + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is None: + raise ValueError(f"Can not map tensor {name!r}") + return new_name + + def set_gguf_parameters(self): + raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + # Handle gate/up expert tensor fusion if enabled + if self.fuse_gate_up_exps and bid is not None: + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid): + self._gate_exp_buffer[bid] = data_torch + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + self._up_exp_buffer[bid] = data_torch + + # Check if both gate and up are buffered for this layer + if bid in self._gate_exp_buffer and bid in self._up_exp_buffer: + gate_data = self._gate_exp_buffer.pop(bid) + up_data = self._up_exp_buffer.pop(bid) + # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd) + fused_data = torch.cat([gate_data, up_data], dim=1) + fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid) + logger.info(f"Fused gate_exps and up_exps for layer {bid}") + return [(fused_name, fused_data)] + + # If we buffered a gate/up tensor, wait for the other + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \ + self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + return [] + + return [(new_name, data_torch)] + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del new_name, bid # unused + # Force FP8-original tensors to Q8_0 when requested; Q8_0 is faster than F16/BF16. + if self._fp8_as_q8 and name in self._fp8_dequantized and n_dims >= 2: + return gguf.GGMLQuantizationType.Q8_0 + return False + + # some models need extra generated tensors (like rope_freqs) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + return () + + @staticmethod + def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]: + """Repack NVFP4 ModelOpt tensors into ggml super-block layout. + Preserves original E4M3 scale bits as UE4M3 (strip sign bit). + The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul(). + Returns (raw_data, logical_shape).""" + + out_features = weight.shape[0] + n_blocks = scale.shape[1] + + # Unpack ModelOpt nibble-packed weights + w = weight.reshape(out_features, n_blocks, 8) + vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16) + + # Preserve original E4M3 scale bits as UE4M3 (strip sign bit) + d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F + qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy() + + # Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements + n_super = n_blocks // 4 + d_grouped = d_ue.reshape(out_features, n_super, 4) + qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32) + raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36) + return raw, [out_features, n_super * 64] + + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): + new_name = self.map_tensor_name(name) + + raw, shape = self._nvfp4_pack(weight, scale) + logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4") + self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4) + + self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2) + self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale) + + def _generate_nvfp4_tensors(self): + # Per-layer expert merging to avoid holding all experts in memory + expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {} + expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} + expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} + expert_shapes: dict[tuple[int, str], list[int]] = {} + n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 + consumed: list[str] = [] + + for name in self.model_tensors.keys(): + if not name.endswith(".weight"): + continue + scale_name = name.replace(".weight", ".weight_scale") + scale2_name = name.replace(".weight", ".weight_scale_2") + input_scale_name = name.replace(".weight", ".input_scale") + if scale_name not in self.model_tensors: + continue + # Force eager materialization of lazy tensors + weight = LazyTorchTensor.to_eager(self.model_tensors[name]()) + scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]()) + + # Skip non-NVFP4 tensors (e.g. FP8 with per-channel 1D scales) + if scale.ndim < 2: + continue + + scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))()) + input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))()) + + # Mark tensors for removal from model_tensors (already written to gguf) + consumed.extend([name, scale_name]) + if scale2_name in self.model_tensors: + consumed.append(scale2_name) + if input_scale_name in self.model_tensors: + consumed.append(input_scale_name) + + # Check if this is a per-expert tensor + m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name) + if m: + expert_id = int(m.group(1)) + proj_type = m.group(2) + bid_m = re.search(r'\.layers\.(\d+)\.', name) + bid = int(bid_m.group(1)) if bid_m else 0 + key = (bid, proj_type) + + raw, shape = self._nvfp4_pack(weight, scale) + + if key not in expert_blocks: + expert_blocks[key] = [] + expert_scales[key] = [] + expert_input_scales[key] = [] + expert_shapes[key] = shape + expert_blocks[key].append((expert_id, raw.copy())) + # Collect per-expert scale2 (scalar per expert) + expert_scales[key].append((expert_id, float(scale2.float().sum()))) + # Collect per-expert input_scale (scalar per expert) + expert_input_scales[key].append((expert_id, float(input_scale.float().sum()))) + + # Flush when all experts for this (layer, proj) are collected + if n_experts > 0 and len(expert_blocks[key]) >= n_experts: + self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) + else: + self._repack_nvfp4(name, weight, scale, scale2, input_scale) + + # Flush any remaining experts (fallback if n_experts was unknown) + for bid, proj_type in list(expert_blocks.keys()): + self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) + + # Remove consumed tensors so get_tensors/modify_tensors won't see them + for name in consumed: + self.model_tensors.pop(name, None) + + # Remove any remaining unused auxiliary tensors + for name in list(self.model_tensors.keys()): + if name.endswith((".k_scale", ".v_scale")): + del self.model_tensors[name] + + def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type): + experts = expert_blocks.pop(key) + scales = expert_scales.pop(key) + input_scales = expert_input_scales.pop(key) + shape = expert_shapes.pop(key) + + experts.sort(key=lambda x: x[0]) + merged = np.stack([e[1] for e in experts], axis=0) + merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight" + new_name = self.map_tensor_name(merged_name) + logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4") + self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4) + + scales.sort(key=lambda x: x[0]) + self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales]) + + input_scales.sort(key=lambda x: x[0]) + self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales]) + + del experts, merged + + def prepare_tensors(self): + # detect NVFP4 quantization (ModelOpt and Compressed-tensors formats) + quantization_config = self.hparams.get("quantization_config") or {} + quant_algo = quantization_config.get("quant_algo") + quant_method = quantization_config.get("quant_method") + quant_format = quantization_config.get("format") + quant_groups = quantization_config.get("config_groups") or {} + quant_layers = quantization_config.get("quantized_layers") or {} + quant_config_file = self.dir_model / "hf_quant_config.json" + + if (not quant_algo or not quant_layers) and quant_config_file.is_file(): + with open(quant_config_file, "r", encoding="utf-8") as f: + hf_quant_config = json.load(f) + quant_config = hf_quant_config.get("quantization") or {} + producer = hf_quant_config.get("producer") or {} + producer_name = (producer.get("name") or "").lower() + if quant_method is None: + self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name + quant_method = producer_name + quant_algo = quant_config.get("quant_algo", quant_algo) + quant_method = quant_config.get("quant_method", quant_method) + quant_format = quant_config.get("format", quant_format) + quant_groups = quant_config.get("config_groups", quant_groups) or {} + quant_layers = quant_config.get("quantized_layers", quant_layers) or {} + + # Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with + # per-layer NVFP4/FP8) instead of a single global "NVFP4" value. + nvfp4_compressed_tensors = quant_method == "compressed-tensors" and ( + quant_format == "nvfp4-pack-quantized" + or quant_format == "mixed-precision" + and bool(quant_groups) + and all(g.get("format") == "nvfp4-pack-quantized" for g in quant_groups.values() if isinstance(g, dict)) + ) + if quant_algo != "NVFP4": + if nvfp4_compressed_tensors: + quant_algo = "NVFP4" + elif any(str(v.get("quant_algo")).endswith("NVFP4") for v in quant_layers.values() if isinstance(v, dict)): + quant_algo = "NVFP4" + + self._is_nvfp4 = quant_algo == "NVFP4" + self._is_mxfp4 = quant_method == "mxfp4" + + # NVFP4 weights are repacked and written directly to gguf_writer. + # This must run before dequant_model so NVFP4 tensors are removed + # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant. + if self._is_nvfp4: + if nvfp4_compressed_tensors: + # Convert compressed-tensors 'global' scales into the reciprocal + def inverse_scale(gen): + def load(): + scale = LazyTorchTensor.to_eager(gen()).float() + return 1.0 / scale + return load + + # Change the compressed-tensors names to the ModelOpt names for handling consistently later + for name in list(self.model_tensors.keys()): + if name.endswith(".weight_packed"): + weight_name = name.removesuffix("_packed") + if weight_name not in self.model_tensors: + self.model_tensors[weight_name] = self.model_tensors.pop(name) + elif name.endswith(".weight_global_scale"): + scale2_name = name.replace(".weight_global_scale", ".weight_scale_2") + if scale2_name not in self.model_tensors: + self.model_tensors[scale2_name] = inverse_scale(self.model_tensors.pop(name)) + elif name.endswith(".input_global_scale"): + input_scale_name = name.replace(".input_global_scale", ".input_scale") + if input_scale_name not in self.model_tensors: + self.model_tensors[input_scale_name] = inverse_scale(self.model_tensors.pop(name)) + self._generate_nvfp4_tensors() + + self.dequant_model() + + # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) + if self.tensor_map.mapping: + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + else: + max_name_len = len("vision_encoder.weight,") # Default reasonable length + + for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # use the first number-like part of the tensor name as the block id + bid = None + for part in name.split("."): + if part.isdecimal(): + bid = int(part) + break + + for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): + # TODO: why do we squeeze here? + # data = data_torch.squeeze().numpy() + data = data_torch.numpy() + + n_dims = len(data.shape) + data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) + + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors + if n_dims <= 1 or new_name.endswith("_norm.weight"): + data_qtype = gguf.GGMLQuantizationType.F32 + + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 + if data_qtype is False and ( + any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.SHORTCONV_CONV, + gguf.MODEL_TENSOR.TIME_MIX_FIRST, + gguf.MODEL_TENSOR.TIME_MIX_W1, + gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, + gguf.MODEL_TENSOR.V_ENC_EMBD_POS, + gguf.MODEL_TENSOR.A_ENC_EMBD_POS, + gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, + gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, + # Kimi KDA conv weights should be F32 + gguf.MODEL_TENSOR.SSM_CONV1D_Q, + gguf.MODEL_TENSOR.SSM_CONV1D_K, + gguf.MODEL_TENSOR.SSM_CONV1D_V, + # DSA indexer weights should be F32 + gguf.MODEL_TENSOR.INDEXER_PROJ, + ) + ) + or new_name[-7:] not in (".weight", ".lora_a", ".lora_b") + ): + data_qtype = gguf.GGMLQuantizationType.F32 + + if data_qtype is False and any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, + gguf.MODEL_TENSOR.OUTPUT, + gguf.MODEL_TENSOR.ALTUP_ROUTER, + gguf.MODEL_TENSOR.LAUREL_L, + gguf.MODEL_TENSOR.LAUREL_R, + ) + ): + if self.ftype in ( + gguf.LlamaFileType.MOSTLY_TQ1_0, + gguf.LlamaFileType.MOSTLY_TQ2_0, + ): + # TODO: use Q4_K and Q6_K + data_qtype = gguf.GGMLQuantizationType.F16 + + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) + if isinstance(data_qtype, bool): + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: + data_qtype = gguf.GGMLQuantizationType.TQ1_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: + data_qtype = gguf.GGMLQuantizationType.TQ2_0 + else: + raise ValueError(f"Unknown file type: {self.ftype.name}") + + try: + data = gguf.quants.quantize(data, data_qtype) + except gguf.QuantError as e: + logger.warning("%s, %s", e, "falling back to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + + # n_dims is implicit in the shape + logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self, vocab_only: bool): + + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + # If we are using HF model id, set the metadata name to the model id + if self.remote_hf_model_id: + self.metadata.name = self.remote_hf_model_id + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): + if self._is_nvfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 + elif self._is_mxfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def write_vocab(self): + raise NotImplementedError("write_vocab() must be implemented in subclasses") + + def write(self): + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @staticmethod + def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.startswith(prefix) and filename.endswith(suffix): + part_names.append(filename) + + part_names.sort() + + return part_names + + @staticmethod + def load_hparams(dir_model: Path, is_mistral_format: bool): + if is_mistral_format: + with open(dir_model / "params.json", "r", encoding="utf-8") as f: + config = json.load(f) + return config + + try: + # for security reason, we don't allow loading remote code by default + # if a model need remote code, we will fallback to config.json + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() + except Exception as e: + logger.warning(f"Failed to load model config from {dir_model}: {e}") + logger.warning("Trying to load config.json instead") + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if "llm_config" in config: + # rename for InternVL + config["text_config"] = config["llm_config"] + if "lm_config" in config: + # rename for GlmASR + config["text_config"] = config["lm_config"] + if "thinker_config" in config: + # rename for Qwen2.5-Omni + config["text_config"] = config["thinker_config"]["text_config"] + if "language_config" in config: + # rename for DeepSeekOCR + config["text_config"] = config["language_config"] + if "lfm" in config: + # rename for LFM2-Audio + config["text_config"] = config["lfm"] + return config + + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + + def func(modelcls: AnyModel) -> AnyModel: + model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT + for name in names: + cls._model_classes[model_type][name] = modelcls + return modelcls + return func + + @classmethod + def print_registered_models(cls): + for model_type, model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") + + @classmethod + def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: + try: + return cls._model_classes[model_type][arch] + except KeyError: + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + + +class TextModel(ModelBase): + model_type = ModelType.TEXT + hf_arch: str + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if not self.is_mistral_format: + self.hf_arch = get_model_architecture(self.hparams, self.model_type) + else: + self.hf_arch = "" + + if "text_config" in self.hparams: + # move the text_config to the root level + self.hparams = {**self.hparams, **self.hparams["text_config"]} + + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} + + rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True) + local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True) + + # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters + if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: + if local_rope_theta is not None: + self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} + if "rope_theta" not in self.rope_parameters and rope_theta is not None: + self.rope_parameters["rope_theta"] = rope_theta + if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: + self.rope_parameters["rope_type"] = rope_type + + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip multimodal tensors + if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \ + or "visual." in name or "vision." in name or "audio." in name or "talker." in name \ + or "vision_" in name or "audio_" in name \ + or "token2wav." in name or "code2wav." in name \ + or "projector." in name or "pre_mm_projector_norm" in name \ + or "image_newline" in name or "view_seperator" in name \ + or "patch_embed" in name or "patch_embedding" in name \ + or "patch_merger." in name or "model.connector." in name: + return None + + return super().filter_tensors(item) + + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + logger.info("Set model tokenizer") + self.set_vocab() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if self.hparams.get("is_causal") is False: + self.gguf_writer.add_causal_attention(False) + logger.info("gguf: causal attention = False") + + # TODO: Handle "sliding_attention" similarly when models start implementing it + rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) + if (rope_type := rope_params.get("rope_type")) is not None: + rope_factor = rope_params.get("factor") + rope_gguf_type = gguf.RopeScalingType.NONE + if rope_type == "linear" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.LINEAR + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + elif rope_type == "yarn" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.YARN + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) + if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None: + self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor) + if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None: + self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor) + if (yarn_beta_fast := rope_params.get("beta_fast")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast) + if (yarn_beta_slow := rope_params.get("beta_slow")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow) + # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) + elif rope_type == "su" or rope_type == "longrope": + rope_gguf_type = gguf.RopeScalingType.LONGROPE + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + elif rope_type == "dynamic": + # HunYuan, handled in model class + pass + elif rope_type.lower() == "llama3": + # Handled in generate_extra_tensors + pass + else: + logger.warning(f"Unknown RoPE type: {rope_type}") + logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") + + if "mrope_section" in self.rope_parameters: + mrope_section = self.rope_parameters["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"gguf: mrope sections: {mrope_section[:4]}") + + if (rope_theta := rope_params.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) + logger.info(f"gguf: rope theta swa = {local_rope_theta}") + if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: + self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + if (n_expert_groups := self.hparams.get("n_group")) is not None: + self.gguf_writer.add_expert_group_count(n_expert_groups) + logger.info(f"gguf: expert groups count = {n_expert_groups}") + if (n_group_used := self.hparams.get("topk_group")) is not None: + self.gguf_writer.add_expert_group_used_count(n_group_used) + logger.info(f"gguf: expert groups used count = {n_group_used}") + + if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None: + if score_func == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif score_func == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported expert score gating function value: {score_func}") + logger.info(f"gguf: expert score gating function = {score_func}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + def does_token_look_special(self, token: str | bytes) -> bool: + if isinstance(token, (bytes, bytearray)): + token_text = token.decode(encoding="utf-8") + elif isinstance(token, memoryview): + token_text = token.tobytes().decode(encoding="utf-8") + else: + token_text = token + + # Some models mark some added tokens which ought to be control tokens as not special. + # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) + seems_special = token_text in ( + "<pad>", # deepseek-coder + "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2} + ) + + seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) + seems_special = seems_special or (token_text.startswith("<๏ฝœ") and token_text.endswith("๏ฝœ>")) # deepseek-coder + + # TODO: should these be marked as UNUSED instead? (maybe not) + seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2} + + return seems_special + + # used for GPT-2 BPE and WordPiece vocabs + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + return tokens, toktypes, tokpre + + # NOTE: this function is generated by convert_hf_to_gguf_update.py + # do not modify it manually! + # ref: https://github.com/ggml-org/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n๐Ÿš€ (normal) ๐Ÿ˜ถ\u200d๐ŸŒซ๏ธ (multiple emojis concatenated) โœ… ๐Ÿฆ™๐Ÿฆ™ 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 แž€แžถแž“แŸ‹แžแŸ‚แž–แžทแžŸแŸแžŸแžขแžถแž…๐Ÿ˜ ?ๆˆ‘ๆƒณๅœจappleๅทฅไฝœ1314151ๅคฉ๏ฝž ------======= ะฝะตั‰ะพ ะฝะฐ ะ‘ัŠะปะณะฐั€ัะบะธ \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") + + res = None + + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! + if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": + # ref: https://huggingface.co/THUDM/glm-4-9b-chat + res = "chatglm-bpe" + if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": + # ref: https://huggingface.co/THUDM/glm-4-9b-chat + res = "chatglm-bpe" + if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": + # ref: https://huggingface.co/THUDM/glm-4-9b-hf + res = "glm4" + if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902": + # ref: https://huggingface.co/zai-org/GLM-4.5-Air + res = "glm4" + if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267": + # ref: https://huggingface.co/zai-org/GLM-4.7-Flash + res = "glm4" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" + if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664": + # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct + res = "hunyuan" + if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6": + # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct + res = "hunyuan-dense" + if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6": + # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base + res = "falcon-h1" + if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": + # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base + res = "falcon-h1" + if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896": + # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base + res = "falcon-h1" + if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": + # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base + res = "falcon-h1" + if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890": + # ref: https://huggingface.co/moonshotai/Kimi-K2-Base + res = "kimi-k2" + if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": + # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B + res = "qwen2" + if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f": + # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6 + res = "qwen35" + if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273": + # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer + res = "grok-2" + if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df": + # ref: https://huggingface.co/aari1995/German_Semantic_V3 + res = "jina-v2-de" + if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4": + # ref: https://huggingface.co/evilfreelancer/ruGPT3XL + res = "gpt-2" + if chkhsh == "9e454714343b69b99b71795c1d27a68c2a1d15dab111f4d353109f966af29da7": + # ref: https://huggingface.co/LiquidAI/LFM2.5-8B-A1B + res = "lfm2" + if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": + # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 + res = "bert-bge" + if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" + if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": + # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 + res = "bert-bge-large" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/mosaicml/mpt-7b + res = "mpt" + if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": + # ref: https://huggingface.co/bigcode/starcoder2-3b + res = "starcoder" + if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": + # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" + if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b + res = "stablelm2" + if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" + if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1": + # ref: https://huggingface.co/CohereLabs/tiny-aya-base + res = "tiny_aya" + if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": + # ref: https://huggingface.co/Qwen/Qwen1.5-7B + res = "qwen2" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf + res = "olmo" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": + # ref: https://huggingface.co/databricks/dbrx-base + res = "dbrx" + if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": + # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + res = "jina-v1-en" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en + res = "jina-v2-en" + if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es + res = "jina-v2-es" + if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de + res = "jina-v2-de" + if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d": + # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano + res = "jina-v5-nano" + if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + res = "smaug-bpe" + if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": + # ref: https://huggingface.co/LumiOpen/Poro-34B-chat + res = "poro-chat" + if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code + res = "jina-v2-code" + if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": + # ref: https://huggingface.co/LumiOpen/Viking-7B + res = "viking" + if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": + # ref: https://huggingface.co/core42/jais-13b + res = "jais" + if chkhsh == "bc5108ee1eb6a3d600cadd065f63190fbd0554dbc9e4bbd6a0d977970afc8d2a": + # ref: https://huggingface.co/inceptionai/Jais-2-8B-Chat + res = "jais-2" + if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": + # ref: https://huggingface.co/WisdomShell/CodeShell-7B + res = "codeshell" + if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 + res = "tekken" + if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M + res = "smollm" + if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": + # ref: https://huggingface.co/bigscience/bloom + res = "bloom" + if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": + # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small + res = "gpt3-finnish" + if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct + res = "exaone" + if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": + # ref: https://huggingface.co/microsoft/phi-2 + res = "phi-2" + if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": + # ref: https://huggingface.co/facebook/chameleon-7b + res = "chameleon" + if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" + if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": + # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct + res = "gigachat" + if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": + # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct + res = "megrez" + if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 + res = "deepseek-v3" + if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + res = "deepseek-r1-qwen" + if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": + # ref: https://huggingface.co/Xenova/gpt-4o + res = "gpt-4o" + if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": + # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k + res = "superbpe" + if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15": + # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview + res = "trillion" + if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224": + # ref: https://huggingface.co/inclusionAI/Ling-lite + res = "bailingmoe" + if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": + # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + res = "llama4" + if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": + # ref: https://huggingface.co/mistral-community/pixtral-12b + res = "pixtral" + if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": + # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base + res = "seed-coder" + if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf": + # ref: https://huggingface.co/skt/A.X-4.0 + res = "a.x-4.0" + if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4": + # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct + res = "midm-2.0" + if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51": + # ref: https://huggingface.co/LiquidAI/LFM2.5-350M + res = "lfm2" + if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb": + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B + res = "exaone4" + if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": + # ref: https://huggingface.co/JetBrains/Mellum-4b-base + res = "mellum" + if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152": + # ref: https://huggingface.co/answerdotai/ModernBERT-base + res = "modern-bert" + if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df": + # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer + res = "afmoe" + if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": + # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0 + res = "bailingmoe2" + if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e": + # ref: https://huggingface.co/ibm-granite/granite-docling-258M + res = "granite-docling" + if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95": + # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2 + res = "minimax-m2" + if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665": + # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer + res = "kormo" + if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": + # ref: https://huggingface.co/tencent/Youtu-LLM-2B + res = "youtu" + if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91": + # ref: https://huggingface.co/upstage/Solar-Open-100B + res = "solar-open" + if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f": + # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B + res = "exaone-moe" + if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4": + # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct + res = "qwen35" + if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d": + # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash + res = "joyai-llm" + if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": + # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 + res = "kanana2" + if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015": + # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B + res = "f2llmv2" + if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57": + # ref: https://huggingface.co/sarvamai/sarvam-30b + res = "sarvam-moe" + if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af": + # ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf + res = "talkie" + if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4": + # ref: https://huggingface.co/openbmb/MiniCPM5-1B + res = "minicpm5" + + if res is None: + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") + logger.debug(f"chkhsh: {chkhsh}") + + return res + # Marker: End get_vocab_base_pre + + def _set_vocab_none(self) -> None: + self.gguf_writer.add_tokenizer_model("none") + + def _set_vocab_gpt2(self) -> None: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_whitespace(self) -> None: + tokens, toktypes, _ = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("whitespace") + self.gguf_writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_hybriddna(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + # k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get + # dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each + # k-mer's own id (llama.cpp strips it on detokenization) + for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute] + reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + tokpre = self.get_vocab_base_pre(tokenizer) + self.gguf_writer.add_tokenizer_model("hybriddna") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_qwen(self): + from .qwen import QwenModel + + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokpre = self.get_vocab_base_pre(tokenizer) + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self, add_to_gguf=True): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _create_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.find_hparam([ + "vocab_size_per_layer_input", # gemma3n + "vocab_size", + ], optional=True) or tokenizer.vocab_size() + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + if token_id >= vocab_size: + logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') + break + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, token_data in added_tokens_decoder.items(): + token_id = int(token_id) + token: str = token_data["content"] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') + if token_data.get("special") or self.does_token_look_special(token): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + else: + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + scores[token_id] = -1000.0 + tokens[token_id] = token.encode("utf-8") + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + return tokens, scores, toktypes + + def _set_vocab_llama_hf(self): + vocab = gguf.LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_rwkv_world(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = ['<s>'.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + parts = line.split(' ') + assert len(parts) >= 3 + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + if special_vocab.chat_template is None: + template_path = Path(__file__).parent.parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja" + if template_path.is_file(): + with open(template_path, "r", encoding="utf-8") as f: + template = f.read() + else: + template = "rwkv-world" + special_vocab.chat_template = template + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) + # hack: Override these as they have already been set (incorrectly) + special_vocab.special_token_ids["bos"] = 0 + special_vocab.special_token_ids["eos"] = 0 + + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): + tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + vocab_reader = gguf.GGUFReader(tokenizer_path, "r") + + default_pre = "mpt" if model_name == "gpt-neox" else "default" + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) + assert field # tokenizer model + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field # token list + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + + if model_name == "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) + assert field # token scores + self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field # token types + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + + if model_name != "llama-spm": + field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field # token merges + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None: + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None: + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None: + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None: + self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0]) + if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: + self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) + + def _try_set_pooling_type(self) -> None: + # get pooling path + pooling_path = None + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"].endswith("Pooling"): + pooling_path = mod["path"] + break + + mode_mapping = { + "mean": gguf.PoolingType.MEAN, + "cls": gguf.PoolingType.CLS, + "lasttoken": gguf.PoolingType.LAST, + } + + # get pooling type + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling.get("pooling_mode_mean_tokens"): + pooling_type = gguf.PoolingType.MEAN + elif pooling.get("pooling_mode_cls_token"): + pooling_type = gguf.PoolingType.CLS + elif pooling.get("pooling_mode_lasttoken"): + pooling_type = gguf.PoolingType.LAST + elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping: + pooling_type = mode_mapping[pooling_mode] + else: + raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported") + self.gguf_writer.add_pooling_type(pooling_type) + + def _set_vocab_glmedge(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_glm(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + # Special tokens + # Note: Using <|endoftext|> (151329) for eot causes endless generation + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331 + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336 + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329 + special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338 + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_interns1(self): + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute] + vocab_size = self.hparams.get("vocab_size", len(vocab)) + assert max(vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("bos", 151643) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_mistral(self): + from .mistral import MistralModel + + if not _mistral_common_installed: + raise ImportError(_mistral_import_error_msg) + + vocab = MistralVocab(self.dir_model) + logger.info( + f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." + ) + + self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) + + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size, ( + f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" + ) + + if vocab.tokenizer_type == MistralTokenizerType.tekken: + self.gguf_writer.add_tokenizer_pre("tekken") + self.gguf_writer.add_token_merges( + vocab.extract_vocab_merges_from_model() + ) + + logger.info( + f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." + ) + + self.gguf_writer.add_bos_token_id(vocab.bos_id) + self.gguf_writer.add_eos_token_id(vocab.eos_id) + self.gguf_writer.add_unk_token_id(vocab.unk_id) + self.gguf_writer.add_pad_token_id(vocab.pad_id) + + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_vocab_size(vocab.vocab_size) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(False) + + local_template_file_path = self.dir_model / "chat_template.jinja" + + if self.is_mistral_format and local_template_file_path.is_file(): + # Ministral-3 and other new Mistral models come with chat templates. + # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main + logger.info("Using an existing Mistral local chat template.") + + with open(local_template_file_path, "r", encoding="utf-8") as f: + template = f.read() + elif not self.is_mistral_format or not self.disable_mistral_community_chat_template: + template_dir = Path(__file__).parent.parent / "models/templates/" + + # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`. + if self.is_mistral_format: + logger.info( + "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. " + "Mistral recommends to use `mistral-common` to perform tokenization and detokenization." + ) + template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format) + else: + logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.") + template = None + + if template is not None: + self.gguf_writer.add_chat_template(template) + + def _set_vocab_plamo(self): + # PLaMo models use a custom tokenizer with a .jsonl file + tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl" + tokenizer_config_path = self.dir_model / "tokenizer_config.json" + + if not tokenizer_jsonl_path.is_file(): + raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}") + + # Load tokenizer config + with open(tokenizer_config_path, "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + + # Load tokens from JSONL file (actually a list format) + tokens = [] + scores = [] + toktypes = [] + + with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f: + for line_num, line in enumerate(f): + if line.strip(): + token_data = json.loads(line) + # Format: [token, score, type, ?, ?, ?, ?] + token = token_data[0].encode("utf-8") + score = float(token_data[1]) + token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL" + + tokens.append(token) + scores.append(score) + + if token_type_str == "UNKNOWN": + toktypes.append(gguf.TokenType.UNKNOWN) + elif token_type_str == "CONTROL": + toktypes.append(gguf.TokenType.CONTROL) + elif token_type_str == "BYTE": + toktypes.append(gguf.TokenType.BYTE) + else: + token_str = token_data[0] + if token_str.startswith("<|plamo:") and token_str.endswith("|>"): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + vocab_size = self.hparams["vocab_size"] + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("plamo2") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None: + token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8")) + self.gguf_writer.add_bos_token_id(token_id) + if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None: + token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8")) + self.gguf_writer.add_eos_token_id(token_id) + if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None: + token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8")) + self.gguf_writer.add_pad_token_id(token_id) + if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None: + token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8")) + self.gguf_writer.add_sep_token_id(token_id) + if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None: + token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8")) + self.gguf_writer.add_unk_token_id(token_id) + + # Add <|plamo:op|> as EOT to ensure appropriate end of generation + self.gguf_writer.add_eot_token_id(4) + + self.gguf_writer.add_add_space_prefix(False) + + +class MmprojModel(ModelBase): + model_type = ModelType.MMPROJ + model_arch = gguf.MODEL_ARCH.MMPROJ + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] + + n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"] + + has_vision_encoder: bool = True # by default + has_audio_encoder: bool = False + + # for models having multiple encoders, we need to separate their hparams + hparams_vision: dict[str, Any] | None = None + hparams_audio: dict[str, Any] | None = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.MMPROJ: + raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ") + + # get n_embd of the text model + if not self.is_mistral_format: + if "text_config" not in self.hparams: + self.hparams["text_config"] = {} + if "audio_config" not in self.hparams: + self.hparams["audio_config"] = {} + text_config = {**self.hparams, **self.hparams["text_config"]} + self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) + else: + text_config = { + k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"] + } + # mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size) + self.n_embd_text = text_config.get("dim", 0) + + assert self.n_embd_text > 0, "n_embd not found in hparams" + + # move vision config to the top level, while preserving the original hparams in global_config + import copy + self.global_config = copy.deepcopy(self.hparams) + self.hparams_vision = self.get_vision_config() + self.hparams_audio = self.get_audio_config() + + if self.hparams_vision is None and self.hparams_audio is None: + raise ValueError("vision_config / audio_config not found in hparams") + + # for compat with vision-only models + self.hparams = self.hparams_vision or self.hparams_audio or self.hparams + + # TODO @ngxson : this is a hack to support both vision and audio encoders + have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder + self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True) + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) + + # load preprocessor config + self.preprocessor_config = {} + + # prefer preprocessor_config.json if possible + preprocessor_config_path = self.dir_model / "preprocessor_config.json" + if preprocessor_config_path.is_file(): + with open(preprocessor_config_path, "r", encoding="utf-8") as f: + cfg = json.load(f) + # move media_proc_cfg to root level for compat + if "media_proc_cfg" in cfg: + cfg = { + **cfg, + **cfg["media_proc_cfg"], + } + # merge configs + self.preprocessor_config = {**self.preprocessor_config, **cfg} + + # prefer processor_config.json if possible + processor_config_path = self.dir_model / "processor_config.json" + if processor_config_path.is_file(): + with open(processor_config_path, "r", encoding="utf-8") as f: + cfg = json.load(f) + # move image_processor to root level for compat + if "image_processor" in cfg: + cfg = { + **cfg, + **cfg["image_processor"], + } + # merge configs + self.preprocessor_config = {**self.preprocessor_config, **cfg} + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip non-multimodal tensors + if "language_model." in name: + return None + + return super().filter_tensors(item) + + def get_vision_config(self) -> dict[str, Any] | None: + config_name = "vision_config" if not self.is_mistral_format else "vision_encoder" + return self.global_config.get(config_name) + + def get_audio_config(self) -> dict[str, Any] | None: + mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config" + return self.global_config.get(mm_config_key) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + output_type: str = self.ftype.name.partition("_")[2] + + if self.fname_out.is_dir(): + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None) + self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf" + else: + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + def set_gguf_parameters(self): + self.gguf_writer.add_file_type(self.ftype) + + if self.has_vision_encoder: + self.gguf_writer.add_clip_has_vision_encoder(True) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + + # vision config + self.image_size = self.find_vparam(["image_size"]) + self.gguf_writer.add_vision_image_size(self.image_size) + self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"])) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) + self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"])) + + # preprocessor config + image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] + image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] + + self.gguf_writer.add_vision_image_mean(image_mean) + self.gguf_writer.add_vision_image_std(image_std) + + if self.has_audio_encoder: + self.gguf_writer.add_clip_has_audio_encoder(True) + self.gguf_writer.add_audio_projection_dim(self.n_embd_text) + + # audio config + self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"])) + self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"])) + self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys)) + self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"])) + + if not self.has_vision_encoder and not self.has_audio_encoder: + raise ValueError("MmprojModel must have either vision or audio encoder") + + def write_vocab(self): + raise ValueError("MmprojModel does not support vocab writing") + + def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any: + assert self.hparams_vision is not None + return self._find_param(self.hparams_vision, keys, optional) + + def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any: + assert self.hparams_audio is not None + return self._find_param(self.hparams_audio, keys, optional) + + def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in obj), None) + if key is not None: + return obj[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name: + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + +class LazyTorchTensor(gguf.LazyBase): + _tensor_type = torch.Tensor + # to keep the type-checker happy + dtype: torch.dtype + shape: torch.Size + + # only used when converting a torch.Tensor to a np.ndarray + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + torch.uint8: np.uint8, + } + + # only used when byteswapping data. Only correct size is needed + # TODO: uncomment uint64, uint32, and uint16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_byteswap_map: dict[torch.dtype, type] = { + torch.float64: np.float64, + torch.float32: np.float32, + torch.bfloat16: np.float16, + torch.float16: np.float16, + torch.int64: np.int64, + # torch.uint64: np.uint64, + torch.int32: np.int32, + # torch.uint32: np.uint32, + torch.int16: np.int16, + # torch.uint16: np.uint16, + torch.int8: np.int8, + torch.uint8: np.uint8, + torch.bool: np.uint8, + torch.float8_e4m3fn: np.uint8, + torch.float8_e5m2: np.uint8, + } + + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_str_map: dict[str, torch.dtype] = { + "F64": torch.float64, + "F32": torch.float32, + "BF16": torch.bfloat16, + "F16": torch.float16, + # "U64": torch.uint64, + "I64": torch.int64, + # "U32": torch.uint32, + "I32": torch.int32, + # "U16": torch.uint16, + "I16": torch.int16, + "U8": torch.uint8, + "I8": torch.int8, + "BOOL": torch.bool, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, + } + + def numpy(self) -> gguf.LazyNumpyTensor: + dtype = self._dtype_map[self.dtype] + return gguf.LazyNumpyTensor( + meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), + args=(self,), + func=(lambda s: s.numpy()) + ) + + @classmethod + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: + return torch.empty(size=shape, dtype=dtype, device="meta") + + @classmethod + def from_safetensors_slice(cls, st_slice: Any) -> Tensor: + dtype = cls._dtype_str_map[st_slice.get_dtype()] + shape: tuple[int, ...] = tuple(st_slice.get_shape()) + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:]) + return cast(torch.Tensor, lazy) + + @classmethod + def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor: + def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor: + def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: + if sys.byteorder == 'big': + # switch data back to big endian + tensor = tensor.view(dtype).byteswap(inplace=False) + return tensor + dtype = cls._dtype_str_map[tensor.dtype] + numpy_dtype = cls._dtype_byteswap_map[dtype] + return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape) + dtype = cls._dtype_str_map[t.dtype] + shape = t.shape + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r)) + return cast(torch.Tensor, lazy) + + @classmethod + def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): + def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: + if sys.byteorder == 'big': + # switch data back to big endian + tensor = tensor.view(dtype).byteswap(inplace=False) + return tensor + dtype = cls._dtype_str_map[remote_tensor.dtype] + numpy_dtype = cls._dtype_byteswap_map[dtype] + shape = remote_tensor.shape + meta = cls.meta_with_dtype_and_shape(dtype, shape) + lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape)) + return cast(torch.Tensor, lazy) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + assert len(args) + return args[0].numpy() + + return cls._wrap_fn(func)(*args, **kwargs) + + +def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str: + # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders + # maybe we should fallback to text model's arch in that case, since not many models have both + text_config = hparams.get("text_config", {}) + vision_config = hparams.get("vision_config", {}) + arch = None + if (arches := hparams.get("architectures")) is not None and len(arches) > 0: + arch = arches[0] + elif "ssm_cfg" in hparams: + # For non-hf Mamba and Mamba2 models + arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM" + + # Step3-VL keeps text config under text_config but uses a custom top-level architecture. + # For text conversion we route to a dedicated text-only class. + # TODO: refactor this later to avoid adding exception here + if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM", "Exaone4_5_ForConditionalGeneration"): + return arch + + # if "architectures" is found in the sub-config, use that instead + if model_type == ModelType.TEXT and text_config.get("architectures") is not None: + arch = text_config["architectures"][0] + elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None: + arch = vision_config["architectures"][0] + if arch is None: + raise ValueError("Failed to detect model architecture") + return arch diff --git a/conversion/bert.py b/conversion/bert.py new file mode 100644 index 00000000000..9eb320e58aa --- /dev/null +++ b/conversion/bert.py @@ -0,0 +1,625 @@ +from __future__ import annotations + +import json +import os + +from pathlib import Path +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + + +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") +class BertModel(TextModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.vocab_size = None + + if cls_out_labels := self.hparams.get("id2label"): + if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0": + # Remove dummy labels added by AutoConfig + cls_out_labels = None + self.cls_out_labels = cls_out_labels + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_causal_attention(False) + self._try_set_pooling_type() + + if self.cls_out_labels: + self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())]) + + def set_vocab(self): + tokens, toktypes, tokpre = self.get_vocab_base() + self.vocab_size = len(tokens) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + # convert to phantom space vocab + def phantom(tok, toktype): + if toktype == gguf.TokenType.CONTROL: + return tok + if tok.startswith("##"): + return tok[2:] + return "\u2581" + tok + assert len(tokens) == len(toktypes) + tokens = list(map(phantom, tokens, toktypes)) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("bert."): + name = name[5:] + + if name.endswith(".gamma"): + name = name[:-6] + ".weight" + + if name.endswith(".beta"): + name = name[:-5] + ".bias" + + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + return None + + if name.startswith("cls.predictions"): + return None + + if name.startswith("cls.seq_relationship"): + return None + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.cls_out_labels: + # For BertForSequenceClassification (direct projection layer) + if name == "classifier.weight": + name = "classifier.out_proj.weight" + + if name == "classifier.bias": + name = "classifier.out_proj.bias" + + yield from super().modify_tensors(data_torch, name, bid) + + def _xlmroberta_tokenizer_init(self) -> None: + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def _xlmroberta_set_vocab(self) -> None: + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' + + tokenizer_json = {} + tokenizer_config_json = {} + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'tokenizer.json' + tokenizer_config_path = self.dir_model / 'tokenizer_config.json' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + from base64 import b64decode + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + + with open(tokenizer_path, "r", encoding="utf-8") as fp: + tokenizer_json = json.load(fp) + + if tokenizer_config_path.is_file(): + with open(tokenizer_config_path, "r", encoding="utf-8") as fp: + tokenizer_config_json = json.load(fp) + + add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute] + remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute] + precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) + + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute] + else: + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + if isinstance(tokenizer, SentencePieceProcessor): + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + else: + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + unk_token = tokenizer_config_json.get("unk_token") + unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload] + + for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute] + piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] + if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute] + text = piece.encode("utf-8") + score = tokenizer_json["model"]["vocab"][token_id][1] + + toktype = SentencePieceTokenTypes.NORMAL + if token_id == unk_token_id: + toktype = SentencePieceTokenTypes.UNKNOWN + elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.CONTROL + elif token_id in added_vocab.values(): + toktype = SentencePieceTokenTypes.USER_DEFINED + # No reliable way to detect this, but jina doesn't have any + # elif tokenizer.IsByte(token_id): + # toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + if isinstance(tokenizer, SentencePieceProcessor): + # realign tokens (see HF tokenizer code) + tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1] + scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] + toktypes = [ + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.UNKNOWN, + ] + toktypes[3:-1] + + if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE: + # Add mask token missing from sentencepiece.bpe.model + tokens[250001] = b'<mask>' + scores[250001] = 0.0 + toktypes[250001] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification") +class DistilBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def set_gguf_parameters(self): + self.gguf_writer.add_layer_norm_eps(1e-12) + logger.info("gguf: layer norm epsilon = 1e-12") + super().set_gguf_parameters() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("distilbert."): + name = name[11:] + + # These layers act as MLM head, so we don't need them + if name.startswith("vocab_"): + return None + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("RobertaModel", "RobertaForSequenceClassification") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NomicBertModel") +class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model, False) + + self.is_moe = bool(hparams.get("moe_every_n_layers")) + self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() + if self._tokenizer_is_xlmroberta: + self._xlmroberta_tokenizer_init() + + npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048) + if npos == 8192 and mtp == 2048: + self.hparams["n_positions"] = 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens. + elif npos == 2048 and mtp == 2048: + self.hparams["n_positions"] = 512 # nomic-embed-text-v2-moe is trained for 512 tokens. + else: + raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}") + + assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" + + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors unless MoE + assert self.hparams["qkv_proj_bias"] == self.is_moe + assert self.hparams["mlp_fc1_bias"] == self.is_moe + assert self.hparams["mlp_fc2_bias"] == self.is_moe + + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_vocab(self) -> None: + if self._tokenizer_is_xlmroberta: + return self._xlmroberta_set_vocab() + return super().set_vocab() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # If the tensor is an experts bias tensor, skip it. + if "mlp.experts.bias" in name: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + if "mlp.experts.mlp.w1" in name: + data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) + name += ".weight" + + if "mlp.experts.mlp.w2" in name: + data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) + data_torch = data_torch.transpose(1, 2) + name += ".weight" + + yield from super().modify_tensors(data_torch, name, bid) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.is_moe: + self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + + def _is_tokenizer_xlmroberta(self) -> bool: + with open(self.dir_model / "tokenizer.json") as f: + tokenizer_json = json.load(f) + toktyp = tokenizer_json["model"]["type"] + if toktyp == "Unigram": + return True + if toktyp == "WordPiece": + return False + raise ValueError(f"unknown tokenizer: {toktyp}") + + +@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification") +class NeoBert(BertModel): + model_arch = gguf.MODEL_ARCH.NEO_BERT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # NeoBERT uses 2/3 of the intermediate size as feed forward length + self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3)) + self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + + self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("decoder."): + return None + + if name.startswith("model."): + name = name[6:] + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model") +class EuroBertModel(TextModel): + model_arch = gguf.MODEL_ARCH.EUROBERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(False) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # EuroBert is bidirectional (encoder) + self.gguf_writer.add_causal_attention(False) + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + self._try_set_pooling_type() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("model."): + name = name[6:] + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + _lora_files = {} + _lora_names = [] + + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model, False) + + if lora_names := hparams.get("lora_adaptations"): + self._lora_names = lora_names + self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + self._xlmroberta_tokenizer_init() + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if self._lora_names: + for name in self._lora_names: + fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-") + self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run) + + return super().generate_extra_tensors() + + def set_type(self): + for lora_writer in self._lora_files.values(): + lora_writer.add_type(gguf.GGUFType.ADAPTER) + lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + super().set_type() + + def set_vocab(self): + self._xlmroberta_set_vocab() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # jina-embeddings-v3 + if ".parametrizations." in name: + name = name.replace(".parametrizations.", ".") + if name.endswith(".original"): + name = name[:-9] + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): + if name.startswith("pooler.dense"): + return + + num_loras = data_torch.size(0) + assert num_loras == len(self._lora_names) + + # Split out each LoRA in their own GGUF + for i, lora_writer in enumerate(self._lora_files.values()): + new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower() + data = data_torch[i, :, :] + # Transpose/flip token_embd/types into correct shape + if new_name == "token_embd.weight.lora_b": + data = data.T + elif new_name.startswith("token_types.weight."): + new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") + lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32) + + return + + yield from super().modify_tensors(data_torch, name, bid) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # jina-embeddings-v3 + lora_alpha = self.hparams.get("lora_alpha") + if lora_prompt_prefixes := self.hparams.get("task_instructions"): + assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) + for lora_name, lora_writer in self._lora_files.items(): + lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0) + lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name) + if lora_prompt_prefixes: + lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name]) + + def write(self): + super().write() + for lora_writer in self._lora_files.values(): + lora_writer.write_header_to_file() + lora_writer.write_kv_data_to_file() + lora_writer.write_tensors_to_file(progress=True) + lora_writer.close() + + +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") +class JinaBertV2Model(BertModel): + model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 + + def set_vocab(self): + tokenizer_class = 'BertTokenizer' + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_class = json.load(f)['tokenizer_class'] + + if tokenizer_class == 'BertTokenizer': + super().set_vocab() + elif tokenizer_class == 'RobertaTokenizer': + pre_tokenizer_type = None + tokenizer_json_path = self.dir_model / "tokenizer.json" + if tokenizer_json_path.is_file(): + with open(tokenizer_json_path, "r", encoding="utf-8") as f: + pre_tokenizer_type = json.load(f).get("pre_tokenizer", {}).get("type") + + if pre_tokenizer_type == "Whitespace": + self._set_vocab_whitespace() + else: + self._set_vocab_gpt2() + self.gguf_writer.add_token_type_count(2) + else: + raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') + + +@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification") +class ModernBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.MODERN_BERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + self.gguf_writer.add_add_sep_token(True) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["local_attention"]) + if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None: + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("model."): + name = name[6:] + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.cls_out_labels: + # For BertForSequenceClassification (direct projection layer) + if name == "classifier.weight": + name = "classifier.out_proj.weight" + + if name == "classifier.bias": + name = "classifier.out_proj.bias" + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/bitnet.py b/conversion/bitnet.py new file mode 100644 index 00000000000..a66446abee2 --- /dev/null +++ b/conversion/bitnet.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("BitnetForCausalLM") +class BitnetModel(TextModel): + model_arch = gguf.MODEL_ARCH.BITNET + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def weight_quant(self, weight: Tensor) -> Tensor: + dtype = weight.dtype + weight = weight.float() + scale = weight.abs().mean().clamp(min=1e-5) + iscale = 1 / scale + # TODO: multiply by the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 + result = (weight * iscale).round().clamp(-1, 1) / iscale + return result.type(dtype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if any(self.match_model_tensor_name(new_name, key, bid) for key in [ + gguf.MODEL_TENSOR.ATTN_Q, + gguf.MODEL_TENSOR.ATTN_K, + gguf.MODEL_TENSOR.ATTN_V, + gguf.MODEL_TENSOR.ATTN_OUT, + gguf.MODEL_TENSOR.FFN_UP, + gguf.MODEL_TENSOR.FFN_DOWN, + gguf.MODEL_TENSOR.FFN_GATE, + ]): + # transform weight into 1/0/-1 (in fp32) + data_torch = self.weight_quant(data_torch) + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/bloom.py b/conversion/bloom.py new file mode 100644 index 00000000000..d98edf6d500 --- /dev/null +++ b/conversion/bloom.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import re + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("BloomForCausalLM", "BloomModel") +class BloomModel(TextModel): + model_arch = gguf.MODEL_ARCH.BLOOM + + def set_gguf_parameters(self): + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + assert n_head is not None + assert n_embed is not None + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + assert n_head is not None + assert n_embed is not None + + name = re.sub(r'transformer\.', '', name) + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/chameleon.py b/conversion/chameleon.py new file mode 100644 index 00000000000..a996bfa53cf --- /dev/null +++ b/conversion/chameleon.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + +from .llama import LlamaModel + + +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(TextModel): + model_arch = gguf.MODEL_ARCH.CHAMELEON + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) + + def set_vocab(self): + self._set_vocab_gpt2() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # ignore image tokenizer for now + # TODO: image support for Chameleon + if name.startswith("model.vqmodel"): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + hidden_dim = self.hparams.get("hidden_size") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if name.endswith(("q_norm.weight", "q_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) + if name.endswith(("k_norm.weight", "k_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) + + yield from super().modify_tensors(data_torch, name, bid) + + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 + @staticmethod + def _reverse_hf_permute(data_torch, n_heads, hidden_dim): + head_dim = hidden_dim // n_heads + data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) + data_torch = data_torch.repeat_interleave(n_heads, 0) + return data_torch diff --git a/conversion/chatglm.py b/conversion/chatglm.py new file mode 100644 index 00000000000..7e323b89004 --- /dev/null +++ b/conversion/chatglm.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf + + +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.CHATGLM + + def set_vocab_chatglm3(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytes] = [] + toktypes: list[int] = [] + scores: list[float] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute] + assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] + role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens + for token_id in range(vocab_size): + piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] + if token_id == 0: + piece = "<unk>" + elif token_id == 1: + piece = "<bos>" + elif token_id == 2: + piece = "<eos>" + + text = piece.encode("utf-8") # ty: ignore[unresolved-attribute] + score = 0.0 + # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), + # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() + if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type] + score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute] + + if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute] + if piece in special_tokens: + toktype = SentencePieceTokenTypes.CONTROL + elif len(piece) == 0: # ty: ignore[invalid-argument-type] + text = f"[PAD{token_id}]".encode("utf-8") + toktype = SentencePieceTokenTypes.UNUSED + else: + toktype = SentencePieceTokenTypes.USER_DEFINED + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + continue + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute] + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + # glm3 needs prefix and suffix formatted as: + # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" + self.gguf_writer.add_tokenizer_pre("chatglm-spm") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""): + self.set_vocab_chatglm3() + return + + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) + assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + # only add special tokens when they were not already loaded from config.json + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + assert n_embed is not None + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + assert n_head is not None + n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) + self.gguf_writer.add_file_type(self.ftype) + if "attention_dim" in self.hparams: + rope_dim = self.hparams["attention_dim"] + else: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_add_bos_token(False) + rope_freq = 10000 + if "rope_ratio" in self.hparams: + rope_freq = rope_freq * self.hparams["rope_ratio"] + self.gguf_writer.add_rope_freq_base(rope_freq) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".rotary_pos_emb.inv_freq"): + return None + + name = name.removeprefix("transformer.") + + return super().filter_tensors((name, gen)) diff --git a/conversion/codeshell.py b/conversion/codeshell.py new file mode 100644 index 00000000000..8bfc3178d46 --- /dev/null +++ b/conversion/codeshell.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("CodeShellForCausalLM") +class CodeShellModel(TextModel): + model_arch = gguf.MODEL_ARCH.CODESHELL + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) diff --git a/conversion/cogvlm.py b/conversion/cogvlm.py new file mode 100644 index 00000000000..d92df55d46b --- /dev/null +++ b/conversion/cogvlm.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + +from .llama import LlamaModel + + +@ModelBase.register("CogVLMForCausalLM") +class CogVLMVisionModel(MmprojModel): + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("model.vision."): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("CogVLMForCausalLM") +class CogVLMModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.COGVLM diff --git a/conversion/command_r.py b/conversion/command_r.py new file mode 100644 index 00000000000..603288d165c --- /dev/null +++ b/conversion/command_r.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("CohereForCausalLM") +class CommandR2Model(TextModel): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified + self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + +@ModelBase.register("Cohere2ForCausalLM") +class Cohere2Model(TextModel): + model_arch = gguf.MODEL_ARCH.COHERE2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + rotary_pct = self.hparams["rotary_pct"] + hidden_size = self.hparams["hidden_size"] + num_attention_heads = self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Cohere2 runtime in llama.cpp expects no bias tensors; + # the actual weight only contains 0-value tensors as bias, we can skip them + if name.endswith(".bias"): + if torch.any(data_torch != 0): + raise ValueError(f"Bias tensor {name!r} is not zero.") + logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.") + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/dbrx.py b/conversion/dbrx.py new file mode 100644 index 00000000000..207ebcb8931 --- /dev/null +++ b/conversion/dbrx.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("DbrxForCausalLM") +class DbrxModel(TextModel): + model_arch = gguf.MODEL_ARCH.DBRX + + def set_gguf_parameters(self): + ffn_config = self.hparams["ffn_config"] + attn_config = self.hparams["attn_config"] + self.gguf_writer.add_block_count(self.block_count) + + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) + + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) + + self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + + self.gguf_writer.add_layer_norm_eps(1e-5) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_expert = self.hparams["ffn_config"]["moe_num_experts"] + n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] + n_embd = self.hparams["d_model"] + + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + experts = False + + for exp_tensor_name in exp_tensor_names.keys(): + if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: + experts = True + data_torch = data_torch.view(n_expert, n_ff, n_embd) + if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: + data_torch = data_torch.permute(*permute_tensor) + break + + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) + + yield from super().modify_tensors(data_torch, new_name, bid) + + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + del name, new_name, bid # unused + + return n_dims > 1 diff --git a/conversion/deci.py b/conversion/deci.py new file mode 100644 index 00000000000..46d8568c5a4 --- /dev/null +++ b/conversion/deci.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +import math + +from typing import Any, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("DeciLMForCausalLM") +class DeciModel(TextModel): + model_arch = gguf.MODEL_ARCH.DECI + + @staticmethod + def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: + # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) + return DeciModel._find_multiple(intermediate_size, 256) + + @staticmethod + def _find_multiple(n: int, k: int) -> int: + # DeciLM-specific code + if n % k == 0: + return n + return n + k - (n % k) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] + assert self.block_count == len(_block_configs) + self._num_kv_heads = list() + self._num_heads = list() + _ffn_multipliers = list() + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + # ***dummy layer*** for nemotron 253B + # if n_heads_in_group is None and ffn_mult is None + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0 + for il in range(len(_block_configs)): + if _block_configs[il]["attention"]["n_heads_in_group"] is None: + if _block_configs[il]["attention"]["replace_with_linear"] is True: + self._num_kv_heads.append(0) + self._num_heads.append(self.hparams["num_attention_heads"]) + else: + self._num_kv_heads.append(0) + self._num_heads.append(0) + else: + self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) + self._num_heads.append(self.hparams["num_attention_heads"]) + if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer + _ffn_multipliers.append(0.0) + else: + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(_ffn_multipliers) + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) + assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) + self._ffn_dims: list[int] = [ + DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) + for multiplier in _ffn_multipliers + ] + + def set_vocab(self): + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + else: + # DeciLM-7B + self._set_vocab_llama_hf() + + def set_gguf_parameters(self): + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(self._ffn_dims) + if (rope_theta := self.rope_parameters.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_head_count(self._num_heads) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_file_type(self.ftype) + else: # DeciLM-7B + super().set_gguf_parameters() + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] + assert self.block_count == len(self._num_kv_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + if bid is not None: + if "num_key_value_heads_per_layer" in self.hparams: + n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] + elif "block_configs" in self.hparams: + n_kv_head = self._num_kv_heads[bid] + n_head = self._num_heads[bid] + else: + n_kv_head = self.hparams.get("num_key_value_heads") + else: + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_tensors(self): + super().prepare_tensors() diff --git a/conversion/deepseek.py b/conversion/deepseek.py new file mode 100644 index 00000000000..72520cc9f6a --- /dev/null +++ b/conversion/deepseek.py @@ -0,0 +1,461 @@ +from __future__ import annotations + +import re + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .qwen import QwenModel + + +@ModelBase.register("DeepseekOCRForCausalLM") +class DeepseekOCRVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(self.clip_projector_type) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + # calculate proj_scale_factor (used by tinygemma3 test model) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + n_per_side = int(image_seq_length ** 0.5) + image_size = self.hparams["image_size"] + patch_size = self.hparams["patch_size"] + proj_scale_factor = (image_size // patch_size) // n_per_side + if proj_scale_factor > 0 and proj_scale_factor != 4: + # we only need to write this if it's not the default value + # in this case, we are converting a test model + self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) + # @bluebread: there's no window_size in config but just add it here anyway + self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14)) + + # SAM configuration + sam_hparams = hparams['sam'] + self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers']) + self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width']) + self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads']) + + def get_vision_config(self) -> dict[str, Any]: + vision_config: dict[str, Any] | None = self.global_config.get("vision_config") + + if not vision_config: + raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") + + vision_config['sam'] = vision_config['width']['sam_vit_b'] + if vision_config['width'].get('clip-l-14-224') is not None: + vision_config.update(vision_config['width']['clip-l-14-224']) + if isinstance(vision_config['width'], int): + vision_config['hidden_size'] = vision_config['width'] + if vision_config.get('heads') is not None: + vision_config['num_heads'] = vision_config['heads'] + vision_config['intermediate_size'] = vision_config['heads'] * 4 + + return vision_config + + def tensor_force_quant(self, name, new_name, bid, n_dims): + for nq_name in ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_'): + if nq_name in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("view_seperator"): + data_torch = data_torch.unsqueeze(0) + yield from super().modify_tensors(data_torch, name, bid) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Only process vision-related tensors, skip language model tensors + # Vision components: sam_model, vision_model, projector, image_newline, view_seperator + # Language model components to skip: lm_head, embed_tokens, layers, norm + if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")): + return None + + if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("DeepseekOCR2ForCausalLM") +class DeepseekOCR2VisionModel(DeepseekOCRVisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2 + + def set_gguf_parameters(self): + # the vision tower's qwen2 encoder is built from fixed defaults, + # see build_qwen2_decoder_as_encoder() in deepencoderv2.py + if self.hparams.get("patch_size") is None: + self.hparams["patch_size"] = 16 + if self.hparams.get("intermediate_size") is None: + self.hparams["intermediate_size"] = 4864 + if self.hparams.get("num_attention_heads") is None: + self.hparams["num_attention_heads"] = 14 + super().set_gguf_parameters() + # qwen2 encoder is GQA: 14 Q heads, 2 KV heads + self.gguf_writer.add_vision_head_count_kv(2) + + def get_vision_config(self) -> dict[str, Any]: + vision_config = super().get_vision_config() + vision_config['hidden_size'] = vision_config['width']['qwen2-0-5b']['dim'] + if vision_config.get('layers') is None: + vision_config['layers'] = 24 + return vision_config + + +@ModelBase.register("DeepseekForCausalLM") +class DeepseekModel(TextModel): + model_arch = gguf.MODEL_ARCH.DEEPSEEK + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register( + "DeepseekV2ForCausalLM", + "DeepseekV3ForCausalLM", + "KimiVLForConditionalGeneration", + "KimiK25ForConditionalGeneration", + "YoutuForCausalLM", + "YoutuVLForConditionalGeneration", +) +class DeepseekV2Model(TextModel): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + # TODO @ngxson : remove this when we support MTP for deepseek models + skip_mtp = True + + merge_expert = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + # special handling for Deepseek OCR + if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"): + self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + # default jinja template + self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}") + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, _ = item + # DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower) + if "sam_model" in name or "qwen2_model" in name: + return None + return super().filter_tensors(item) + + def set_vocab(self): + try: + self._set_vocab_gpt2() + return + except Exception: + pass + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokpre = self.get_vocab_base_pre(tokenizer) + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + else: + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + + def set_gguf_parameters(self): + is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR) + + if is_ocr: + self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0) + else: + # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + + self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6) + + super().set_gguf_parameters() + hparams = self.hparams + + # first_k_dense_replace: number of leading layers using dense FFN instead of MoE + # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers + # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers + has_moe = hparams.get("n_routed_experts") is not None + first_k_dense_replace = hparams.get("first_k_dense_replace") + if first_k_dense_replace is None: + # Default: if no MoE, all layers are dense; if MoE, none are dense + first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0 + self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + kv_lora_rank = hparams.get("kv_lora_rank", 512) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + + # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA + if not is_ocr: + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(kv_lora_rank) + self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) + + # MoE parameters (required by C++ code for DEEPSEEK2 arch) + # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length + moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False) + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + if (n_routed_experts := hparams.get("n_routed_experts")) is not None: + self.gguf_writer.add_expert_count(n_routed_experts) + + # expert_shared_count is required by C++ code, default to 0 for non-MoE models + n_shared_experts = hparams.get("n_shared_experts", 0) + self.gguf_writer.add_expert_shared_count(n_shared_experts) + + # When not set, C++ code will use scale_w = false to skip the no-op scaling + if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) + + if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob: + self.gguf_writer.add_expert_weights_norm(norm_topk_prob) + + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # skip lm_head.weight if tie_word_embeddings is True + if self.hparams.get("tie_word_embeddings", False): + if name == "lm_head.weight" or name == "model.lm_head.weight": + logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)") + return + + # skip Multi-Token Prediction (MTP) layers + if self.skip_mtp: + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return + + # process the experts separately + if self.merge_expert and name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("DeepseekV32ForCausalLM") +class DeepseekV32Model(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK32 + skip_mtp = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + assert getattr(tokenizer, "add_bos_token", False), "Change value of add_bos_token to true in tokenizer_config.json file." + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + # DSA indexer parameters + self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"]) + self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"]) + self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"]) diff --git a/conversion/dots1.py b/conversion/dots1.py new file mode 100644 index 00000000000..7ac299a6e65 --- /dev/null +++ b/conversion/dots1.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .qwen import Qwen2MoeModel + + +@ModelBase.register("Dots1ForCausalLM") +class Dots1Model(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.DOTS1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams["num_experts"] = self.hparams["n_routed_experts"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + if "shared_experts" in name: + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/dotsocr.py b/conversion/dotsocr.py new file mode 100644 index 00000000000..f87f62abde9 --- /dev/null +++ b/conversion/dotsocr.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("DotsOCRForCausalLM") +class DotsOCRVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 0 # dynamic resolution + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR) + self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) + self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"])) + self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"])) + self.gguf_writer.add_vision_use_silu(True) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("vision_tower."): + return None + + if "vision_tower.blocks." in name and ".mlp." in name: + # note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here + # x = F.silu(self.fc1(x)) * self.fc3(x) + # x = self.fc2(x) + # fc1 -> gate, fc2 -> down, fc3 -> up + # mapping original names to Qwen2.5 naming scheme + name = name.replace("vision_tower.blocks.", "visual.blocks.") + name = name.replace(".fc1", ".gate_proj") + name = name.replace(".fc2", ".down_proj") + name = name.replace(".fc3", ".up_proj") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/dream.py b/conversion/dream.py new file mode 100644 index 00000000000..459e8d46afb --- /dev/null +++ b/conversion/dream.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("DreamModel") +class DreamModel(TextModel): + model_arch = gguf.MODEL_ARCH.DREAM + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] + vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) + assert max(vocab_dict.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + # Check if it's a special token - treat special tokens as CONTROL tokens + if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + # Dream models use non-causal attention for diffusion + self.gguf_writer.add_causal_attention(False) + + # Add Dream-specific parameters + mask_token_id = self.hparams.get("mask_token_id") + if mask_token_id is not None: + self.gguf_writer.add_mask_token_id(mask_token_id) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Dream model tensors should be mapped directly since it's the base model + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/ernie.py b/conversion/ernie.py new file mode 100644 index 00000000000..aa8a3bc8ee5 --- /dev/null +++ b/conversion/ernie.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +import json +import math +import re + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM") +class Ernie4_5Model(TextModel): + model_arch = gguf.MODEL_ARCH.ERNIE4_5 + + def set_vocab(self): + self._set_vocab_sentencepiece() + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "ernie." in name: + name = name.replace("ernie.", "model.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_heads = self.hparams["num_attention_heads"] + num_kv_heads = self.hparams["num_key_value_heads"] + if (head_dim := self.hparams.get("head_dim")) is None: + head_dim = self.hparams["hidden_size"] // num_heads + + # split the qkv weights + # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size] + if "qkv_proj" in name: + name_q = name.replace("qkv_proj.weight", "q_proj.weight") + name_k = name.replace("qkv_proj.weight", "k_proj.weight") + name_v = name.replace("qkv_proj.weight", "v_proj.weight") + total_q_dim = num_heads * head_dim + total_k_dim = num_kv_heads * head_dim + total_v_dim = num_kv_heads * head_dim + q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0) + yield from super().modify_tensors(q_proj_weight, name_q, bid) + yield from super().modify_tensors(k_proj_weight, name_k, bid) + yield from super().modify_tensors(v_proj_weight, name_v, bid) + # split the up_gate_proj into gate and up + # up_gate_proj shape: [2 * intermediate_size, hidden_size] + elif "up_gate_proj" in name: + name_up = name.replace("up_gate_proj.weight", "up_proj.weight") + name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight") + dim_half = data_torch.shape[0] // 2 + gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0) + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Ernie4_5_MoeForCausalLM") +class Ernie4_5MoeModel(Ernie4_5Model): + model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE + _experts: list[dict[str, Tensor]] | None = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._experts = [{} for _ in range(self.block_count)] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_k"]) + self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"]) + self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"]) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None: + self.gguf_writer.add_expert_shared_count(shared_expert_count) + if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None: + self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2) + match = re.match(r"model.mtp_block.(\d+)", name) + if match: + return None + + # skip all other MTP tensors for now + match = re.match(r"model.mtp_emb_norm.(\d+)", name) + if match: + return None + + match = re.match(r"model.mtp_hidden_norm.(\d+)", name) + if match: + return None + + match = re.match(r"model.mtp_linear_proj.(\d+)", name) + if match: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["moe_num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["gate_proj", "up_proj", "down_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename_to_retrieve]) + del self._experts[bid][ename_to_retrieve] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + yield from super().modify_tensors(data_torch, merged_name, bid) + else: + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("PaddleOCRVLForConditionalGeneration") +class PaddleOCRModel(Ernie4_5Model): + model_arch = gguf.MODEL_ARCH.PADDLEOCR + + +@ModelBase.register("PaddleOCRVisionModel") +class PaddleOCRVisionModel(MmprojModel): + # PaddleOCR-VL uses a modified version of Siglip + min_pixels: int = 0 + max_pixels: int = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.min_pixels = self.preprocessor_config["min_pixels"] + self.max_pixels = self.preprocessor_config["max_pixels"] + self.hparams_vision["image_size"] = int(math.sqrt(self.max_pixels)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + hparams = self.hparams_vision + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR) + self.gguf_writer.add_vision_max_pixels(self.max_pixels) + self.gguf_writer.add_vision_min_pixels(self.min_pixels) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-6)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "vision_model" not in name and "mlp_AR" not in name: + return None + name = name.replace("visual.", "model.") + if "packing_position_embedding" in name: + # unused + return None + if "vision_model.head" in name: + # we don't yet support image embeddings for this model + return None + + return super().filter_tensors((name, gen)) diff --git a/conversion/exaone.py b/conversion/exaone.py new file mode 100644 index 00000000000..b21f0278429 --- /dev/null +++ b/conversion/exaone.py @@ -0,0 +1,305 @@ +from __future__ import annotations + +import math + +from pathlib import Path +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf +from .qwenvl import Qwen2VLVisionModel + + +@ModelBase.register("ExaoneForCausalLM") +class ExaoneModel(TextModel): + model_arch = gguf.MODEL_ARCH.EXAONE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + assert (hparams["activation_function"] == "silu") + + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) + rotary_factor = rotary_factor if rotary_factor is not None else 1.0 + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = self.rope_parameters.get("rope_theta", 10000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + +@ModelBase.register("Exaone4ForCausalLM") +class Exaone4Model(TextModel): + model_arch = gguf.MODEL_ARCH.EXAONE4 + + def set_vocab(self): + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if hparams.get("sliding_window") is not None: + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + if "layer_types" in hparams: + self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]]) + elif "sliding_window_pattern" in hparams: + sliding_window_pattern = [] + if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG + for i in range(hparams["num_hidden_layers"]): + sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L") + if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4 + for i in range(hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0) + if len(sliding_window_pattern) == hparams["num_hidden_layers"]: + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10_000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 16.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + +@ModelBase.register("ExaoneMoEForCausalLM") +class ExaoneMoEModel(Exaone4Model): + model_arch = gguf.MODEL_ARCH.EXAONE_MOE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + moe_intermediate_size = self.hparams["moe_intermediate_size"] + num_shared_experts = self.hparams["num_shared_experts"] + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + self.gguf_writer.add_expert_shared_count(num_shared_experts) + self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0)) + self.gguf_writer.add_leading_dense_block_count(n_dense_layer) + self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0)) + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("mtp."): + if name.find("layers.") != -1: + # `mtp.layers.0.[module_name]` format + name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}") + else: + # mtp fc/norm weights + remapper = { + "mtp.fc": "model.layers.{bid}.eh_proj", + "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm", + "mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm", + "mtp.norm": "model.layers.{bid}.shared_head.norm", + } + _n = Path(name) + new_name = remapper[_n.stem] + _n.suffix + + # set shared weights for all NextN/MTP layers + for bid in range(self.hparams['num_hidden_layers'], self.block_count): + yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid) + return + + if name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield from super().modify_tensors(data_torch, new_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("Exaone4_5_ForConditionalGeneration") +class Exaone4_5_TextModel(Exaone4Model): + """Text tower of EXAONE 4.5; Tensors match EXAONE4""" + + model_arch = gguf.MODEL_ARCH.EXAONE4 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0) + if n_nextn > 0: + self.block_count = self.hparams["num_hidden_layers"] + n_nextn + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0) + if n_nextn > 0: + self.gguf_writer.add_nextn_predict_layers(n_nextn) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("mtp."): + n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0) + if n_nextn <= 0: + return + nh = self.hparams["num_hidden_layers"] + if ".layers." in name: + share = self.hparams.get("mtp_share_layers", False) + mtp_bid = bid if bid is not None else 0 + if share: + for k in range(n_nextn): + nn = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{nh + k}") + yield from super().modify_tensors(data_torch, nn, nh + k) + return + name = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{mtp_bid + nh}") + else: + remapper = { + "mtp.fc": gguf.MODEL_TENSOR.NEXTN_EH_PROJ, + "mtp.pre_fc_norm_embedding": gguf.MODEL_TENSOR.NEXTN_ENORM, + "mtp.pre_fc_norm_hidden": gguf.MODEL_TENSOR.NEXTN_HNORM, + "mtp.norm": gguf.MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, + } + _n = Path(name) + key = _n.stem + if key not in remapper: + return + for bid_mtp in range(nh, self.block_count): + mapped_name = self.format_tensor_name(remapper[key], bid_mtp, suffix=_n.suffix) + yield from ModelBase.modify_tensors(self, data_torch, mapped_name, bid_mtp) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Exaone4_5_ForConditionalGeneration") +class Exaone4_5VisionModel(Qwen2VLVisionModel): + """Vision tower for EXAONE 4.5; Qwen2-VL-style ViT (GQA) + patch merger""" + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + name = name.replace("model.visual.", "visual.", 1) + return super().filter_tensors((name, gen)) + + def set_gguf_parameters(self): + MmprojModel.set_gguf_parameters(self) + assert self.hparams_vision is not None + hparams = self.hparams_vision + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.EXAONE4_5) + self.gguf_writer.add_vision_use_silu(True) + self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) + self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + num_kv_head = self.find_vparam(["num_key_value_heads"], optional=True) + if num_kv_head is not None: + self.gguf_writer.add_vision_head_count_kv(num_kv_head) + eps = hparams.get("rms_norm_eps", self.global_config.get("rms_norm_eps", 1e-6)) + self.gguf_writer.add_vision_attention_layernorm_eps(eps) + if (window_size := hparams.get("window_size")) is not None: + self.gguf_writer.add_vision_window_size(window_size) + fullatt_block_indexes = hparams.get("fullatt_block_indexes") + if fullatt_block_indexes: + n_wa_pattern = fullatt_block_indexes[0] + 1 + for i in range(1, len(fullatt_block_indexes)): + if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: + raise ValueError(f"Invalid EXAONE4.5 fullatt_block_indexes: {fullatt_block_indexes}") + self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if ".qkv." in name: + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + + yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid) diff --git a/conversion/falcon.py b/conversion/falcon.py new file mode 100644 index 00000000000..085fd4cd33f --- /dev/null +++ b/conversion/falcon.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(TextModel): + model_arch = gguf.MODEL_ARCH.FALCON + + def set_gguf_parameters(self): + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 + head_dim = self.hparams["hidden_size"] // n_head + + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/falcon_h1.py b/conversion/falcon_h1.py new file mode 100644 index 00000000000..a8bc880b2c4 --- /dev/null +++ b/conversion/falcon_h1.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from typing import Any, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llama import LlamaModel +from .mamba import Mamba2Model + + +@ModelBase.register("FalconH1ForCausalLM") +class FalconH1Model(Mamba2Model): + model_arch = gguf.MODEL_ARCH.FALCON_H1 + + def __init__(self, *args, **kwargs): + # Set the hparam prefixes for Falcon Mamba2 + self.hparam_prefixes = ["mamba"] + + # Initialize the base Mamba2Model + super().__init__(*args, **kwargs) + + # Use Llama conversion for attention + self._transformer_model_class = LlamaModel + + # n_group and d_inner are used during reshape_tensors for mamba2 + self.n_group = self.find_hparam(["n_groups"]) + self.d_inner = self.find_hparam(["mamba_d_ssm"]) + self.d_head = self.find_hparam(["d_head"]) + + # Initialize any Falcon Mamba2 specific attributes + self.has_attention = True # Falcon Mamba2 has attention components + + # Load Falcon-H1 multipliers from hyperparameters + self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True) + self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True) + self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True) + self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True) + self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True) + self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True) + self.intermediate_size = self.find_hparam(["intermediate_size"]) + self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True) + + def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: + prefixed = [] + for pfx in self.hparam_prefixes: + prefixed.extend( + "_".join([pfx, k]) + for k in keys + ) + keys = list(keys) + prefixed + return super().find_hparam(keys, *args, **kwargs) + + def set_vocab(self): + self._set_vocab_gpt2() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + tensors = list(super().modify_tensors(data_torch, name, bid)) + tensor = tensors[0][1] + + if "down_proj" in name: + tensor = tensor * self.mlp_multipliers[1] + elif "gate_proj" in name: + tensor = tensor * self.mlp_multipliers[0] + elif "k_proj" in name: + tensor = tensor * self.key_multiplier * self.attention_in_multiplier + elif "q_proj" in name: + tensor = tensor * self.attention_in_multiplier + elif "v_proj" in name: + tensor = tensor * self.attention_in_multiplier + elif "o_proj" in name: + tensor = tensor * self.attention_out_multiplier + elif "out_proj" in name: + tensor = tensor * self.ssm_out_multiplier + elif "in_proj" in name: + tensor = tensor * self.ssm_in_multiplier + zxbcdt_multipliers = self.hparams["ssm_multipliers"] + intermediate_size = self.hparams["mamba_d_ssm"] + groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] + tensor[:intermediate_size, :] *= zxbcdt_multipliers[0] + tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1] + tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2] + tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3] + tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4] + elif "lm_head" in name: + tensor = tensor * self.hparams["lm_head_multiplier"] + elif "embed_tokens" in name: + tensor = tensor * self.hparams["embedding_multiplier"] + elif "mamba.norm" in name: + tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group) + + tensors = [(tensors[0][0], tensor)] + return tensors + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + ## General Params ## + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + # Override some Mamba2 defaults + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + + ## Attention params ## + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2 + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_key_length(self.hparams["head_dim"]) + self.gguf_writer.add_value_length(self.hparams["head_dim"]) + + ## Validation ## + assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" + assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}" + + # Add any other Falcon Mamba2 specific configuration + self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"]) diff --git a/conversion/gemma.py b/conversion/gemma.py new file mode 100644 index 00000000000..76beedcf0d3 --- /dev/null +++ b/conversion/gemma.py @@ -0,0 +1,841 @@ +from __future__ import annotations + +import json +import re + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GemmaForCausalLM") +class GemmaModel(TextModel): + model_arch = gguf.MODEL_ARCH.GEMMA + + def set_vocab(self): + self._set_vocab_sentencepiece() + + # TODO: these special tokens should be exported only for the CodeGemma family + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab._set_special_token("prefix", 67) + special_vocab._set_special_token("suffix", 69) + special_vocab._set_special_token("middle", 68) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. + if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma2ForCausalLM") +class Gemma2Model(TextModel): + model_arch = gguf.MODEL_ARCH.GEMMA2 + + def set_vocab(self): + self._set_vocab_sentencepiece() + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_attn_logit_softcapping( + self.hparams["attn_logit_softcapping"] + ) + self.gguf_writer.add_final_logit_softcapping( + self.hparams["final_logit_softcapping"] + ) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. + if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(TextModel): + model_arch = gguf.MODEL_ARCH.GEMMA3 + + def norm_shift(self, name: str) -> float: + return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value + + def set_vocab(self): + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_space_prefix(False) + else: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + # some default values are not specified in the hparams + self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) + self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) + self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers + # attn_logit_softcapping is removed in Gemma3 + assert hparams.get("attn_logit_softcapping") is None + if (final_logit_softcap := hparams.get("final_logit_softcapping")): + self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) + if hparams.get("sliding_window_pattern") != 1: + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # remove OOV (out-of-vocabulary) rows in token_embd + if "embed_tokens.weight" in name: + n_vocab_real = -1 + if (self.dir_model / "tokenizer.model").is_file(): + tokens = self._create_vocab_sentencepiece()[0] + n_vocab_real = len(tokens) + else: + with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"]) + data_torch = data_torch[:n_vocab_real] + + # ref code in Gemma3RMSNorm + # output = output * (1.0 + self.weight.float()) + # note: this is not the case on gemma3n + f_shift = self.norm_shift(name) + if f_shift != 0.0: + data_torch = data_torch + f_shift + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma3TextModel") +class EmbeddingGemma(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING + module_paths = [] + dense_features_dims = {} + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.sentence_transformers_dense_modules: + # read modules.json to determine if model has Dense layers + modules_file = self.dir_model / "modules.json" + if modules_file.is_file(): + with open(modules_file, encoding="utf-8") as modules_json_file: + mods = json.load(modules_json_file) + for mod in mods: + if mod["type"].endswith("Dense"): + mod_path = mod["path"] + # check if model.safetensors file for Dense layer exists + model_tensors_file = self.dir_model / mod_path / "model.safetensors" + if model_tensors_file.is_file(): + self.module_paths.append(mod_path) + # read config.json of the Dense layer to get in/out features + mod_conf_file = self.dir_model / mod_path / "config.json" + if mod_conf_file.is_file(): + with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file: + mod_conf = json.load(mod_conf_json_file) + # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights + prefix = self._get_dense_prefix(mod_path) + if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None: + self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"]) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + from safetensors.torch import load_file + module_paths = list(self.module_paths) + for i, module_path in enumerate(module_paths): + tensors_file = self.dir_model / module_path / "model.safetensors" + local_tensors = load_file(tensors_file) + tensor_name = self._get_dense_prefix(module_path) + for name, local_tensor in local_tensors.items(): + if not name.endswith(".weight"): + continue + orig_name = name.replace("linear", tensor_name) + name = self.map_tensor_name(orig_name) + yield name, local_tensor.clone() + + @staticmethod + def _get_dense_prefix(module_path) -> str: + """Get the tensor name prefix for the Dense layer from module path.""" + tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3" + return tensor_name + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Override the sliding window size as it gets adjusted by the Gemma3TextConfig + # constructor. We want to use the value from the original model's config.json. + # ref: https://github.com/huggingface/transformers/pull/40700 + with open(self.dir_model / "config.json", "r", encoding="utf-8") as f: + config = json.load(f) + orig_sliding_window = config.get("sliding_window") + if orig_sliding_window is None: + raise ValueError("sliding_window not found in model config - this is required for the model") + + logger.info(f"Using original sliding_window from config: {orig_sliding_window} " + f"instead of {self.hparams['sliding_window']}") + self.gguf_writer.add_sliding_window(orig_sliding_window) + if self.sentence_transformers_dense_modules: + for dense, dims in self.dense_features_dims.items(): + logger.info(f"Setting dense layer {dense} in/out features to {dims}") + self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1]) + + self._try_set_pooling_type() + + +@ModelBase.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(MmprojModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3) + # default values below are taken from HF transformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + # calculate proj_scale_factor (used by tinygemma3 test model) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + n_per_side = int(image_seq_length ** 0.5) + image_size = self.hparams["image_size"] + patch_size = self.hparams["patch_size"] + proj_scale_factor = (image_size // patch_size) // n_per_side + if proj_scale_factor > 0 and proj_scale_factor != 4: + # we only need to write this if it's not the default value + # in this case, we are converting a test model + self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "vision_model.head." in name: + # skip redundant tensors for tinygemma3 + return None + + if not name.startswith(("multi_modal_projector.", "vision_tower.", "multimodal_projector.", "vision_model.")): + return None + + name = name.replace("_weight", ".weight") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +class ConformerAudioModel(MmprojModel): + _batch_norm_tensors: list[dict[str, Tensor]] | None = None + + @staticmethod + def is_audio_tensor(name: str): + return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ConformerAudioModel.is_audio_tensor(name): + if ".conv" in name or "_conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + + if len(self._batch_norm_tensors[bid]) < 5: + return + + weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] + bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] + running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] + running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] + eps = 1e-5 # default value + + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid) + yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid) + return + + # reshape conv weights + if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): + data_torch = data_torch[:, None, None] + if "conv.depthwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[1] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) + if "conv.pointwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[2] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) + + mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) + yield (mapped_name, data_torch) + + +@ModelBase.register("Gemma3nForConditionalGeneration") +class Gemma3nVisionAudioModel(ConformerAudioModel): + has_audio_encoder = True + has_vision_encoder = True + + # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py) + # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py + block_tensor_mapping = { + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight", + "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", + } + + def __init__(self, *args, **kwargs): + # Parent init will call find_hparam which now returns 0 for empty keys + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4 + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) + + # MobileNetV5 does not use image_mean/std + self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0] + self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0] + self.hparams_vision["image_size"] = self.preprocessor_config.get( + "size", {"height": 768, "width": 768} + )["height"] + + # Image sequence length (256 tokens = 16x16 for Gemma3n) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + image_size = self.hparams_vision["image_size"] + self.hparams_vision["patch_size"] = image_size // image_seq_length + + # remap audio hparams + assert self.hparams_audio is not None + self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"] + self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"] + self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # vision params + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + + # audio params + assert self.hparams_audio is not None + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # Force quantization settings for specific tensor types + if "input_projection" in name or "input_proj" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." in name or "stem" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def custom_map(self, name: str) -> str: + """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping.""" + parts = name.split(".") + # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix + if len(parts) >= 7: + bid, sid = parts[4], parts[5] + suffix = ".".join(parts[6:]) + template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}" + if template in self.block_tensor_mapping: + return self.block_tensor_mapping[template].format(bid=bid, sid=sid) + + raise ValueError(f"Unknown name: {name}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if (ConformerAudioModel.is_audio_tensor(name)): + name = name.replace("model.audio_tower.conformer.", "conformer.layers.") + yield from super().modify_tensors(data_torch, name, bid) + + # Gemma3n uses + # - model.embed_vision.* for projection layers + # - model.vision_tower.* for vision encoder + # Skip non-vision tensors + if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): + return + + if name.startswith("model.vision_tower.timm_model.blocks."): + # Double-indexed block tensors through custom logic + yield (self.custom_map(name), data_torch) + return + else: + # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py + new_name = self.map_tensor_name(name) + + if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): + data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] + + yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) + + +@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") +class Gemma3NModel(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA3N + + _altup_proj: list[Tensor] = [] + _altup_unembd: list[Tensor] = [] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs" + self._altup_proj = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + self._altup_unembd = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + + def norm_shift(self, name: str) -> float: + del name + return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code + + def set_vocab(self): + # For Gemma3n multimodal models, we need the FULL vocab_size (262400) + # which includes special tokens from 262144-262399 for vision/audio. + # The vocab_size_per_layer_input (262144) is only the embedding size per layer. + # Temporarily override the hparams lookup order to prioritize vocab_size. + + # Store original vocab_size_per_layer_input if it exists + vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input") + + # Temporarily remove vocab_size_per_layer_input to force using vocab_size + if vocab_size_per_layer_input is not None: + del self.hparams["vocab_size_per_layer_input"] + + # Call parent set_vocab which will now use vocab_size (262400) + super().set_vocab() + + # Restore vocab_size_per_layer_input for later use + if vocab_size_per_layer_input is not None: + self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) + self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"]) + self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"]) + self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"]) + + activation_sparsity_scale = [] + for s in self.hparams["activation_sparsity_pattern"]: + normal_dist = torch.distributions.normal.Normal(0, 1) + std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32)) + activation_sparsity_scale.append(std_multiplier.item()) + self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale) + + sliding_window_pattern = [] + for t in self.hparams["layer_types"]: + sliding_window_pattern.append(t == "sliding_attention") + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None: + has_all = all(m.numel() > 0 for m in matrices) + if not has_all: + return None + else: + return torch.stack(matrices, dim=0) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith("_scale"): + name = name + ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # TODO: implement self.prediction_coefs.weight.clamp_(...) + + # Pad token embeddings for vision/audio special tokens (262144-262399) + if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: + # Move to CPU to avoid meta device issues during padding + data_torch = data_torch.to(device="cpu") + + vocab_size = self.hparams.get("vocab_size", 262400) + current_size = data_torch.shape[0] # First dimension is vocab_size + + if current_size < vocab_size: + # Pad with zeros for vision/audio tokens (they get embeddings from vision tower) + padding_size = vocab_size - current_size + tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings" + logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)") + + # Create padding with zeros (vision tokens won't use these embeddings) + padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device) + data_torch = torch.cat([data_torch, padding], dim=0) + + # Continue with normal processing + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + + if "altup_unembed_projections" in name: + data_torch = data_torch.to(device="cpu") + # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based + # They should NOT be padded + if ".0." in name: + self._altup_unembd[0] = data_torch + elif ".1." in name: + self._altup_unembd[1] = data_torch + elif ".2." in name: + self._altup_unembd[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_unembd) + if out is not None: + yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid) + return + else: + return + + if "altup_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_proj[0] = data_torch + elif ".1." in name: + self._altup_proj[1] = data_torch + elif ".2." in name: + self._altup_proj[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_proj) + if out is not None: + yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma4ForConditionalGeneration", "Gemma4ForCausalLM") +class Gemma4Model(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA4 + + def norm_shift(self, name: str) -> float: + del name # unused + return 0.0 + + def set_vocab(self): + vocab = gguf.LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + visible_tokens = {"<|channel>", "<channel|>", "<|tool_call>", "<tool_call|>", "<|tool_response>", "<tool_response|>", "<|\"|>"} + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + text_str = text.decode() + if text_str in visible_tokens: + # always render these tokens, so that the chat parser can read them + toktypes.append(gguf.TokenType.USER_DEFINED) + logger.info(f"Token '{text_str}' is set to USER_DEFINED") + else: + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("gemma4") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + self.gguf_writer.add_add_space_prefix(False) + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + num_kv_shared_layers = self.hparams["num_kv_shared_layers"] + self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers) + + # per-layer embedding is optional + n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0 + self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd) + + swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]] + self.gguf_writer.add_sliding_window_pattern(swa_layers) + + head_dim_full = self.hparams["global_head_dim"] + head_dim_swa = self.hparams["head_dim"] + # correct the head dim for global/swa layers + self.gguf_writer.add_key_length(head_dim_full) + self.gguf_writer.add_value_length(head_dim_full) + self.gguf_writer.add_key_length_swa(head_dim_swa) + self.gguf_writer.add_value_length_swa(head_dim_swa) + + expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"]) + if expert_intermediate_size is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers + use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False) + first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers + if use_double_wide_mlp: + n_ff = self.hparams["intermediate_size"] + n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)] + self.gguf_writer.add_feed_forward_length(n_ff_arr) + + # handle num_global_key_value_heads + num_key_value_heads_full = self.hparams.get("num_global_key_value_heads") + num_key_value_heads_swa = self.hparams.get("num_key_value_heads") + if num_key_value_heads_full is not None and num_key_value_heads_swa is not None: + value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers] + self.gguf_writer.add_head_count_kv(value_arr) + + # handle n_rot differently for global vs swa layers + partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0) + n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors + n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa) + self.gguf_writer.add_rope_dimension_count(n_rot_full) + self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # full layer uses "proportional" rope with partial_rotary_factor=0.25 + # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated), + # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention + # solution is to set specific freq_factors for the unrotated dims + + # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers + rope_params_full = self.hparams["rope_parameters"]["full_attention"] + assert rope_params_full["rope_type"] == "proportional" + head_dim_full = (self.hparams["global_head_dim"]) + partial_rotary_factor_full = rope_params_full["partial_rotary_factor"] + n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2) + n_unrot_full = int(head_dim_full / 2) - n_rot_full + values = [1.0] * n_rot_full + [1e30] * n_unrot_full + rope_freqs_full = torch.tensor(values, dtype=torch.float32) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full) + + def _generate_nvfp4_tensors(self): + # Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales + # each expert's contribution. It's mathematically equivalent to a per-expert + # scalar on the down_proj output, which is exactly where ffn_down_exps_s is + # applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the + # existing NVFP4 path produces the right scales. + n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 + for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]: + bid_match = re.search(r"\.layers\.(\d+)\.", name) + if bid_match is None: + continue + bid = bid_match.group(1) + prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")] + w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)] + present = [w2 in self.model_tensors for w2 in w2_targets] + if not any(present): + continue + assert all(present), f"layer {bid}: partial NVFP4 quantization across experts" + r = self.model_tensors.pop(name) + for e, w2 in enumerate(w2_targets): + s = self.model_tensors[w2] + self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i] + super()._generate_nvfp4_tensors() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith("per_dim_scale") or name.endswith("layer_scalar"): + name = name + ".weight" + if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("router.scale"): + name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale") + yield (name, data_torch) + return + if ".per_expert_scale" in name: + # convert per-expert scale to FFN down scale + name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale") + yield (name, data_torch) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Gemma4ForConditionalGeneration") +class Gemma4VisionAudioModel(MmprojModel): + has_audio_encoder = True + has_vision_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 224 # unused, but set to avoid error + + # remap audio hparams + if self.hparams_audio: + self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128) + self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 + else: + self.has_audio_encoder = False + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # vision params + assert self.hparams_vision is not None + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) + + # audio params + assert self.hparams_audio is not None + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6)) + + def is_audio_tensor(self, name: str) -> bool: + return "audio_tower" in name or "embed_audio" in name + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if self.is_audio_tensor(name): + if ".conv" in name or "_conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + if "position_embedding_table" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if len(data_torch.shape) == 0: + # convert scalar tensors (input/output_mix/max) to 1D tensors + data_torch = data_torch.unsqueeze(0) + + if self.is_audio_tensor(name): + assert self.hparams_audio is not None + name = name.replace("model.audio_tower.", "conformer.") + name = name.replace(".linear.", ".") + if name.endswith("per_dim_key_scale") or name.endswith("per_dim_scale"): + name = name + ".weight" + data_torch = torch.nn.functional.softplus(data_torch) + if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"): + assert data_torch.shape[1] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) + mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) + yield (mapped_name, data_torch) + + else: + name = name.replace("model.vision_tower.encoder.", "vision_model.model.") + name = name.replace(".linear.weight", ".weight") + if name.endswith("layer_scalar") or name.endswith("position_embedding_table"): + name = name + ".weight" + if name.endswith("patch_embedder.input_proj.weight"): + n_embd, ksize_sq_c = data_torch.shape + patch_size = int((ksize_sq_c // 3) ** 0.5) + data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3) + data_torch = data_torch.permute(0, 3, 1, 2).contiguous() + mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) + yield (mapped_name, data_torch) diff --git a/conversion/glm.py b/conversion/glm.py new file mode 100644 index 00000000000..641937720d6 --- /dev/null +++ b/conversion/glm.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + +from .deepseek import DeepseekV2Model + + +@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration") +class Glm4Model(TextModel): + model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5) + if "mrope_section" in self.rope_parameters: + self.use_mrope = True + logger.info("Q/K weight will need to be permuted for M-RoPE") + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor)) + + @staticmethod + def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: + orig_shape = weights.shape + if len(orig_shape) == 1: + weights = weights.unsqueeze(1) # [out_dim, 1] + if len(weights.shape) != 2: + raise ValueError("Only 1D and 2D tensors are supported.") + n_effective_heads = weights.shape[0] // head_dim + if n_head_kv is not None and n_effective_heads != n_head: + if n_effective_heads != n_head_kv: + raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}") + rotary_dim = int(head_dim * partial_rotary_factor) + if rotary_dim % 2 != 0: + raise ValueError("rotary_dim must be even.") + reshaped = weights.reshape(n_effective_heads, head_dim, -1) + rot_part = reshaped[:, :rotary_dim, :] + non_rot_part = reshaped[:, rotary_dim:, :] + permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1) + combined = torch.cat((permuted_rot, non_rot_part), dim=1) + result = combined.reshape(weights.shape) + return result if len(orig_shape) != 1 else result.squeeze(1) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.use_mrope: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + n_embd = self.hparams["hidden_size"] + head_dim = self.hparams.get("head_dim", n_embd // n_head) + # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("GlmOcrForConditionalGeneration") +class GlmOCRModel(Glm4Model): + model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + # Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP prediction layer + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # GLM-OCR has num_hidden_layers + 1 actual layers (including NextN layer) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + +@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") +class Glm4MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + return self._set_vocab_glm() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = ( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_rope_dimension_count( + int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) + ) + + # MoE parameters - Use only routed expert count (shared experts handled separately) + if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None: + self.gguf_writer.add_expert_count(n_routed_experts) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None: + self.gguf_writer.add_expert_shared_count(n_shared_experts) + if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None: + self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + + # Expert gating function (sigmoid for GLM4_MOE) + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + # Routed scaling factor + if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) + + # Normalise topk probabilities + if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None: + self.gguf_writer.add_expert_weights_norm(norm_topk_prob) + + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + _experts: list[dict[str, Tensor]] | None = None + + # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle main token embedding (but not layer-specific NextN embeddings) + if name == "model.embed_tokens.weight" and ".layers." not in name: + yield from super().modify_tensors(data_torch, "token_embd.weight", bid) + return + + # Handle routed experts + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("Glm4MoeLiteForCausalLM") +class Glm4MoeLiteModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + def set_vocab(self): + return self._set_vocab_glm() + + +@ModelBase.register("GlmMoeDsaForCausalLM") +class GlmMoeDsaModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.GLM_DSA + skip_mtp = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + return self._set_vocab_glm() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + rope_dim = self.hparams["qk_rope_head_dim"] + partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor)) + + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + # DSA indexer parameters + self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"]) + self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"]) + self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"]) + + +@ModelBase.register("SolarOpenForCausalLM") +class SolarOpenModel(Glm4MoeModel): + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"]) # ty: ignore[unresolved-attribute] + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute] + special_vocab.add_to_gguf(self.gguf_writer) diff --git a/conversion/gpt2.py b/conversion/gpt2.py new file mode 100644 index 00000000000..1cf06ae8b50 --- /dev/null +++ b/conversion/gpt2.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GPT2LMHeadModel") +class GPT2Model(TextModel): + model_arch = gguf.MODEL_ARCH.GPT2 + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["n_ctx"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # we don't need these + if name.endswith((".attn.bias", ".attn.masked_bias")): + yield from super().modify_tensors(data_torch, name, bid) + return + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + yield from super().modify_tensors(data_torch, new_name, bid) + + +@ModelBase.register("RuGPT3XLForCausalLM") +class RuGPT3XLModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPT2 + + _qkv_parts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Fuse separate Q, K, V projections into a single QKV tensor + if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name: + suffix = "weight" if name.endswith(".weight") else "bias" + part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v") + key = f"{part}.{suffix}" + + assert bid is not None + if self._qkv_parts is None: + self._qkv_parts = [{} for _ in range(self.block_count)] + self._qkv_parts[bid][key] = data_torch + + q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}" + if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]): + q = self._qkv_parts[bid].pop(q_key) + k = self._qkv_parts[bid].pop(k_key) + v = self._qkv_parts[bid].pop(v_key) + data_torch = torch.cat([q, k, v], dim=0) + name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}") + logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}") + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._qkv_parts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()] + if len(parts) > 0: + raise ValueError(f"Unprocessed Q/K/V parts: {parts}") diff --git a/conversion/gpt_oss.py b/conversion/gpt_oss.py new file mode 100644 index 00000000000..d2c70c0bba5 --- /dev/null +++ b/conversion/gpt_oss.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GptOssForCausalLM") +class GptOssModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPT_OSS + + # TODO: remove once MXFP4 is supported more generally + def dequant_model(self): + if self._is_mxfp4: + return + return super().dequant_model() + + def transform_nibble_layout(self, tensor): + assert tensor.dtype == torch.uint8 + assert tensor.shape[-1] == 16 + # swap nibbles + t_lo = tensor & 0x0F + t_hi = tensor & 0xF0 + t_swapped = (t_lo << 4) | (t_hi >> 4) + tensor = t_swapped + # transform aaaa...bbbb... to abababab... + blk_a, blk_b = tensor.chunk(2, dim=-1) + # get a_ + blk_a0 = (blk_a & 0xF0).view(-1, 1) + blk_a1 = (blk_a << 4).view(-1, 1) + blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape) + # get _b + blk_b0 = (blk_b >> 4).view(-1, 1) + blk_b1 = (blk_b & 0x0F).view(-1, 1) + blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape) + # swap once more + out = blk_a | blk_b + out_h = out & 0xF0 + out_l = out & 0x0F + out = (out_h >> 4) | (out_l << 4) + return out + + def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor): + assert blocks.dtype == torch.uint8 + assert scales.dtype == torch.uint8 + scales = scales.unsqueeze(-1) + assert len(blocks.shape) == 4 + assert len(scales.shape) == 4 + blocks = self.transform_nibble_layout(blocks) + new_data = torch.concat((scales, blocks), dim=-1) + new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32] + logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4") + # flatten last dim + new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3]) + new_data = new_data.numpy() + self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + blocks0: Tensor = torch.zeros(1) + blocks1: Tensor = torch.zeros(1) + # we assume that tensors are loaded in the correct order + for name, data_torch in self.get_tensors(): + if "mlp.experts.down_proj_blocks" in name: + blocks0 = data_torch + elif "mlp.experts.down_proj_scales" in name: + new_name = self.map_tensor_name(name.replace("_scales", ".weight")) + self.repack_mxfp4(new_name, blocks0, data_torch) + elif "mlp.experts.gate_up_proj_blocks" in name: + blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :] + elif "mlp.experts.gate_up_proj_scales" in name: + scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :] + new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight")) + new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight")) + self.repack_mxfp4(new_name_gate, blocks0, scales0) + self.repack_mxfp4(new_name_up, blocks1, scales1) + return [] + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "sinks" in name: + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # correct naming for down_proj + if "down_proj" in name: + if name.endswith("_bias"): + name = name.replace("down_proj_bias", "down_proj.bias") + elif "_blocks" not in name and "_scales" not in name: + logger.warning(f"{name} is not in MXFP4, performance may be degraded") + name = name.replace("down_proj", "down_proj.weight") + data_torch = data_torch.transpose(-1, -2) + else: + # otherwise, it should already be repacked to ggml MXFP4 format + return + + # split the gate_up into gate and up + if "gate_up_proj" in name: + if name.endswith("_bias"): + name_up = name.replace("gate_up_proj_bias", "up_proj.bias") + name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") + gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2] + yield from super().modify_tensors(gate_proj_bias, name_gate, bid) + yield from super().modify_tensors(up_proj_bias, name_up, bid) + elif "_blocks" not in name and "_scales" not in name: + logger.warning(f"{name} is not in MXFP4, performance may be degraded") + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + data_torch = data_torch.transpose(-1, -2) + gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"]) diff --git a/conversion/gptneox.py b/conversion/gptneox.py new file mode 100644 index 00000000000..6a42b12b15a --- /dev/null +++ b/conversion/gptneox.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import re + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GPTNeoXForCausalLM") +class GPTNeoXModel(TextModel): + model_arch = gguf.MODEL_ARCH.GPTNEOX + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + assert n_head is not None + assert n_embed is not None + + if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/granite.py b/conversion/granite.py new file mode 100644 index 00000000000..647269ba740 --- /dev/null +++ b/conversion/granite.py @@ -0,0 +1,328 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + +from .llama import LlamaModel +from .mamba import Mamba2Model + + +@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration") +class GraniteModel(LlamaModel): + """Conversion for IBM's GraniteForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE + + def set_gguf_parameters(self): + """Granite uses standard llama parameters with the following differences: + + - No head_dim support + - New multiplier params: + - attention_scale + - embedding_scale + - residual_scale + - logits_scaling + """ + if head_dim := self.hparams.pop("head_dim", None): + logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) + super().set_gguf_parameters() + # NOTE: Convert _multiplier params to _scale params for naming + # consistency + if attention_scale := self.hparams.get("attention_multiplier"): + self.gguf_writer.add_attention_scale(attention_scale) + logger.info("gguf: (granite) attention_scale = %s", attention_scale) + if embedding_scale := self.hparams.get("embedding_multiplier"): + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) + if residual_scale := self.hparams.get("residual_multiplier"): + self.gguf_writer.add_residual_scale(residual_scale) + logger.info("gguf: (granite) residual_scale = %s", residual_scale) + if logits_scale := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scale) + logger.info("gguf: (granite) logits_scale = %s", logits_scale) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if name.startswith("encoder."): + return None + return super().filter_tensors(item) + + +@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM") +class GraniteMoeModel(GraniteModel): + """Conversion for IBM's GraniteMoeForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE_MOE + + def set_gguf_parameters(self): + """GraniteMoeShared uses GraniteMoe parameters plus the following: + - shared_intermediate_size + """ + super().set_gguf_parameters() + if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"): + self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length) + logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + """In modeling_granitemoe, the JetMoe implementation of parallel experts + is used. This essentially merges w1 and w3 into a single tensor with 2x + the hidden size that is then split during forward. To keep compatibility + with existing mixtral support, we pull them apart here. + """ + + if name.endswith("block_sparse_moe.input_linear.weight"): + ffn_dim = self.hparams["intermediate_size"] + assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" + gate, up = data_torch.split(ffn_dim, dim=-2) + yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) + return + + has_experts = bool(self.hparams.get('num_local_experts')) + + if name.endswith("shared_mlp.input_linear.weight"): + ffn_dim = self.hparams["shared_intermediate_size"] + assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size" + gate, up = data_torch.split(ffn_dim, dim=-2) + if has_experts: + yield from ModelBase.modify_tensors(self, gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) + return + yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return + + if not has_experts and name.endswith("shared_mlp.output_linear.weight"): + yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM") +class GraniteHybridModel(Mamba2Model, GraniteMoeModel): + """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM + layers and optionally uses MoE w/ a shared expert""" + model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID + undo_permute = True + + def __init__(self, *args, **kwargs): + + # Hybrid mamba models use a prefix for the mamba-specific params. + # TODO: Extend this if the prefix(es) need to be configurable + self.hparam_prefixes = ["mamba"] + + super().__init__(*args, **kwargs) + + # Lists of which layers use ssm vs attention + self._attn_layers = self.get_attn_layers() + self._ssm_layers = [ + i for i in range(self.block_count) + if i not in self._attn_layers + ] + + # There are some models in this family that are non-hybrid, but keep the + # same parent class by setting all layers to "attention." If this is the + # case, the model architecture needs to be updated to a standard + # "granite" or "granitemoe" model + if not self._ssm_layers: + has_experts = self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True) + new_arch = ( + gguf.MODEL_ARCH.GRANITE_MOE + if has_experts else + gguf.MODEL_ARCH.GRANITE + ) + self.model_arch = new_arch + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch] + self.gguf_writer.add_architecture() + + # n_group and d_inner are used during reshape_tensors for mamba2 + # NOTE: Explicitly include hparam prefix prefix for d_model to + # disambiguate with top-level head_dim + # NOTE 2: If needed for future models, this can be isolated in a method + # to separate the prefix setting and the keys used + self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"]) + self.n_group = self.find_hparam(["n_groups", "num_groups"]) + self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model + + def get_attn_layers(self): + # Explicit list of layer type names + if layer_types := self.hparams.get("layer_types"): + return [ + i for i, typ in enumerate(layer_types) + if typ == "attention" + ] + + # Layer types indicated by index or period + attn_layers = self.hparams.get("attn_layer_indices", []) + if not attn_layers: + attn_period = self.hparams.get("attn_layer_period") + assert attn_period, "Didn't find attn_layer_indices or attn_layer_period" + attn_offset = self.hparams.get("attn_layer_offset") + assert attn_offset is not None, "No attention layer offset set with attn_layer_period" + attn_layers = [ + i for i in range(self.block_count) + if i % attn_period == attn_offset + ] + return attn_layers + + def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: + prefixed = [] + for pfx in self.hparam_prefixes: + prefixed.extend( + "_".join([pfx, k]) + for k in keys + ) + keys = list(keys) + prefixed + return Mamba2Model.find_hparam(self, keys, *args, **kwargs) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if ( + name.endswith("block_sparse_moe.input_linear.weight") + or "shared_mlp" in name + ): + yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) + return + + # Determine whether this is a mamba layer or an attention layer + if bid in self._ssm_layers: + yield from Mamba2Model.modify_tensors(self, data_torch, name, bid) + return + elif bid in self._attn_layers: + yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) + return + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + + def set_gguf_parameters(self): + """This method merges params from both parents and some that are + specific to this model. The result is some duplication of how the params + get set. The following warnings are expected during conversion: + + WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv' + WARNING:Duplicated key name 'granitehybrid.context_length' + """ + GraniteMoeModel.set_gguf_parameters(self) + + ## Mamba mixer params ## + self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) + self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"])) + self.gguf_writer.add_ssm_group_count(self.n_group) + self.gguf_writer.add_ssm_inner_size(self.d_inner) + # NOTE: The mamba_dt_rank is _not_ the right field for how this is used + # in llama.cpp + self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"])) + + ## Attention params ## + head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) + head_count_kv_vec = [ + head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count) + ] + if rope_dim := self.hparams.get("attn_rotary_emb"): + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_head_count_kv(head_count_kv_vec) + + ## If Bamba or non-hybrid, use rope, otherwise don't + use_rope = ( + "BambaForCausalLM" in self.hparams["architectures"] + or not self._ssm_layers + ) + self.gguf_writer.add_rope_scaling_finetuned(use_rope) + if not use_rope: + self.gguf_writer.add_context_length(2**20) + + ## Validation ## + d_head = self.find_hparam(["d_head"], optional=True) or 64 + assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" + assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" + + def set_vocab(self): + self.hparams["pad_vocab_size_multiple"] = 8 + Mamba2Model.set_vocab(self) + + +@ModelBase.register("GraniteSpeechForConditionalGeneration") +class GraniteSpeechMmprojModel(MmprojModel): + has_vision_encoder = False + has_audio_encoder = True + + _batch_norm_tensors: list[dict[str, Tensor]] | None = None + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("encoder_config") + + def set_gguf_parameters(self): + assert self.hparams_audio is not None + a = self.hparams_audio + a["hidden_size"] = a["hidden_dim"] + a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"] + a["num_attention_heads"] = a["num_heads"] + a["num_hidden_layers"] = a["num_layers"] + + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH) + self.gguf_writer.add_audio_num_mel_bins(a["input_dim"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + self.gguf_writer.add_audio_chunk_size(a["context_size"]) + self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"]) + self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"]) + + p = self.global_config + self.gguf_writer.add_audio_projector_window_size(p["window_size"]) + self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"]) + self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if "encoder" in name or "projector" in name: + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if "attention_dists" in name or "num_batches_tracked" in name: + return None + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name and "encoder.layers." in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + if len(self._batch_norm_tensors[bid]) < 4: + return + prefix = f"encoder.layers.{bid}.conv.batch_norm" + weight = self._batch_norm_tensors[bid][f"{prefix}.weight"] + bias = self._batch_norm_tensors[bid][f"{prefix}.bias"] + running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"] + running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"] + eps = 1e-5 + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid) + yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid) + return + + if ".attn.to_kv.weight" in name: + k_weight, v_weight = data_torch.chunk(2, dim=0) + yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid) + yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid) + return + + if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"): + if data_torch.ndim == 3 and data_torch.shape[2] == 1: + data_torch = data_torch.squeeze(2) + + if "depth_conv" in name and name.endswith(".weight"): + if data_torch.ndim == 3 and data_torch.shape[1] == 1: + data_torch = data_torch.squeeze(1) + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/grok.py b/conversion/grok.py new file mode 100644 index 00000000000..9098e514a3a --- /dev/null +++ b/conversion/grok.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import sys + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") +class GrokModel(TextModel): + model_arch = gguf.MODEL_ARCH.GROK + + def set_vocab(self): + if (self.dir_model / 'tokenizer.model').is_file(): + self._set_vocab_sentencepiece() + return + + if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file(): + logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer') + sys.exit(1) + + self._set_vocab_gpt2() + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0)) + self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0)) + if (final_logit_softcap := self.hparams.get("final_logit_softcapping")): + self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) + + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + # Treat "original" as "yarn", seems to have been a mistake + if self.hparams.get("rope_type") in ("yarn", "original"): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"]) + self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"]) + self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"]) + self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"]) + + if temp_len := self.hparams.get("attn_temperature_len"): + self.gguf_writer.add_attn_temperature_length(temp_len) + + self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5)) + self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"]) + self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"]) + + _experts: list[dict[str, list[Tensor]]] | None = None + _cur_expert = "" + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + deferred: list[tuple[Tensor, str, int | None]] = [] + is_expert = ".moe." in name or ".block_sparse_moe.experts." in name + + if not is_expert: + deferred.append((data_torch, name, bid)) + + # process the experts separately + if is_expert or self._cur_expert: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + # concatenate split tensors + if name in self._experts[bid]: + self._cur_expert = name + self._experts[bid][name].append(data_torch) + return + elif is_expert: + self._cur_expert = name + self._experts[bid][name] = [data_torch] + return + else: + self._cur_expert = "" + + for bid in range(self.block_count): + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight" + if ename not in self._experts[bid]: + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight" + tensor_list = self._experts[bid][ename] + datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + + for t in deferred: + yield from super().modify_tensors(*t) diff --git a/conversion/grovemoe.py b/conversion/grovemoe.py new file mode 100644 index 00000000000..a8be931cb90 --- /dev/null +++ b/conversion/grovemoe.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") +class GroveMoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.GROVEMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299 + self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128) + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298 + self.gguf_writer.add_experts_per_group(2) + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 + self.gguf_writer.add_expert_group_scale(0.05) + + _experts: list[dict[str, Tensor]] | None = None + _chunk_experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".expert_bias"): + # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 + return + + # process the experts separately + if name.find("chunk_experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) // 2 # see add_experts_per_group + assert bid is not None + + if self._chunk_experts is None: + self._chunk_experts = [{} for _ in range(self.block_count)] + + self._chunk_experts[bid][name] = data_torch + + if len(self._chunk_experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight" + datas.append(self._chunk_experts[bid][ename]) + del self._chunk_experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + elif name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._chunk_experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + chunk_experts = [k for d in self._chunk_experts for k in d.keys()] + if len(chunk_experts) > 0: + raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}") + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/hunyuan.py b/conversion/hunyuan.py new file mode 100644 index 00000000000..537f023aa01 --- /dev/null +++ b/conversion/hunyuan.py @@ -0,0 +1,357 @@ +from __future__ import annotations + +import json + +from pathlib import Path +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .qwen import QwenModel + + +@ModelBase.register("HunYuanMoEV1ForCausalLM") +class HunYuanMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE + + def set_vocab(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + # 1. Get the pre-tokenizer identifier hash + tokpre = self.get_vocab_base_pre(tokenizer) + + # 2. Reverse-engineer the merges list from mergeable_ranks + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: # todo this is an assert in Qwen, why? + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # 3. Generate the tokens and toktypes lists + vocab_size = self.hparams["vocab_size"] + assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + # 4. Write all vocab-related fields to the GGUF writer + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + # 5. Add special tokens and chat templates + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # FIX for BOS token: Overwrite incorrect id read from config.json + self.gguf_writer.add_bos_token_id(127959) # <|bos|> + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"]) + + moe_intermediate_size = hparams["moe_intermediate_size"] + assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size) + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0]) + + moe_topk = hparams["moe_topk"] + assert all(topk == moe_topk[0] for topk in moe_topk) + self.gguf_writer.add_expert_used_count(moe_topk[0]) + + moe_shared_expert = hparams["num_shared_expert"] + assert all(n == moe_shared_expert[0] for n in moe_shared_expert) + self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) + + # Rope + if self.rope_parameters.get("rope_type") == "dynamic": + # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) + alpha = self.rope_parameters.get("alpha", 1000) + base = self.rope_parameters.get("rope_theta", 10000.0) + dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 + scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 + self.gguf_writer.add_rope_freq_base(scaled_base) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(1) + # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k + self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length + self.gguf_writer.add_context_length(256 * 1024) # 256k context length + + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated + assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ + "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "lm_head.weight": + if self.hparams.get("tie_word_embeddings", False): + logger.info("Skipping tied output layer 'lm_head.weight'") + return + + if name.find("mlp.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("HunYuanDenseV1ForCausalLM") +class HunYuanModel(TextModel): + model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE + + def _get_eod_token_id(self) -> int | None: + """Get the actual end-of-generation token from config (eod_token_id).""" + return self.hparams.get("eod_token_id") + + def _get_eot_token_id(self) -> int | None: + """Get the end-of-turn token from generation_config.json. + This is the first entry in eos_token_id when it's a list.""" + gen_cfg_path = self.dir_model / "generation_config.json" + if gen_cfg_path.is_file(): + with open(gen_cfg_path, encoding="utf-8") as f: + gen_cfg = json.load(f) + eos = gen_cfg.get("eos_token_id") + if isinstance(eos, list) and len(eos) >= 2: + return eos[0] + return None + + def _fix_special_tokens(self): + """Fix EOS/EOT tokens that are incorrect in upstream configs.""" + eod_id = self._get_eod_token_id() + if eod_id is not None: + self.gguf_writer.add_eos_token_id(eod_id) + eot_id = self._get_eot_token_id() + if eot_id is not None: + self.gguf_writer.add_eot_token_id(eot_id) + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1; + # guard SpecialVocab so it doesn't try to emit an invalid pad id. + token_types = None + if (self.hparams.get("pad_token_id") or 0) < 0: + token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask') + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types) + special_vocab.add_to_gguf(self.gguf_writer) + self._fix_special_tokens() + else: + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + # 1. Get the pre-tokenizer identifier hash + tokpre = self.get_vocab_base_pre(tokenizer) + + # 2. Reverse-engineer the merges list from mergeable_ranks + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # 3. Generate the tokens and toktypes lists + vocab_size = self.hparams["vocab_size"] + assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + # 4. Write all vocab-related fields to the GGUF writer + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + # 5. Add special tokens and chat templates + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # FIX for BOS token: Overwrite incorrect id read from config.json + if self.hparams['hidden_size'] == 4096: + self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token + self._fix_special_tokens() + + def set_gguf_parameters(self): + # Some HunYuanVL variants set num_experts=1 (not real MoE); + # prevent the parent class from emitting expert_count metadata in that case. + saved_num_experts = self.hparams.pop("num_experts", None) + super().set_gguf_parameters() + if saved_num_experts is not None and saved_num_experts > 1: + self.hparams["num_experts"] = saved_num_experts + hparams = self.hparams + + # Rope + if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"): + # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) + alpha = self.rope_parameters.get("alpha", 50) + base = self.rope_parameters.get("rope_theta", 10000.0) + dim = hparams["head_dim"] + scaled_base = base * (alpha ** (dim / (dim - 2))) + self.gguf_writer.add_rope_freq_base(scaled_base) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(1) + if self.rope_parameters.get("rope_type") == "dynamic": + # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k + self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length + self.gguf_writer.add_context_length(256 * 1024) # 256k context length + + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated + assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ + "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "lm_head.weight": + if self.hparams.get("tie_word_embeddings", False): + logger.info("Skipping tied output layer 'lm_head.weight'") + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # HunyuanVL uses max_image_size instead of image_size + if "image_size" not in self.hparams_vision: + self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + vcfg = self.hparams_vision + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) + self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) + self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("vit."): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # strip CLS token (row 0) from position embeddings so resize_position_embeddings works + if "position_embedding" in name: + data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd] + yield from super().modify_tensors(data_torch, name, bid) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal + # HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. + if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLTextModel(HunYuanModel): + model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + + def __init__(self, dir_model: Path, *args, **kwargs): + super().__init__(dir_model, *args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # XD-RoPE metadata for the HunyuanVL; + if self.rope_parameters.get("rope_type") != "xdrope": + return + + self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) + self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1))) + + ctx_len = int(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len) + self.gguf_writer.add_context_length(ctx_len) + + self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"])) diff --git a/conversion/internlm.py b/conversion/internlm.py new file mode 100644 index 00000000000..7e11aca3ce0 --- /dev/null +++ b/conversion/internlm.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import json +import sys + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + +from .llama import LlamaModel + + +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.INTERNLM2 + + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + logger.warning(f"InternLM2 convert token '{text}' to '๐Ÿ‰'!") + text = "๐Ÿ‰".encode("utf-8") + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + # take care of ununsed raw token + if piece.startswith('[UNUSED'): + toktype = SentencePieceTokenTypes.UNUSED + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + chat_eos_token = '<|im_end|>' + chat_eos_token_id = None + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = foken_data["content"] + if token == chat_eos_token: + chat_eos_token_id = token_id + token = token.encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"] + if token == chat_eos_token: + chat_eos_token_id = token_id + token = token.encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if chat_eos_token_id is not None: + # For the chat model, we replace the eos with '<|im_end|>'. + # TODO: this is a hack, should be fixed + # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 + special_vocab.special_token_ids["eos"] = chat_eos_token_id + logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" + " in chat mode so that the conversation can end normally.") + + special_vocab.add_to_gguf(self.gguf_writer) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_heads = self.hparams["num_attention_heads"] + num_kv_heads = self.hparams["num_key_value_heads"] + n_embd = self.hparams["hidden_size"] + q_per_kv = num_heads // num_kv_heads + head_dim = n_embd // num_heads + num_groups = num_heads // q_per_kv + + if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: + qkv = data_torch + + qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) + q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] + + # The model weights of q and k equire additional reshape. + q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + v = v.reshape((-1, v.shape[-1])) + + yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("InternLM3ForCausalLM") +class InternLM3Model(TextModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + tokens, scores, toktypes = self._create_vocab_sentencepiece() + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + if "added_tokens_decoder" in tokenizer_config_json: + for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): + if token_data.get("special"): + token_id = int(token_id) + token = token_data["content"] + special_vocab._set_special_token(token, token_id) + # update eos token + if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: + special_vocab.special_token_ids["eos"] = token_id + + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("mlp", "vision_model")): + # skip visual tensors + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/internvl.py b/conversion/internvl.py new file mode 100644 index 00000000000..9a2a1e43df7 --- /dev/null +++ b/conversion/internvl.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("InternVisionModel") +class InternVisionModel(MmprojModel): + + min_dynamic_tiles: int = 0 + max_dynamic_tiles: int = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0) + self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0) + + def set_gguf_parameters(self): + assert self.hparams_vision is not None + if isinstance(self.hparams_vision['image_size'], list): + self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0] + if isinstance(self.hparams_vision['patch_size'], list): + self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0] + super().set_gguf_parameters() + + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) + # hidden_act + if hparams["hidden_act"] == "silu": + self.gguf_writer.add_vision_use_silu(True) + elif hparams["hidden_act"] == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + else: + raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") + # downsample_ratio + downsample_ratio = self.global_config.get("downsample_ratio") + assert downsample_ratio is not None + self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) + # older models may not have min/max_dynamic_patch in config + if self.min_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles) + if self.max_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector'] + if not any([name.startswith(prefix) for prefix in vision_prefix]): + return None + # deal with intern-s1 special case + names_map = { + "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias", + "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight", + "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias", + "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight", + "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias", + "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight", + } + if name in names_map: + name = names_map[name] + # correct name + if name.startswith("vision_model"): + name = "vision_tower." + name + if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # split QKV tensors if needed + if ".qkv." in name: + if data_torch.ndim == 2: # weight + c3, _ = data_torch.shape + else: # bias + c3 = data_torch.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = data_torch[:c] + wk = data_torch[c: c * 2] + wv = data_torch[c * 2:] + yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid) + yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid) + yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/jais.py b/conversion/jais.py new file mode 100644 index 00000000000..00add4c77fc --- /dev/null +++ b/conversion/jais.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import math + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("Jais2ForCausalLM") +class Jais2Model(TextModel): + model_arch = gguf.MODEL_ARCH.JAIS2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + head_dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"]) + self.gguf_writer.add_rope_dimension_count(head_dim) + + +@ModelBase.register("JAISLMHeadModel") +class JaisModel(TextModel): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + if 'mup_embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + self.max_alibi_bias = 8.0 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # we don't need these + if name.endswith((".attn.bias")): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(("relative_pe.slopes")): + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) + first_val = float(data_torch[0].item()) + self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + + return + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + yield from super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid) + else: + yield from super().modify_tensors(data_torch, new_name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) diff --git a/conversion/jamba.py b/conversion/jamba.py new file mode 100644 index 00000000000..da712ba5014 --- /dev/null +++ b/conversion/jamba.py @@ -0,0 +1,119 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("JambaForCausalLM") +class JambaModel(TextModel): + model_arch = gguf.MODEL_ARCH.JAMBA + + def set_vocab(self): + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + else: + self._set_vocab_llama_hf() + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) + d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 + d_inner = self.hparams["mamba_expand"] * d_model + d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6 + n_kv_head = self.hparams["num_key_value_heads"] + attn_offset = self.hparams["attn_layer_offset"] + attn_period = self.hparams["attn_layer_period"] + n_kv_vec = [0 for _ in range(attn_offset)] + [ + n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count) + ] + + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"])) + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(n_kv_vec) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) + self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) + self.gguf_writer.add_file_type(self.ftype) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # Mini-Jamba + name = name.replace(".moe.", ".feed_forward.") + if bid is not None: + moe_offset = self.hparams["expert_layer_offset"] + moe_period = self.hparams["expert_layer_period"] + + if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0): + name = name.replace(".experts.0.", ".") + + # process the experts separately + if ".feed_forward.experts." in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + + # merge the experts into a single 3d tensor + for wid in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + # using the same merged name as qwen2moe + merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + yield new_name, data_torch + return + + new_name = self.map_tensor_name(name) + + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + yield (new_name, data_torch) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/januspro.py b/conversion/januspro.py new file mode 100644 index 00000000000..b49691205cc --- /dev/null +++ b/conversion/januspro.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + +from .llama import LlamaModel + + +@ModelBase.register("JanusForConditionalGeneration") +class JanusProModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip vision, aligner, and generation tensors + skip_prefixes = ( + 'model.vision_model.', + 'model.aligner.', + 'model.vqmodel.', + 'model.generation_embeddings.', + 'model.generation_aligner.', + 'model.generation_head.', + ) + if name.startswith(skip_prefixes): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("JanusForConditionalGeneration") +class JanusProVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + if "intermediate_size" not in self.hparams_vision: + mlp_ratio = self.hparams_vision.get("mlp_ratio") + hidden_size = self.hparams_vision.get("hidden_size") + if mlp_ratio is not None and hidden_size is not None: + self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO) + + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) + + hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() + if hidden_act == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + + def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]: + """Map aligner tensors to projector format""" + suffix = ".bias" if name.endswith(".bias") else ".weight" + + if name.startswith("model.aligner."): + local_name = name[len("model.aligner."):] + elif name.startswith("aligner."): + local_name = name[len("aligner."):] + else: + raise ValueError(f"Unsupported Janus aligner prefix: {name}") + + if local_name.startswith("fc1."): + mm_index = 0 + elif local_name.startswith("hidden_layers."): + parts = local_name.split(".", 2) + if len(parts) < 3: + raise ValueError(f"Unexpected Janus aligner tensor name: {name}") + mm_index = int(parts[1]) + 1 + else: + raise ValueError(f"Unsupported Janus aligner tensor: {name}") + + tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix) + return [(tensor_name, data_torch)] + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip generation-related components + skip_generation_prefixes = ( + 'model.vqmodel.', + 'vqmodel.', + 'model.generation_embeddings.', + 'generation_embeddings.', + 'model.generation_aligner.', + 'generation_aligner.', + 'model.generation_head.', + 'generation_head.', + ) + if name.startswith(skip_generation_prefixes): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle aligner tensors + if name.startswith(('model.aligner.', 'aligner.')): + yield from self._map_aligner_tensor(data_torch, name) + return + + # Handle vision tensors + if name.startswith(('model.vision_model.', 'vision_model.')): + yield from super().modify_tensors(data_torch, name, bid) + return + + return diff --git a/conversion/kimi_linear.py b/conversion/kimi_linear.py new file mode 100644 index 00000000000..f2e6cda83c1 --- /dev/null +++ b/conversion/kimi_linear.py @@ -0,0 +1,223 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + +from .qwen import QwenModel + + +@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") +class KimiLinearModel(TextModel): + """Kimi-Linear model with hybrid MLA+KDA architecture""" + model_arch = gguf.MODEL_ARCH.KIMI_LINEAR + + _experts: list[dict[str, Tensor]] | None = None + + def set_vocab(self): + try: + self._set_vocab_gpt2() + return + except Exception: + pass + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokpre = self.get_vocab_base_pre(tokenizer) + + if tokpre == "kimi-k2": + # Build merges list using the approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # override eos id in config.json with tiktoken eos id + self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute] + else: + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + + def set_gguf_parameters(self): + # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # KDA & MLA params + # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv + linear_attn_config = self.hparams["linear_attn_config"] + # n_head == 0 for KDA layers, n_head > 0 for MLA layers + # full_attention_layers list will be used to distinguish layer type + _num_kv_heads = list() + _full_attn_layers = linear_attn_config["full_attn_layers"] + for il in range(self.hparams["num_hidden_layers"]): + if il + 1 in _full_attn_layers: + _num_kv_heads.append(self.hparams["num_key_value_heads"]) + else: + _num_kv_heads.append(0) + assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] + self.gguf_writer.add_head_count_kv(_num_kv_heads) + + if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None: + self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + if (kda_head_dim := linear_attn_config.get("head_dim")) is not None: + self.gguf_writer.add_kda_head_dim(kda_head_dim) + + # MLA params - use add_* methods that handle arch substitution + # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) + if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None: + self.gguf_writer.add_q_lora_rank(q_lora_rank) + # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA + kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False) + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + + # MLA head dimensions + # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim + qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") + # Rotation - use qk_rope_head_dim for Kimi + qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False) + self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim) + self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim) + v_head_dim = self.hparams.get("v_head_dim") + + # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None: + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + elif qk_nope_head_dim is not None: + n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + + # n_embd_head_v_mla = v_head_dim + if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None: + self.gguf_writer.add_value_length_mla(n_embd_head_v_mla) + elif v_head_dim is not None: + self.gguf_writer.add_value_length_mla(v_head_dim) + + # moe_intermediate_size (1024 for Kimi) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + # num_shared_experts (1 for Kimi) + self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"]) + # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") + + # Handle KDA conv1d weights + # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest + # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest + # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + # Memory layouts match: both have conv_step (d_conv) changing fastest + if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")): + # HF shape: [d_inner, d_conv] e.g. [4096, 4] + # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + if data_torch.ndim == 2: + d_inner, d_conv = data_torch.shape + # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest) + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + elif data_torch.ndim == 3: + # Already 3D [d_inner, 1, d_conv] from unsqueeze + d_inner, _, d_conv = data_torch.shape + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + + # Handle A_log: iHF stores as [1, 1, num_heads, 1] + # llama.cpp expects ggml ne = [1, num_heads, 1, 1] + # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + if name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + logger.info("Changed dt_bias to dt_proj.bias") + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + # w1: gate, w2: down, w3: up + for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), + ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), + ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: + datas: list[Tensor] = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + data_torch = torch.stack(datas, dim=0) + new_name = self.format_tensor_name(tname, bid) + yield from super().modify_tensors(data_torch, new_name, bid) + return + + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False) + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/kimivl.py b/conversion/kimivl.py new file mode 100644 index 00000000000..63b8a079b72 --- /dev/null +++ b/conversion/kimivl.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("KimiVLForConditionalGeneration") +class KimiVLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 64 * 14 # for compatibility + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_projector_scale_factor(2) + # eps is the same as pytorch's default value + assert self.hparams_vision is not None + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + + if not is_vision_tensor: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "pos_emb.weight" in name: + data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) + + if "wqkv" in name: + split_dim = 0 if "weight" in name else -1 + wq, wk, wv = data_torch.chunk(3, dim=split_dim) + yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid) + yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid) + yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("KimiK25ForConditionalGeneration") +class KimiK25Model(MmprojModel): + """Kimi-K2.5 with MoonViT3d vision encoder""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config" + + self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2])) + self.patch_size = self.hparams_vision.get("patch_size", 14) + + # Set image_size for compatibility with base class + # Use position embedding dimensions as image_size reference + pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64) + self.hparams_vision["image_size"] = pos_emb_h * self.patch_size + + def set_gguf_parameters(self): + # Base class MmprojModel.set_gguf_parameters() already writes: + # - vision_block_count, vision_head_count, vision_embedding_length + # - vision_feed_forward_length, vision_patch_size, image_mean, image_std + # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config + super().set_gguf_parameters() + assert self.hparams_vision is not None + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25) + + # Position embedding parameters (for interpolation) + self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64)) + self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64)) + self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4)) + + # Projector parameters + self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu") + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5)) + self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0]) + + # Image size limits + # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet) + in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384) + min_patches = 8 # reasonable minimum + pixels_per_patch = self.patch_size ** 2 + self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch) + self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch) + + @staticmethod + def permute(weights: Tensor, n_head: int) -> Tensor: + out_dim, in_dim = weights.shape + head_dim = out_dim // n_head + w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim) + w = w.permute(0, 2, 1, 3, 4) + return w.reshape(out_dim, in_dim) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Only process vision and projector tensors + is_vision = any(x in name for x in ["vision_tower", "mm_projector"]) + + if not is_vision: + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + assert self.hparams_vision is not None + n_head = self.hparams_vision.get("num_attention_heads", 16) + + # Permute Q/K weights/biases from interleaved to split RoPE format + # This allows using build_rope_2d at runtime without post-permutation. + if "wqkv" in name: + out_dim = data_torch.shape[0] + qkv_dim = out_dim // 3 + head_dim = qkv_dim // n_head + + if "weight" in name: + wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2 * qkv_dim, :], data_torch[2 * qkv_dim:, :] + wq = self.permute(wq, n_head) + wk = self.permute(wk, n_head) + data_torch = torch.cat([wq, wk, wv], dim=0) + elif "bias" in name: + bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2 * qkv_dim], data_torch[2 * qkv_dim:] + bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) + bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) + data_torch = torch.cat([bq, bk, bv], dim=0) + + # Temporal embeddings: (T, 1, C) โ†’ (T, C) + if "pos_emb.time_weight" in name: + T, _, C = data_torch.shape + data_torch = data_torch.reshape(T, C) + + # PatchMergerMLP tensor name mapping + # proj.0.weight โ†’ proj.linear_1.weight + # proj.2.weight โ†’ proj.linear_2.weight + if "mm_projector.proj.0." in name: + name = name.replace(".proj.0.", ".proj.linear_1.") + elif "mm_projector.proj.2." in name: + name = name.replace(".proj.2.", ".proj.linear_2.") + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/lfm2.py b/conversion/lfm2.py new file mode 100644 index 00000000000..f28fccf10f2 --- /dev/null +++ b/conversion/lfm2.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + +from .gemma import ConformerAudioModel + + +@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") +class LFM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.LFM2 + + def _add_feed_forward_length(self): + ff_dim = self.find_hparam(["block_ff_dim", "intermediate_size"]) + auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"] + ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"] + multiple_of = self.hparams["block_multiple_of"] + + if auto_adjust_ff_dim: + ff_dim = int(2 * ff_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + ff_dim = int(ffn_dim_multiplier * ff_dim) + ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + + self.gguf_writer.add_feed_forward_length(ff_dim) + + def set_gguf_parameters(self): + # set num_key_value_heads only for attention layers + self.hparams["num_key_value_heads"] = [ + self.hparams["num_key_value_heads"] if layer_type != "conv" else 0 + for layer_type in self.hparams["layer_types"] + ] + + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"]) + self._add_feed_forward_length() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if ConformerAudioModel.is_audio_tensor(name): + # skip multimodal tensors + return None + + name = name.replace("lfm.", "model.") # audio + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # conv op requires 2d tensor + if 'conv.conv' in name: + data_torch = data_torch.squeeze(1) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Lfm2Model") +class LFM2ColBertModel(LFM2Model): + model_arch = gguf.MODEL_ARCH.LFM2 + dense_tensor_name = "dense_2" + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if not name.startswith(self.dense_tensor_name): + name = "model." + name + + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # dense tensor is stored in a separate safetensors file + from safetensors.torch import load_file + tensors_file = self.dir_model / "1_Dense" / "model.safetensors" + assert tensors_file.is_file() + tensor = load_file(tensors_file)["linear.weight"] + self.gguf_writer.add_embedding_length_out(tensor.shape[0]) + yield f"{self.dense_tensor_name}.weight", tensor.clone() + + +@ModelBase.register("Lfm2MoeForCausalLM") +class LFM2MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.LFM2MOE + + def set_gguf_parameters(self): + # set num_key_value_heads only for attention layers + self.hparams["num_key_value_heads"] = [ + self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0 + for layer_type in self.hparams["layer_types"] + ] + + super().set_gguf_parameters() + + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"]) + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) + + # cache for experts weights for merging + _experts_cache: dict[int, dict[str, Tensor]] = {} + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # conv op requires 2d tensor + if 'conv.conv' in name: + data_torch = data_torch.squeeze(1) + + # merge expert weights + if 'experts' in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + expert_cache = self._experts_cache.setdefault(bid, {}) + expert_cache[name] = data_torch + expert_weights = ["w1", "w2", "w3"] + + # not enough expert weights to merge + if len(expert_cache) < n_experts * len(expert_weights): + return + + for w_name in expert_weights: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight" + datas.append(expert_cache[ename]) + del expert_cache[ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + + del self._experts_cache[bid] + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + assert not self._experts_cache + + +@ModelBase.register("Lfm2VlForConditionalGeneration") +class LFM2VLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # TODO(tarek): for dynamic resolution image_size is not specified, setting here for compatibility + self.hparams_vision["image_size"] = 256 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2) + self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"])) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0 + vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("model.vision_tower.", "vision_tower.") + name = name.replace("model.multi_modal_projector.", "multi_modal_projector.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "patch_embedding.weight" in name: + data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Lfm2AudioForConditionalGeneration") +class LFM2AudioModel(ConformerAudioModel): + has_vision_encoder = False + has_audio_encoder = True + model_name = "Lfm2AudioEncoder" + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("encoder") + + def set_gguf_parameters(self): + assert self.hparams_audio is not None + self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] + self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"] + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # skip language model tensors + if name.startswith("lfm."): + return None + + # for training only + if any(p in name for p in ["audio_loss_weight"]): + return None + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("Lfm25AudioTokenizer") +class LFM25AudioTokenizer(LFM2Model): + model_arch = gguf.MODEL_ARCH.LFM2 + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_embedding_length_out(self.hparams["output_size"]) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # skip language model tensors + if name == "istft.window" or name.startswith("emb.emb"): + return None + + if name.startswith("lin"): + name = name.replace("lin", "dense_2_out") + + return super().filter_tensors((name, gen)) diff --git a/conversion/lighton_ocr.py b/conversion/lighton_ocr.py new file mode 100644 index 00000000000..ead3200ac18 --- /dev/null +++ b/conversion/lighton_ocr.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llava import LlavaVisionModel + + +@ModelBase.register("LightOnOCRForConditionalGeneration") +class LightOnOCRVisionModel(LlavaVisionModel): + is_mistral_format = False + use_break_tok = False + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("model.vision_encoder.", "vision_tower.") + name = name.replace("model.vision_projection.", "multi_modal_projector.") + + return super().filter_tensors((name, gen)) diff --git a/conversion/llada.py b/conversion/llada.py new file mode 100644 index 00000000000..98dc9de95b3 --- /dev/null +++ b/conversion/llada.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("LLaDAModelLM") +class LLaDAModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA + undo_permute = True + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] + vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) + assert max(vocab_dict.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + # Check if it's a special token - treat special tokens as CONTROL tokens + if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + def set_vocab(self): + self._set_vocab_gpt2() + + # LLaDA specific parameters + self.gguf_writer.add_add_bos_token(True) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + # Add parameters similar to LlamaModel + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + n_heads = hparams.get("num_attention_heads", hparams.get("n_heads")) + assert n_heads is not None + rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads + self.gguf_writer.add_rope_dimension_count(rope_dim) + + # Set context length for LLaDA + context_length = self.hparams.get("max_sequence_length", 4096) + self.gguf_writer.add_context_length(context_length) + + # Set embedding length (dimension size) + embedding_length = self.hparams.get("d_model", 4096) + self.gguf_writer.add_embedding_length(embedding_length) + + # Set feed forward length (MLP hidden size) + feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) + self.gguf_writer.add_feed_forward_length(feed_forward_length) + + # LLaDA models use non-causal attention for diffusion, similar to Dream + self.gguf_writer.add_causal_attention(False) + + # LLaDA models don't shift their logits + self.gguf_writer.add_diffusion_shift_logits(False) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads")) + assert n_head is not None + n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads")) + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head) + + # LLaDA model tensors should be mapped directly since it's the base model + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") +class LLaDAMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + self.gguf_writer.add_mask_token_id(156895) + self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_diffusion_shift_logits(False) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/llama.py b/conversion/llama.py new file mode 100644 index 00000000000..fd6167bfd91 --- /dev/null +++ b/conversion/llama.py @@ -0,0 +1,314 @@ +from __future__ import annotations + +import json +import math + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register( + "LLaMAForCausalLM", + "LlamaForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + "VLlama3ForCausalLM", + "LlavaForConditionalGeneration", + "VoxtralForConditionalGeneration", + "IQuestCoderForCausalLM", + "LlamaModel") +class LlamaModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLAMA + undo_permute = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hf_arch == "VLlama3ForCausalLM": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + # Mistral consolidated format has no config.json; origin_hf_arch is HF-only. + if self.is_mistral_format: + self.origin_hf_arch = None + else: + hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + def set_vocab(self): + if self.origin_hf_arch == "GlmasrModel": + return self._set_vocab_glmedge() + + if self.is_mistral_format: + return self._set_vocab_mistral() + + path_tekken_json = self.dir_model / "tekken.json" + path_tokenizer_json = self.dir_model / "tokenizer.json" + if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): + self._set_vocab_mistral() + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if (add_prefix_space := tokenizer_config_json.get("add_prefix_space")) is not None: + self.gguf_writer.add_add_space_prefix(add_prefix_space) + if tokenizer_config_json.get("tokenizer_class") == "HybridDNATokenizer": + return self._set_vocab_hybriddna() + + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + if not self.is_mistral_format: + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): + # Mirror the BF16 Q/K RoPE permutation site in modify_tensors; the NVFP4 path bypasses it. + if self.undo_permute: + n_head = self.find_hparam(["n_heads", "num_attention_heads"], optional=True) + n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"], optional=True) + if n_head is not None: + if name.endswith("q_proj.weight"): + weight = LlamaModel.permute(weight, n_head, n_head) + scale = LlamaModel.permute(scale, n_head, n_head) + elif name.endswith("k_proj.weight"): + weight = LlamaModel.permute(weight, n_head, n_kv_head) + scale = LlamaModel.permute(scale, n_head, n_kv_head) + super()._repack_nvfp4(name, weight, scale, scale2, input_scale) + + _experts: list[dict[str, Tensor]] | None = None + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "text_model." in name: + name = name.replace("text_model.", "") # for SmolVLM + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) + n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) + + if self.hf_arch == "LlamaModel": + name = "model." + name + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4 + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("ArceeForCausalLM") +class ArceeModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.ARCEE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + +@ModelBase.register( + "Llama4ForConditionalGeneration", + "Llama4ForCausalLM", +) +class Llama4Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA4 + undo_permute = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this + self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] + self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) + if "layer_types" in self.hparams: + if all(lt == "full_attention" for lt in self.hparams["layer_types"]): + # all layers are full attention (for MobileLLM), disable swa + self.gguf_writer.add_sliding_window(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # split the gate_up into gate and up + if "gate_up_proj" in name: + name_up = name.replace("gate_up_proj", "up_proj.weight") + name_gate = name.replace("gate_up_proj", "gate_proj.weight") + dim_half = data_torch.shape[-1] // 2 + gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + return + + if name.endswith("down_proj"): + name += ".weight" + data_torch = data_torch.transpose(-1, -2) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("LlamaBidirectionalModel") +class LlamaEmbedNemotronModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA_EMBED + + +@ModelBase.register("SmolLM3ForCausalLM") +class SmolLM3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.SMOLLM3 + + +@ModelBase.register("ApertusForCausalLM") +class ApertusModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.APERTUS + undo_permute = False + + _alpha_n = {} + _alpha_p = {} + _beta = {} + _eps = {} + + def modify_tensors(self, data_torch, name, bid): + # Handle xIELU activation parameters + n_layers = self.hparams["num_hidden_layers"] + if name.endswith(".act_fn.alpha_n"): + self._alpha_n[bid] = data_torch.to("cpu").float().item() + if (len(self._alpha_n) == n_layers): + self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)]) + return + if name.endswith(".act_fn.alpha_p"): + self._alpha_p[bid] = data_torch.to("cpu").float().item() + if (len(self._alpha_p) == n_layers): + self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)]) + return + if name.endswith(".act_fn.beta"): + self._beta[bid] = data_torch.to("cpu").float().item() + if (len(self._beta) == n_layers): + self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)]) + return + if name.endswith(".act_fn.eps"): + self._eps[bid] = data_torch.to("cpu").float().item() + if (len(self._eps) == n_layers): + self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)]) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/llama4.py b/conversion/llama4.py new file mode 100644 index 00000000000..f84c7629619 --- /dev/null +++ b/conversion/llama4.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("Llama4ForConditionalGeneration") +class Llama4VisionModel(MmprojModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"]) + self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"])) + assert self.hparams["hidden_act"] == "gelu" + self.gguf_writer.add_vision_use_gelu(True) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "multi_modal_projector" not in name and "vision_model" not in name: + return None + + if "positional_embedding_vlm" in name and ".weight" not in name: + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "multi_modal_projector.linear_1" in name: + # despite the name with number postfix, this is a single fully connected layer + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch) + else: + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/llava.py b/conversion/llava.py new file mode 100644 index 00000000000..31d6e2ad80e --- /dev/null +++ b/conversion/llava.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +import json + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + +from .llama import LlamaModel + + +@ModelBase.register( + "LlavaForConditionalGeneration", # pixtral + "Mistral3ForConditionalGeneration", # mistral small 3.1 +) +class LlavaVisionModel(MmprojModel): + img_break_tok_id = -1 + use_break_tok = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams.get("model_type") == "pixtral": + # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py + self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) + if self.use_break_tok: + self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") + elif self.is_mistral_format: + # hparams is already vision config here so norm_eps is only defined in global_config. + self.hparams["norm_eps"] = self.global_config.get("norm_eps", None) + assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json" + if self.use_break_tok: + self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) + + # params.json may ship -1 placeholders (Mistral Medium 3.5) + # resolve the real id from the bundled tokenizer in that case + if self.img_break_tok_id < 0: + self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]") + else: + raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + logger.info(f"Image break token id: {self.img_break_tok_id}") + + def get_token_id(self, token: str) -> int: + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {} + for id_, token_data in added_tokens_decoder.items(): + if token_data.get("content") == token: + return int(id_) + # fallthrough to tokenizer.json + with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + for token_data in tokenizer_json["added_tokens"]: + if token_data["content"] == token: + return int(token_data["id"]) + raise ValueError(f"Token '{token}' not found in tokenizer config.") + + def get_mistral_token_id(self, token: str) -> int: + # mistral native format ships tekken.json or a versioned spm tokenizer + tekken_file = self.dir_model / "tekken.json" + if tekken_file.is_file(): + with open(tekken_file, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data.get("special_tokens", []): + if entry.get("token_str") == token: + return int(entry["rank"]) + tokenizer_json_file = self.dir_model / "tokenizer.json" + if tokenizer_json_file.is_file(): + with open(tokenizer_json_file, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data.get("added_tokens", []): + if entry.get("content") == token: + return int(entry["id"]) + raise ValueError(f"Token '{token}' not found in mistral tokenizer files.") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if hparams.get("model_type") == "pixtral": + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) + + # hidden_act + if hparams["hidden_act"] == "silu": + self.gguf_writer.add_vision_use_silu(True) + elif hparams["hidden_act"] == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + else: + raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") + + # spatial_merge_size + if "spatial_merge_size" in self.global_config: + self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = ( + self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) + ) + n_kv_head = n_head + + valid_prefixes = ( + "multi_modal_projector.", + "vision_tower.", + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ) + + if any(name.startswith(prefix) for prefix in valid_prefixes): + # process vision tensors + if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format: + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + yield from super().modify_tensors(data_torch, name, bid) + return + + embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" + if self.img_break_tok_id > 0 and embed_key in name: + logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") + # for pixtral model, we need to extract the [IMG_BREAK] token embedding + img_break_embd = data_torch[self.img_break_tok_id] + name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] + yield from super().modify_tensors(img_break_embd, name, bid) + + return # skip other tensors diff --git a/conversion/maincoder.py b/conversion/maincoder.py new file mode 100644 index 00000000000..18b625b08fb --- /dev/null +++ b/conversion/maincoder.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("MaincoderForCausalLM") +class MaincoderModel(TextModel): + model_arch = gguf.MODEL_ARCH.MAINCODER + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_rope_dimension_count(head_dim) diff --git a/conversion/mamba.py b/conversion/mamba.py new file mode 100644 index 00000000000..be0e36a29ba --- /dev/null +++ b/conversion/mamba.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +import json + +from pathlib import Path +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(TextModel): + model_arch = gguf.MODEL_ARCH.MAMBA + + def __init__(self, dir_model: Path, *args, **kwargs): + # Avoid using AutoConfig for hparams + hparams = kwargs.pop("hparams", None) + if hparams is None: + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + super().__init__(dir_model, *args, hparams=hparams, **kwargs) + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + elif (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + self._set_vocab_builtin("gpt-neox", vocab_size) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + use_dt_b_c_norm = False + # For falconmamba we do apply RMS norm on B / DT and C layers + if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): + use_dt_b_c_norm = True + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers + self.gguf_writer.add_file_type(self.ftype) + + _tok_embd = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) + + new_name = self.map_tensor_name(name) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + # [4 1 8192 1] -> [4 8192 1 1] + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + + # assuming token_embd.weight is seen before output.weight + if self._tok_embd is not None and new_name == output_name: + if torch.equal(self._tok_embd, data_torch): + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") + return + elif new_name == tok_embd_name: + self._tok_embd = data_torch + + yield from super().modify_tensors(data_torch, new_name, bid) + + +@ModelBase.register("Mamba2ForCausalLM") +class Mamba2Model(TextModel): + model_arch = gguf.MODEL_ARCH.MAMBA2 + + def __init__(self, dir_model: Path, *args, **kwargs): + # Avoid using AutoConfig for hparams + # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1 + hparams = kwargs.pop("hparams", None) + if hparams is None: + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + if "llm_config" in hparams: + hparams["text_config"] = hparams["llm_config"] + super().__init__(dir_model, *args, hparams=hparams, **kwargs) + self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) + self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model + self.n_group = self.find_hparam(["n_groups"], optional=True) or 1 + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 16 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + elif (self.dir_model / "tokenizer.model.v3").is_file(): + # mamba-codestral + raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}") + elif (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + self._set_vocab_builtin("gpt-neox", vocab_size) + + def set_gguf_parameters(self): + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128 + head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64 + + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + + # Fail early for models which don't have a block expansion factor of 2 + # TODO: does this really matter? + # skip the assertion for FalconH1 Model + if self.model_arch != gguf.MODEL_ARCH.FALCON_H1: + assert self.d_inner == 2 * self.d_model + assert self.d_inner % head_dim == 0 + + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(self.d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(self.d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim) + self.gguf_writer.add_ssm_group_count(self.n_group) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_file_type(self.ftype) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("model.backbone", "model.lm_head")): + # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2 + name = name.removeprefix("model.") + + if name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): + data_torch = data_torch.squeeze() + elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [ + gguf.MODEL_TENSOR.SSM_A, + gguf.MODEL_TENSOR.SSM_D, + ]): + # unsqueeze A to use similar shape semantics as Mamba-1 + # (D is also unsqueezed, but for more straightforward broadcast internally) + data_torch = data_torch.reshape((*data_torch.shape, 1)) + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid): + data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group)) + + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + yield (new_name, data_torch) diff --git a/conversion/mimo.py b/conversion/mimo.py new file mode 100644 index 00000000000..d4067aab4b6 --- /dev/null +++ b/conversion/mimo.py @@ -0,0 +1,295 @@ +from __future__ import annotations + +import re + +from typing import Callable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register("MiMoV2FlashForCausalLM", "MiMoV2ForCausalLM") +class MimoV2Model(TextModel): + model_arch = gguf.MODEL_ARCH.MIMO2 + + # MiMo V2-Flash, V2.5 and V2.5-Pro all ship 3 trained MTP layers under model.mtp.layers.{0,1,2}. + # The HF config does not expose the count, so it's hardcoded to match the count found in the safetensors. + _n_nextn = 3 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.block_count = self.hparams["num_hidden_layers"] + self._n_nextn + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + @staticmethod + def _tp_aware_qkv_dequant(weight: Tensor, scale_inv: Tensor, + n_q: int, n_kv: int, hd: int, vhd: int, + bs: int = 128) -> Tensor: + # MiMo-V2.5 (TP=4) and V2.5-Pro (TP=8) ship qkv_proj sharded across TP + # ranks; per rank, rows are stacked as [Q_per | K_per | V_per]. + # weight_scale_inv has ceil(rows_per_rank/bs) block-rows per rank (last + # may extend past rows_per_rank with phantom rows not in the weight). + # Naive repeat_interleave aligns rank 0 only and mis-applies scales to + # later ranks once rows_per_rank isn't a multiple of bs. + # Re-group the per-rank [Q_per|K_per|V_per] rows into a single fused + # [Q | K | V] tensor matching the un-sharded original layout. + q_size = n_q * hd + k_size = n_kv * hd + v_size = n_kv * vhd + total_rows = q_size + k_size + v_size + if weight.shape[0] != total_rows: + raise ValueError(f"qkv_proj weight rows {weight.shape[0]} != q+k+v {total_rows}") + + # detect TP from scale_inv block count, descending order so larger matches first + tp = None + for cand in (8, 4): + if total_rows % cand != 0: + continue + rpr = total_rows // cand + bpr = (rpr + bs - 1) // bs + if scale_inv.shape[0] == cand * bpr: + tp = cand + break + if tp is None: + raise ValueError( + f"qkv_proj: cannot detect TP - scale_inv rows {scale_inv.shape[0]}, " + f"q+k+v {total_rows}") + + q_per = q_size // tp + k_per = k_size // tp + v_per = v_size // tp + rows_per_rank = q_per + k_per + v_per + blocks_per_rank = (rows_per_rank + bs - 1) // bs + + scale_inv = scale_inv.float() + # per-row scale-row index: rank * blocks_per_rank + (rr_in_rank // bs) + row_idx = torch.arange(total_rows) + rr = row_idx % rows_per_rank + rank = row_idx // rows_per_rank + scale_row_idx = rank * blocks_per_rank + (rr // bs) + # gather: (total_rows, n_col_blocks) + scale_per_row_block = scale_inv[scale_row_idx] + # expand col-blocks -> cols: each block-col covers `bs` weight cols + scale_full = scale_per_row_block.repeat_interleave(bs, dim=1) + # crop to weight col count (in case last col-block isn't full) + scale_full = scale_full[:, : weight.shape[1]] + dequant = weight.float() * scale_full + + if tp == 1: + return dequant + + # Re-group per-rank [Q_per|K_per|V_per] rows into unified [Q | K | V] + qs, ks, vs = [], [], [] + for r in range(tp): + base = r * rows_per_rank + qs.append(dequant[base : base + q_per]) + ks.append(dequant[base + q_per : base + q_per + k_per]) + vs.append(dequant[base + q_per + k_per : base + rows_per_rank]) + return torch.cat(qs + ks + vs, dim=0) + + def dequant_model(self): + # Capture raw FP8 (weight, scale_inv) lambdas for qkv_proj BEFORE super + # rewrites them with the existing dequant. Replace super's lambda after + # it runs so scale_inv removal still happens via the standard path. + qkv_overrides: dict[str, tuple[Callable, Callable, int]] = {} + qc = self.hparams.get("quantization_config") + if isinstance(qc, dict) and qc.get("quant_method") == "fp8": + pat = re.compile(r"^model\.layers\.(\d+)\.self_attn\.qkv_proj\.weight_scale_inv$") + for name in list(self.model_tensors.keys()): + m = pat.match(name) + if not m: + continue + weight_name = name.removesuffix("_scale_inv") + if weight_name not in self.model_tensors: + continue + qkv_overrides[weight_name] = ( + self.model_tensors[weight_name], + self.model_tensors[name], + int(m.group(1)), + ) + + super().dequant_model() + + if not qkv_overrides: + return + + n_q = self.hparams["num_attention_heads"] + hd = self.hparams["head_dim"] + vhd = self.hparams["v_head_dim"] + hybrid = self.hparams["hybrid_layer_pattern"] + n_layer_text = self.hparams["num_hidden_layers"] + for weight_name, (w_fn, s_fn, bid) in qkv_overrides.items(): + # MTP layers (bid >= n_layer_text) use SWA-style attention dims + is_swa = True if bid >= n_layer_text else hybrid[bid] == 1 + n_kv = self.hparams["swa_num_key_value_heads" if is_swa else "num_key_value_heads"] + self.model_tensors[weight_name] = ( + lambda w_fn=w_fn, s_fn=s_fn, n_q=n_q, n_kv=n_kv, hd=hd, vhd=vhd: + MimoV2Model._tp_aware_qkv_dequant(w_fn(), s_fn(), n_q, n_kv, hd, vhd) + ) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + assert self.hparams["swa_head_dim"] == self.hparams["head_dim"] + assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"] + assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"] + assert self.hparams["topk_method"] == "noaux_tc" + + n_head_kv = self.hparams["num_key_value_heads"] + n_head_kv_swa = self.hparams["swa_num_key_value_heads"] + # Extend the per-layer pattern with SWA entries for the MTP blocks so the + # runtime arrays (sized to extended block_count) are fully populated. + hybrid = list(self.hparams["hybrid_layer_pattern"]) + [1] * self._n_nextn + n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in hybrid] + self.gguf_writer.add_head_count_kv(n_head_kv_arr) + + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_sliding_window_pattern(hybrid) + self.gguf_writer.add_value_length(self.hparams["v_head_dim"]) + self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + + rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"]) + self.gguf_writer.add_rope_dimension_count(rope_dim) + + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5)) + + v_scale = self.hparams.get("attention_value_scale") + if v_scale is not None: + self.gguf_writer.add_attn_value_scale(float(v_scale)) + + self.gguf_writer.add_nextn_predict_layers(self._n_nextn) + + _experts: list[dict[str, Tensor]] | None = None + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "attention_sink" in name and not name.endswith(".weight"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch, name, bid): + # Remap MTP/NextN tensors to additional layer slots so the standard tensor map handles them. + # HF: model.mtp.layers.{i}.foo -> model.layers.{n_layer_text + i}.foo + m = re.match(r"^model\.mtp\.layers\.(\d+)\.(.*)$", name) + if m is not None: + mtp_idx = int(m.group(1)) + assert mtp_idx < self._n_nextn, f"MTP layer index {mtp_idx} >= _n_nextn ({self._n_nextn})" + rest = m.group(2) + n_layer_text = self.hparams["num_hidden_layers"] + new_bid = n_layer_text + mtp_idx + name = f"model.layers.{new_bid}.{rest}" + bid = new_bid + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["gate_proj", "up_proj", "down_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename_to_retrieve]) + del self._experts[bid][ename_to_retrieve] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("MiMoV2ForCausalLM") +class MiMoV2VisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + hp = self.hparams_vision + + hp["image_size"] = hp.get("image_size", 560) + hp["num_attention_heads"] = hp.get("num_heads", 32) + hp["num_hidden_layers"] = hp.get("depth", 28) + + self.n_q_heads = int(hp["num_heads"]) + self.num_kv_heads = int(hp.get("num_key_value_heads", 8)) + self.head_dim = int(hp.get("qk_channels", 64)) + self.spatial_merge_size = int(hp["spatial_merge_size"]) + # MiMoV2 vision RMSNorm: HF uses getattr(config, "rms_norm_eps", 1e-6) and the + # field is absent from MiMo-V2.5's vision_config + self.rms_norm_eps = float(hp.get("rms_norm_eps", 1e-6)) + + # fullatt_block_indexes are also reflected in vit_window_attn_types as -1 + self.fullatt_block_indexes = list(hp.get("fullatt_block_indexes") or []) + self.vit_window_attn_types = list(hp.get("vit_window_attn_types") or []) + self.visual_token_window_size = int(hp.get("visual_token_window_size", -1)) + self.use_sink = bool(hp.get("use_sink", False)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MIMOVL) + self.gguf_writer.add_vision_use_silu(True) + self.gguf_writer.add_vision_head_count_kv(self.num_kv_heads) + self.gguf_writer.add_vision_spatial_merge_size(self.spatial_merge_size) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.WINDOW_SIZE, self.visual_token_window_size) + self.gguf_writer.add_vision_wa_pattern_mode(self.vit_window_attn_types) + self.gguf_writer.add_vision_attention_layernorm_eps(self.rms_norm_eps) + self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) + self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # Sinks must be F32: any sink-style softmax/mask add in ggml requires + # F32, and we fold sinks into a host-built F32 mask at encode time. + if new_name.endswith(".attn_sinks"): + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, _ = item + if not name.startswith("visual."): + return None + return super().filter_tensors(item) + + def modify_tensors(self, data_torch, name, bid): + # Conv3D patch embed: split along the temporal axis (kt=2) into two Conv2D + # weights that the existing qwen2vl-style two-Conv2D path consumes. + if name == "visual.patch_embed.proj.weight": + _, _, kt, _, _ = data_torch.shape + if kt != 2: + raise ValueError(f"unexpected temporal_patch_size: {kt}") + embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + yield (embd_name + ".weight", data_torch[:, :, 0, ...]) + yield (embd_name + ".weight.1", data_torch[:, :, 1, ...]) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/minicpm.py b/conversion/minicpm.py new file mode 100644 index 00000000000..e9a4c4a74dc --- /dev/null +++ b/conversion/minicpm.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .llama import LlamaModel +from .qwen import Qwen3_5TextModel + + +@ModelBase.register("MiniCPMForCausalLM") +class MiniCPMModel(TextModel): + model_arch = gguf.MODEL_ARCH.MINICPM + + def set_gguf_parameters(self): + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(TextModel): + model_arch = gguf.MODEL_ARCH.MINICPM3 + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + rope_dims = self.hparams["qk_rope_head_dim"] + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + +# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under +# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger +# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as +# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup. + +@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") +class MiniCPMV4_6TextModel(Qwen3_5TextModel): + model_arch = gguf.MODEL_ARCH.QWEN35 + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("model.merger."): + return None + # MTP tensors are not used at inference yet; align with Qwen3Next behaviour + if name.startswith("mtp"): + return None + + return super().filter_tensors(item) + + +@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") +class MiniCPMV4_6VisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams_vision is not None: + # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP + # positional embedding bucket grid (70 x 70), while the per-slice processing + # resolution is the preprocessor's `scale_resolution` (typically 448). + # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size` + # as the slice size and warmup resolution, so report `scale_resolution` there + # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules. + scale_resolution = self.preprocessor_config.get("scale_resolution") + if scale_resolution is not None: + self.hparams_vision["image_size"] = int(scale_resolution) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + # projector type string is consumed by clip_projector_type_from_string() in clip.cpp + # (mapped to PROJECTOR_TYPE_MINICPMV4_6). + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6) + + # ViT merger 2x2 + final merger 2x2 = 4x spatial merge per dimension; used for slice alignment + self.gguf_writer.add_vision_projector_scale_factor(4) + + # borrow wa_layer_indexes for vit_merger insertion point + insert_layer_id = int(self.global_config.get( + "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6))) + self.gguf_writer.add_vision_wa_layer_indexes([insert_layer_id]) + + # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx). + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps( + self.hparams_vision.get("layer_norm_eps", 1e-6)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # lm_head / MTP -> belong to the LM file + if name.startswith(("lm_head.", "mtp")): + return None + + return super().filter_tensors(item) diff --git a/conversion/minimax.py b/conversion/minimax.py new file mode 100644 index 00000000000..4857775cbfb --- /dev/null +++ b/conversion/minimax.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("MiniMaxM2ForCausalLM") +class MiniMaxM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.MINIMAXM2 + _experts_cache: dict[int, dict[str, Tensor]] = {} + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"])) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # merge expert weights + if 'experts' in name: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + expert_cache = self._experts_cache.setdefault(bid, {}) + expert_cache[name] = data_torch + expert_weights = ["w1", "w2", "w3"] + + # not enough expert weights to merge + if len(expert_cache) < n_experts * len(expert_weights): + return + + for w_name in expert_weights: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(expert_cache[ename]) + del expert_cache[ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + new_name = self.map_tensor_name(merged_name) + yield from super().modify_tensors(data_torch, new_name, bid) + + del self._experts_cache[bid] + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/mistral.py b/conversion/mistral.py new file mode 100644 index 00000000000..7a7d6e03933 --- /dev/null +++ b/conversion/mistral.py @@ -0,0 +1,201 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MistralTokenizerType, MistralVocab, _mistral_common_installed, _mistral_import_error_msg, gguf, logger + +from .deepseek import DeepseekV2Model +from .llama import LlamaModel + +if _mistral_common_installed: + from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import] + from mistral_common.tokens.tokenizers.sentencepiece import SentencePieceTokenizer # type: ignore[import-not-found, ty:unresolved-import] +else: + TokenizerVersion = None # type: ignore[assignment] + Tekkenizer = None # type: ignore[assignment] + SentencePieceTokenizer = None # type: ignore[assignment] + + +class MistralModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.MISTRAL3 + model_name = "Mistral" + hf_arch = "" + is_mistral_format = True + undo_permute = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # for compatibility, we use LLAMA arch for older models + # TODO: remove this once everyone migrates to newer version of llama.cpp + if "llama_4_scaling" not in self.hparams: + self.model_arch = gguf.MODEL_ARCH.LLAMA + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def dequant_model(self): + # transform quantization config into HF format + quant_config = self.hparams.get("quantization") + if quant_config is not None: + assert quant_config["qformat_weight"] == "fp8_e4m3" + self.hparams["quantization_config"] = { + "activation_scheme": "static", + "quant_method": "fp8", + "weight_block_size": None, + } + return super().dequant_model() + + @staticmethod + def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool): + assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg + assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), ( + f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" + ) + + if vocab.tokenizer.version == TokenizerVersion.v1: + return "mistral-v1" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v3" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v3-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v7" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v7-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v11: + template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja" + elif vocab.tokenizer.version == TokenizerVersion.v13: + template_file = "unsloth-mistral-Devstral-Small-2507.jinja" + else: + err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}" + if is_mistral_format: + err_message += ( + " . Please pass --disable-mistral-community-chat-template argument to the CLI " + "if you want to skip this error and use the Mistral official `mistral-common` pre-processing library." + ) + raise ValueError(err_message) + + template_path = templates_dir / template_file + if not template_path.exists(): + raise FileNotFoundError(f"Template file not found: {template_path}") + + with open(template_path, "r", encoding="utf-8") as f: + template = f.read() + + return template + + def set_gguf_parameters(self): + super().set_gguf_parameters() + MistralModel.set_mistral_config(self.gguf_writer, self.hparams) + + @staticmethod + def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict): + if "yarn" in hparams: + yarn_params = hparams["yarn"] + mscale_all_dim = 1.0 if not yarn_params["apply_scale"] else 0.0 + gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + gguf_writer.add_rope_scaling_factor(yarn_params["factor"]) + gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"]) + gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"]) + gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim) + gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"]) + + if "llama_4_scaling" in hparams: + gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"]) + + +class MistralMoeModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + model_name = "Mistral" + hf_arch = "" + is_mistral_format = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + logger.info("Using MistralMoeModel") + # remap hparams from Mistral MoE format to DeepseekV2 format + # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic + # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py + config = self.hparams + # Mistral key -> HF key + config_mapping = { + "dim": "hidden_size", + "norm_eps": "rms_norm_eps", + "n_kv_heads": "num_key_value_heads", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "hidden_dim": "intermediate_size", + } + # HF key -> (Mistral key, default value) + top_level_mapping_with_default = { + "model_type": ("model_type", "transformer"), + "hidden_act": ("activation", "silu"), + "tie_word_embeddings": ("tied_embeddings", False), + "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)), + "max_position_embeddings": ("max_position_embeddings", 128_000), + } + # mapping top-level keys + for key, new_key in config_mapping.items(): + if key in config: + config[new_key] = config[key] + for new_key, (key, default_value) in top_level_mapping_with_default.items(): + config[new_key] = config.get(key, default_value) + # mapping MoE-specific keys + moe_config_map = { + "route_every_n": "moe_layer_freq", + "first_k_dense_replace": "first_k_dense_replace", + "num_experts_per_tok": "num_experts_per_tok", + "num_experts": "n_routed_experts", + "expert_hidden_dim": "moe_intermediate_size", + "routed_scale": "routed_scaling_factor", + "num_shared_experts": "n_shared_experts", + "num_expert_groups": "n_group", + "num_expert_groups_per_tok": "topk_group", + } + moe = config["moe"] + for key, new_key in moe_config_map.items(): + if key in moe: + config[new_key] = moe[key] + # provide missing values + config["topk_method"] = None + config["norm_topk_prob"] = True + config["scoring_func"] = "softmax" + + def set_vocab(self): + self._set_vocab_mistral() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + MistralModel.set_mistral_config(self.gguf_writer, self.hparams) + yarn_params = self.hparams["yarn"] + self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"]) + + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1 + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic + if name.endswith(".qscale_act"): + name = name.replace(".qscale_act", ".input_scale") + if name.endswith(".qscale_weight"): + name = name.replace(".qscale_weight", ".weight_scale") + if ".wkv_b." in name: + name = name.replace(".wkv_b.", ".kv_b_proj.") + if ".experts." in name: + name = name.replace(".experts.", ".mlp.experts.") + name = name.replace(".w1.", ".gate_proj.") + name = name.replace(".w2.", ".down_proj.") + name = name.replace(".w3.", ".up_proj.") + name = "model." + name + + return super().filter_tensors((name, gen)) diff --git a/conversion/mistral3.py b/conversion/mistral3.py new file mode 100644 index 00000000000..af9438ae705 --- /dev/null +++ b/conversion/mistral3.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + +from .deepseek import DeepseekV2Model +from .llama import LlamaModel + + +@ModelBase.register( + "Mistral3ForConditionalGeneration", + "Ministral3ForCausalLM", +) +class Mistral3Model(TextModel): + class Ministral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.MISTRAL3 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + rope_params = self.rope_parameters + if self.hparams.get("model_type") == "ministral3": + assert rope_params, "ministral3 must have 'rope_parameters' config" + assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" + self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) + self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) + + class Mistral4Model(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.MISTRAL4 + skip_mtp = False # model contains no MTP layers, so no need to skip + merge_expert = False # experts are already stacked as 3D + + def modify_tensors(self, data_torch, name, bid): + if name.endswith(".down_proj") or name.endswith(".gate_up_proj"): + name = name + ".weight" + yield from super().modify_tensors(data_torch, name, bid) + + model_arch = gguf.MODEL_ARCH.MISTRAL3 # unused + impl: TextModel + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams.get("model_type") == "mistral4": + self.impl = Mistral3Model.Mistral4Model(*args, **kwargs) + else: + self.impl = Mistral3Model.Ministral3Model(*args, **kwargs) + + def set_vocab(self): + self.impl.set_vocab() + + def set_gguf_parameters(self): + self.impl.set_gguf_parameters() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + yield from self.impl.modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + self.impl.prepare_tensors() + + def write_vocab(self): + self.impl.write_vocab() + + def write(self): + self.impl.write() diff --git a/conversion/mpt.py b/conversion/mpt.py new file mode 100644 index 00000000000..9557ab7fa64 --- /dev/null +++ b/conversion/mpt.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("MPTForCausalLM") +class MPTModel(TextModel): + model_arch = gguf.MODEL_ARCH.MPT + + def set_vocab(self): + try: + self._set_vocab_gpt2() + except Exception: + # Fallback for SEA-LION model + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_pad_token_id(3) + self.gguf_writer.add_eos_token_id(1) + self.gguf_writer.add_unk_token_id(0) + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + if self.hparams["attn_config"]["alibi"]: + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + else: + self.gguf_writer.add_max_alibi_bias(0.0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "scales" in name: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) + + yield from super().modify_tensors(data_torch, new_name, bid) diff --git a/conversion/nemotron.py b/conversion/nemotron.py new file mode 100644 index 00000000000..dfeeb978582 --- /dev/null +++ b/conversion/nemotron.py @@ -0,0 +1,384 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf, logger + +from .granite import GraniteHybridModel + + +@ModelBase.register( + "NemotronH_Nano_VL_V2", + "RADIOModel", +) +class NemotronNanoV2VLModel(MmprojModel): + # ViT-Huge architecture parameters for RADIO v2.5-h + _vit_hidden_size = 1280 + _vit_intermediate_size = 5120 + _vit_num_layers = 32 + _vit_num_heads = 16 + + def get_vision_config(self) -> dict[str, Any] | None: + # RADIO config doesn't have standard ViT parameters, so they need to be constructed manually + vision_config = self.global_config.get("vision_config") + if vision_config is None: + return None + # Add ViT-H parameters + vision_config = { + **vision_config, + "hidden_size": self._vit_hidden_size, + "intermediate_size": self._vit_intermediate_size, + "num_hidden_layers": self._vit_num_layers, + "num_attention_heads": self._vit_num_heads, + "image_size": self.global_config.get("force_image_size", 512), + } + return vision_config + + def set_gguf_parameters(self): + if "image_mean" not in self.preprocessor_config: + self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406] + if "image_std" not in self.preprocessor_config: + self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225] + + super().set_gguf_parameters() + hparams = self.global_config + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL) + self.gguf_writer.add_vision_attention_layernorm_eps(1e-6) + self.gguf_writer.add_vision_use_gelu(True) + downsample_ratio = hparams.get("downsample_ratio", 0.5) + self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name or "pos_embed" in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if "input_conditioner" in name: + return None + + # mtmd does not support video yet so skip tensors related to video. + if "radio_model.model.patch_generator.video_embedder" in name: + return None + + if not name.startswith("vision_model.radio_model.model.") and not name.startswith("mlp1."): + return None + + if "patch_generator.pos_embed" in name: + if not name.endswith(".weight"): + name += ".weight" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it + if "patch_generator.pos_embed" in name: + # Downsample position embeddings for fixed 512x512 image size + import torch.nn.functional as F + n_embd = self.hparams["hidden_size"] + image_size = self.global_config.get("force_image_size", 512) + patch_size = self.hparams["patch_size"] + target_patches_per_side = image_size // patch_size # 32 + max_patches_per_side = int((data_torch.shape[1]) ** 0.5) # 128 + if target_patches_per_side != max_patches_per_side: + # Reshape to grid, interpolate, flatten back + data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd) + data_torch = data_torch.permute(0, 3, 1, 2).float() # [1, n_embd, 128, 128] + data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side), + mode='bilinear', align_corners=True) + data_torch = data_torch.permute(0, 2, 3, 1) # [1, 32, 32, n_embd] + data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd) + + # Reshape linear patch embedding to conv2d format for ggml_conv_2d + # From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size] + if "patch_generator.embedder" in name: + patch_size = self.hparams["patch_size"] + n_embd = self.hparams["hidden_size"] + data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NemotronForCausalLM") +class NemotronModel(TextModel): + model_arch = gguf.MODEL_ARCH.NEMOTRON + + def set_vocab(self): + self._set_vocab_sentencepiece() + self.gguf_writer.add_pad_token_id(0) + self.gguf_writer.add_unk_token_id(1) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + + # * Partial RoPE + rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + + # * RopeScaling for Nemotron + if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + else: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side + # model.layers.{l}.input_layernorm.weight + # model.layers.{l}.post_attention_layernorm.weight + # model.norm.weight + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NemotronHForCausalLM") +class NemotronHModel(GraniteHybridModel): + """Hybrid mamba2/attention model from NVIDIA""" + model_arch = gguf.MODEL_ARCH.NEMOTRON_H + is_moe: bool = False + + def __init__(self, *args, **kwargs): + # We have to determine the correct model architecture (MoE vs non-MoE) before + # calling the parent __init__. This is because the parent constructor + # uses self.model_arch to build the tensor name map, and all MoE-specific + # mappings would be missed if it were called with the default non-MoE arch. + hparams = ModelBase.load_hparams(args[0], self.is_mistral_format) + has_moe_params = ( + "num_experts_per_tok" in hparams + or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"]) + ) + if has_moe_params: + self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE + self.is_moe = True + + super().__init__(*args, **kwargs) + + # Save the top-level head_dim for later + self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim")) + assert self.head_dim is not None, "Could not find the attention head dim in config" + + # Don't use expand to calculate d_inner + self.d_inner = self.find_hparam(["num_heads"]) * self.d_model + + # Update the ssm / attn / mlp layers + # M: Mamba2, *: Attention, -: MLP + # MoE: + # M: Mamba2, *: Attention, E: Expert + pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") + if pattern is None: + self._ssm_layers = [] + self._mlp_layers = [] + elif isinstance(pattern, str): + self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"] + self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")] + else: + self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"] + self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"] + + def get_attn_layers(self): + pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") + if pattern is None: + return [] + assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!" + if isinstance(pattern, str): + return [i for i, val in enumerate(pattern) if val == "*"] + + return [i for i, val in enumerate(pattern) if val == "attention"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + head_dim = self.head_dim + if head_dim is None: + raise ValueError("Could not find the attention head dim in config") + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + # Set feed_forward_length + # NOTE: This will trigger an override warning. This is preferable to + # duplicating all the parent logic + if not self.is_moe: + n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) + self.gguf_writer.add_feed_forward_length([ + n_ff if i in self._mlp_layers else 0 for i in range(self.block_count) + ]) + else: + moe_intermediate_size = self.hparams["moe_intermediate_size"] + self.gguf_writer.add_feed_forward_length([ + moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count) + ]) + self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"]) + self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_group_count(self.hparams["n_group"]) + + # number of experts used per token (top-k) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + if (latent_size := self.hparams.get("moe_latent_size")) is not None: + self.gguf_writer.add_moe_latent_size(latent_size) + + def set_vocab(self): + # The NemotronH config uses pattern characters (e.g. '-') that may not + # be supported by the installed transformers version. AutoTokenizer + # internally calls AutoConfig which triggers this parsing failure. + # Using trust_remote_code=True to load the model's own config class. + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + # Pad vocab size (from Mamba2Model/GraniteHybridModel) + self.hparams["pad_vocab_size_multiple"] = 8 # Setting this here since GraniteHybridModel.set_vocab() isn't being invoked now. + # From Mamba2Model.set_vocab(): + vocab_size = self.hparams["vocab_size"] + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + # From TextModel.set_vocab_gpt2(): + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + # The tokenizer _does_ add a BOS token (via post_processor type + # TemplateProcessing) but does not set add_bos_token to true in the + # config, so we need to explicitly override it here. + if not self.is_moe: + self.gguf_writer.add_add_bos_token(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.is_moe and bid is not None: + # Skip Multi-Token Prediction (MTP) tensors. These are used for + # for speculative decoding but we don't include them in this model + # conversion. See https://github.com/ggml-org/llama.cpp/pull/18886 + if name.startswith("mtp."): + logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}") + return + + if name.endswith("mixer.gate.e_score_correction.bias"): + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + + if name.endswith("mixer.dt_bias"): + new_name = name.replace("dt_bias", "dt.bias") + yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) + return + + if name.endswith("mixer.conv1d.weight"): + squeezed_data = data_torch.squeeze() + yield from ModelBase.modify_tensors(self, squeezed_data, name, bid) + return + + if name.endswith("mixer.A_log"): + transformed_data = -torch.exp(data_torch) + reshaped_data = transformed_data.squeeze().reshape(-1, 1) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) + return + + if name.endswith("mixer.D"): + reshaped_data = data_torch.squeeze().reshape(-1, 1) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) + return + + if name.endswith("mixer.norm.weight"): + reshaped_data = data_torch.reshape(self.n_group, -1) + yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) + return + + if name.find("mixer.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 2: + # merge the experts into a single tensor + for w_name in ["down_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/olmo.py b/conversion/olmo.py new file mode 100644 index 00000000000..1664c30e402 --- /dev/null +++ b/conversion/olmo.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + +from .llama import LlamaModel + + +@ModelBase.register("OlmoForCausalLM") +@ModelBase.register("OLMoForCausalLM") +class OlmoModel(TextModel): + model_arch = gguf.MODEL_ARCH.OLMO + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_eps(1e-5) + clip_qkv = self.hparams.get("clip_qkv") + if clip_qkv is not None: + self.gguf_writer.add_clamp_kqv(clip_qkv) + + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("SeedOssForCausalLM") +class SeedOssModel(TextModel): + model_arch = gguf.MODEL_ARCH.SEED_OSS + + +@ModelBase.register("Olmo2ForCausalLM") +@ModelBase.register("Olmo3ForCausalLM") +class Olmo2Model(TextModel): + model_arch = gguf.MODEL_ARCH.OLMO2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + if "sliding_window" in self.hparams: + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + sliding_window_pattern = [] + if "layer_types" in self.hparams: + sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]] + else: + # Olmo2 does not use sliding window attention. + # Olmo3 defaults to using sliding window for all layers except every 4th. + for i in range(self.hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % 4 != 0) + + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + +@ModelBase.register("OlmoeForCausalLM") +class OlmoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.OLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/openelm.py b/conversion/openelm.py new file mode 100644 index 00000000000..ecc746dc348 --- /dev/null +++ b/conversion/openelm.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import Any, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("OpenELMForCausalLM") +class OpenELMModel(TextModel): + model_arch = gguf.MODEL_ARCH.OPENELM + + @staticmethod + def _make_divisible(v: float | int, divisor: int) -> int: + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 + new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] + ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] + self._n_embd: int = self.hparams["model_dim"] + self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] + self._num_query_heads: list[int] = self.hparams["num_query_heads"] + self._ffn_dims: list[int] = [ + OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) + for multiplier in ffn_multipliers + ] + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) + + # Uses the tokenizer from meta-llama/Llama-2-7b-hf + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) + + def set_gguf_parameters(self): + n_embd = self._n_embd + head_dim = self.hparams["head_dim"] + rot_pct = 1.0 + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_query_heads) + assert self.block_count == len(self._ffn_dims) + + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_context_length"]) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_head_count(self._num_query_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) + # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 + self.gguf_writer.add_layer_norm_rms_eps(1e-6) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + if "n_layers" in keys: + return self.hparams["num_transformer_layers"] + + return super().find_hparam(keys, optional) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + # split ff + if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": + ff_dim = self._ffn_dims[bid] + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) + return + + yield (self.map_tensor_name(name), data_torch) diff --git a/conversion/orion.py b/conversion/orion.py new file mode 100644 index 00000000000..8dfceeed1f7 --- /dev/null +++ b/conversion/orion.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("OrionForCausalLM") +class OrionModel(TextModel): + model_arch = gguf.MODEL_ARCH.ORION + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) diff --git a/conversion/pangu.py b/conversion/pangu.py new file mode 100644 index 00000000000..42016ba0286 --- /dev/null +++ b/conversion/pangu.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import json + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("PanguEmbeddedForCausalLM") +class PanguEmbeddedModel(TextModel): + model_arch = gguf.MODEL_ARCH.PANGU_EMBED + + def set_vocab(self): + self._set_vocab_sentencepiece() + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + # PanguEmbedded's hparam loaded from config.json without head_dim + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if hparams.get("head_dim") is None: + self.gguf_writer.add_key_length(rope_dim) + self.gguf_writer.add_value_length(rope_dim) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "lm_head.weight": + if self.hparams.get("tie_word_embeddings", False): + logger.info("Skipping tied output layer 'lm_head.weight'") + return + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/phi.py b/conversion/phi.py new file mode 100644 index 00000000000..5e0d72847aa --- /dev/null +++ b/conversion/phi.py @@ -0,0 +1,390 @@ +from __future__ import annotations + +import json +import math + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + + +@ModelBase.register("PhiForCausalLM") +class Phi2Model(TextModel): + model_arch = gguf.MODEL_ARCH.PHI2 + + def set_gguf_parameters(self): + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + +@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV") +class Phi3MiniModel(TextModel): + model_arch = gguf.MODEL_ARCH.PHI3 + + def set_vocab(self): + # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + tokenizer_class = tokenizer_config_json['tokenizer_class'] + if tokenizer_class == 'GPT2Tokenizer': + return self._set_vocab_gpt2() + + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise ValueError(f'Error: Missing {tokenizer_path}') + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) + for token_id, foken_data in added_tokens_decoder.items(): + token_id = int(token_id) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + tokenizer_file = self.dir_model / 'tokenizer.json' + if tokenizer_file.is_file(): + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + added_tokens = tokenizer_json.get("added_tokens", []) + for foken_data in added_tokens: + token_id = int(foken_data["id"]) + token = foken_data["content"].encode("utf-8") + if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') + tokens[token_id] = token + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + if foken_data.get("special"): + toktypes[token_id] = SentencePieceTokenTypes.CONTROL + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) + rms_eps = self.find_hparam(["rms_norm_eps"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head + + self.gguf_writer.add_context_length(max_pos_embds) + self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_rms_eps(rms_eps) + self.gguf_writer.add_rope_dimension_count(rope_dims) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"]) + self.gguf_writer.add_file_type(self.ftype) + sliding_window = self.hparams.get("sliding_window") + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: + sliding_window = 0 + self.gguf_writer.add_sliding_window(sliding_window) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head + + # write rope scaling for long context (128k) model + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is None: + return + + scale = max_pos_embds / orig_max_pos_embds + + rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() + if len(rope_scaling_type) == 0: + raise KeyError('Missing the required key rope_scaling.type') + + if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': + attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 + elif rope_scaling_type == 'yarn': + attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 + else: + raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') + + self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + +@ModelBase.register("Phi4ForCausalLMV") +class Phi4VisionMmprojModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + + self.vision_total_layers = int(self.find_vparam(self.n_block_keys)) + if self.vision_total_layers < 2: + raise ValueError( + f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}" + ) + + # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and + # drop post-layernorm/head weights. This makes the GGUF runtime output match + # the feature map consumed by the patched siglip.cpp Phi-4 projector path. + self.vision_export_layers = self.vision_total_layers - 1 + self.vision_last_layer_idx = self.vision_total_layers - 1 + + for key in self.n_block_keys: + if key in self.hparams_vision: + self.hparams_vision[key] = self.vision_export_layers + break + + self.block_count = self.vision_export_layers + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) + + patch_size = self.preprocessor_config.get("patch_size") + if patch_size is None: + raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json") + + self.hparams_vision["patch_size"] = patch_size + + pos_emb_name = next( + ( + name for name in self.model_tensors + if name.endswith("vision_model.embeddings.position_embedding.weight") + ), + None, + ) + if pos_emb_name is None: + raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight") + + pos_emb_shape = self.model_tensors[pos_emb_name]().shape + base_grid_tokens = int(pos_emb_shape[0]) + grid_side = math.isqrt(base_grid_tokens) + if grid_side * grid_side != base_grid_tokens: + raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}") + + self.hparams_vision["image_size"] = grid_side * patch_size + + min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches")) + max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches")) + if min_num_patches is None or max_num_patches is None: + raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches") + + self.min_pixels = int(min_num_patches) * patch_size * patch_size + self.max_pixels = int(max_num_patches) * patch_size * patch_size + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4) + self.gguf_writer.add_vision_min_pixels(self.min_pixels) + self.gguf_writer.add_vision_max_pixels(self.max_pixels) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("model.vision_tower.vision_tower.", "vision_tower.") + + if not name.startswith(("vision_tower.", "model.mm_projector.", "mm_projector.")): + return None + + if ".vision_model.head." in name: + return None + + if ".vision_model.post_layernorm." in name: + return None + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("vision_tower."): + if bid is not None and bid == self.vision_last_layer_idx: + return + + if name.endswith("vision_model.embeddings.patch_embedding.weight"): + assert self.hparams_vision is not None + if data_torch.ndim != 2: + raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}") + + patch_area = self.hparams_vision["patch_size"] ** 2 + in_features = data_torch.shape[1] + if in_features % patch_area != 0: + raise ValueError( + f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}" + ) + + num_channels = in_features // patch_area + patch_size = self.hparams_vision["patch_size"] + data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels) + data_torch = data_torch.permute(0, 3, 1, 2) + + yield from super().modify_tensors(data_torch, name, bid) + return + + if name.startswith(("model.mm_projector.", "mm_projector.")): + local_name = name + local_name = local_name.replace("model.mm_projector.", "") + local_name = local_name.replace("mm_projector.", "") + + if not (local_name.startswith("0.") or local_name.startswith("2.")): + return + + suffix = ".bias" if local_name.endswith(".bias") else ".weight" + mm_idx = int(local_name.split(".", maxsplit=1)[0]) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch) + return + + return + + +@ModelBase.register("PhiMoEForCausalLM") +class PhiMoeModel(Phi3MiniModel): + model_arch = gguf.MODEL_ARCH.PHIMOE + + _experts: list[dict[str, Tensor]] | None = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) + self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/pixtral.py b/conversion/pixtral.py new file mode 100644 index 00000000000..acd9ce1cf70 --- /dev/null +++ b/conversion/pixtral.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import Sequence + +from .base import gguf + +from .llava import LlavaVisionModel + + +class PixtralModel(LlavaVisionModel): + model_name = "Pixtral" + hf_arch = "" + is_mistral_format = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) + + self.gguf_writer.add_vision_attention_layernorm_eps( + self.find_hparam(["norm_eps"]) + ) + self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) + + self.gguf_writer.add_vision_use_silu(True) + + # spatial_merge_size + if self.find_vparam(["mm_projector_id"], optional=True) == "patch_merge": + self.gguf_writer.add_vision_spatial_merge_size( + self.find_vparam(["spatial_merge_size"]) + ) + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + if name == "vision_language_adapter.w_in.weight": + return "mm.1.weight" + elif name == "vision_language_adapter.w_in.bias": + return "mm.1.bias" + elif name == "vision_language_adapter.w_out.weight": + return "mm.2.weight" + elif name == "vision_language_adapter.w_out.bias": + return "mm.2.bias" + return super().map_tensor_name(name, try_suffixes) diff --git a/conversion/plamo.py b/conversion/plamo.py new file mode 100644 index 00000000000..c4bcbdf06bc --- /dev/null +++ b/conversion/plamo.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +import json + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("PlamoForCausalLM") +class PlamoModel(TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(4096) # not in config.json + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong + self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def shuffle_attn_q_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(8, 5, 128, 5120) + data_torch = torch.permute(data_torch, (1, 0, 2, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def shuffle_attn_output_weight(self, data_torch): + assert data_torch.size() == (5120, 5120) + data_torch = data_torch.reshape(5120, 8, 5, 128) + data_torch = torch.permute(data_torch, (0, 2, 1, 3)) + data_torch = torch.reshape(data_torch, (5120, 5120)) + return data_torch + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM") +class Plamo2Model(TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO2 + + def set_vocab(self): + self._set_vocab_plamo() + + def set_gguf_parameters(self): + hparams = self.hparams + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # Which layers are Mamba layers + # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer) + # This logic matches modeling_plamo.py's is_mamba function + mamba_step = hparams.get("mamba_step", 2) + mamba_enabled = hparams.get("mamba_enabled", True) + num_key_value_heads = [] + num_attention_heads = [] + + if mamba_enabled: + for i in range(self.block_count): + if self.block_count <= (mamba_step // 2): + # use attention in last layer + is_mamba = (i != self.block_count - 1) + else: + is_mamba = (i % mamba_step) != (mamba_step // 2) + if is_mamba: + num_key_value_heads.append(0) + num_attention_heads.append(0) + else: + num_key_value_heads.append(hparams.get("num_key_value_heads", 4)) + num_attention_heads.append(hparams.get("num_attention_heads", 32)) + + if num_key_value_heads and num_attention_heads: + self.gguf_writer.add_head_count_kv(num_key_value_heads) + self.gguf_writer.add_head_count(num_attention_heads) + + self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048)) + self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096)) + self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128)) + self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128)) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06)) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000)) + + # Mamba parameters + self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64)) + self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4)) + self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64)) + intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128) + self.gguf_writer.add_ssm_inner_size(intermediate_size) + self.gguf_writer.add_ssm_group_count(0) + + # MLP feed forward parameters (for attention layers) + self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312)) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + elif name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + elif name.endswith(".dt_norm_weight"): + name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight" + elif name.endswith(".B_norm_weight"): + name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight" + elif name.endswith(".C_norm_weight"): + name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight" + elif name.endswith(".k_weight"): + name = name.rpartition(".k_weight")[0] + ".k.weight" + elif name.endswith(".q_weight"): + name = name.rpartition(".q_weight")[0] + ".q.weight" + elif name.endswith(".conv1d.weight"): + data_torch = torch.squeeze(data_torch) # remove (, 1, ) + assert data_torch.ndim == 2 + elif name.endswith(".pre_mixer_norm.weight"): + data_torch += 1.0 + elif name.endswith(".post_mixer_norm.weight"): + data_torch += 1.0 / 5 + elif name.endswith(".pre_mlp_norm.weight"): + data_torch += 1.0 + elif name.endswith(".post_mlp_norm.weight"): + data_torch += 1.0 / (5**1.5) + elif name.endswith(".norm.weight"): + data_torch += 1.0 + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") +class Plamo3Model(TextModel): + model_arch = gguf.MODEL_ARCH.PLAMO3 + + def set_vocab(self): + self._set_vocab_plamo() + + tokenizer_config_path = self.dir_model / "tokenizer_config.json" + tokenizer_config = {} + + if tokenizer_config_path.is_file(): + with open(tokenizer_config_path, encoding="utf-8") as f: + tokenizer_config = json.load(f) + + chat_template = tokenizer_config.get("chat_template") + chat_template_jinja = self.dir_model / "chat_template.jinja" + + if chat_template_jinja.is_file(): + with open(chat_template_jinja, encoding="utf-8") as f: + chat_template = f.read() + + if chat_template: + self.gguf_writer.add_chat_template(chat_template) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None: + self.gguf_writer.add_sliding_window(sliding_window) + self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + + if name.endswith(".pre_mixer_norm.weight"): + data_torch = data_torch + 1.0 + elif name.endswith(".post_mixer_norm.weight"): + data_torch = data_torch + 1.0 / 5 + elif name.endswith(".pre_mlp_norm.weight"): + data_torch = data_torch + 1.0 + elif name.endswith(".post_mlp_norm.weight"): + data_torch = data_torch + 1.0 / (5**1.5) + elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")): + data_torch = data_torch + 1.0 + elif name.endswith(".norm.weight"): + data_torch = data_torch + 1.0 + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/plm.py b/conversion/plm.py new file mode 100644 index 00000000000..3fde487085b --- /dev/null +++ b/conversion/plm.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("PLMForCausalLM") +class PLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.PLM + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["v_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + def prepare_tensors(self): + super().prepare_tensors() diff --git a/conversion/qwen.py b/conversion/qwen.py new file mode 100644 index 00000000000..7eb135c832d --- /dev/null +++ b/conversion/qwen.py @@ -0,0 +1,627 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("QWenLMHeadModel") +class QwenModel(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN + + @staticmethod + def token_bytes_to_string(b): + from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] + byte_encoder = bytes_to_unicode() + return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) + + @staticmethod + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + + def set_vocab(self): + self._set_vocab_qwen() + + +@ModelBase.register( + "Qwen2Model", + "Qwen2ForCausalLM", + "Qwen2AudioForConditionalGeneration", + "KORMoForCausalLM", + "AudioFlamingo3ForConditionalGeneration", + "DotsOCRForCausalLM", +) +class Qwen2Model(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.hf_arch == "Qwen2Model": + name = f"model.{name}" # map to Qwen2ForCausalLM tensors + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN2MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: + self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) + logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # handle aggregated expert tensors + # GGUF stores dimensions reversed from PyTorch, so: + # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} + # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) + # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down + if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): + mapped = f"{name}.weight" if not name.endswith(".weight") else name + # HF: [n_expert, n_embd, n_ff] -> GGML: {n_ff, n_embd, n_expert} + yield from super().modify_tensors(data_torch, mapped, bid) + return + + if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): + if data_torch.ndim < 3 or data_torch.shape[-2] % 2 != 0: + raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") + # HF: [n_expert, 2*n_ff, n_embd] -> split on dim=-2 + n_ff = data_torch.shape[-2] // 2 + gate = data_torch[..., :n_ff, :].contiguous() + up = data_torch[..., n_ff:, :].contiguous() + # gate/up: [n_expert, n_ff, n_embd] -> GGML: {n_embd, n_ff, n_expert} + base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") + mapped_gate = f"{base_name}.gate_proj.weight" + mapped_up = f"{base_name}.up_proj.weight" + yield from super().modify_tensors(gate, mapped_gate, bid) + yield from super().modify_tensors(up, mapped_up, bid) + return + + if name.find("experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model") +class Qwen3Model(Qwen2Model): + model_arch = gguf.MODEL_ARCH.QWEN3 + + # extra logic for rerank models + is_rerank: bool = False + is_tied_embeddings: bool = False + token_false_id: int | None = None + token_true_id: int | None = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # track for intern-s1-mini + hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + if self._is_qwen3_reranker(): + self._find_rerank_config() + + def _is_qwen3_reranker(self) -> bool: + readme_path = self.dir_model / "README.md" + readme_text = "" + if readme_path.exists(): + with readme_path.open("r", encoding="utf-8") as f: + readme_text = f.read() + + name_hints = [ + str(self.dir_model.name), + str(self.hparams.get("_name_or_path", "")), + str(self.hparams.get("model_type", "")), + str(self.origin_hf_arch or ""), + ] + name_hints = [hint.lower() for hint in name_hints if hint] + + if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower(): + return True + + if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints): + return True + + return "sequenceclassification" in (self.origin_hf_arch or "").lower() + + def set_vocab(self): + # deal with intern-s1-mini + if self.origin_hf_arch == 'InternS1ForConditionalGeneration': + self._set_vocab_interns1() + return + + super().set_vocab() + + def _find_rerank_config(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + + self.is_rerank = True + self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False) + self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment] + self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment] + self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute] + + assert self.token_false_id is not None and self.token_true_id is not None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.is_rerank: + self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK) + self.gguf_writer.add_classifier_output_labels(["yes", "no"]) + self.gguf_writer.add_chat_template([{ + "name": "rerank", + "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" + "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" + "<|im_start|>assistant\n<think>\n\n</think>\n\n" + }]) + + def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: + # extract "yes" and "no" tokens from the output lm_head tensor + false_row = data_torch[self.token_false_id] + true_row = data_torch[self.token_true_id] + return torch.stack([true_row, false_row], dim=0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if self.is_rerank: + is_tied_head = self.is_tied_embeddings and "embed_tokens" in name + is_real_head = not self.is_tied_embeddings and "lm_head" in name + if is_tied_head or is_real_head: + cls_out_head = ( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", + self._get_cls_out_tensor(data_torch), + ) + yield cls_out_head + if is_tied_head: + yield from super().modify_tensors(data_torch, name, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3MoeForCausalLM") +class Qwen3MoeModel(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3MOE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams = ModelBase.load_hparams(self.dir_model, False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + def set_vocab(self): + # deal with intern-s1 + if self.origin_hf_arch == 'InternS1ForConditionalGeneration': + self._set_vocab_interns1() + return + + super().set_vocab() + + +@ModelBase.register("Qwen3NextForCausalLM") +class Qwen3NextModel(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3NEXT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"]) + self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"]) + self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"]) + self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"]) + self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"]) + self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4)) + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25))) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("mtp"): + # ignore MTP layers for now + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + elif name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + elif "conv1d" in name: + data_torch = data_torch.squeeze() + elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"): + data_torch = data_torch + 1 + + if "in_proj_qkvz.weight" in name: + # original order: [q, k, v, z] * head_count + # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count] + head_k_dim = self.hparams["linear_key_head_dim"] + head_v_dim = self.hparams["linear_value_head_dim"] + num_v_heads = self.hparams["linear_num_value_heads"] + num_k_heads = self.hparams["linear_num_key_heads"] + hidden_size = self.hparams["hidden_size"] + split_arg_list_qkvz = [ + head_k_dim, # q partition + head_k_dim, # k partition + (num_v_heads // num_k_heads * head_v_dim), # v partition + (num_v_heads // num_k_heads * head_v_dim), # z partition + ] + # view as (n_embd, head_count, [q+k+v+z]) + data_torch = data_torch.permute(1, 0).contiguous() + data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz)) + # split into q, k, v, z + q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1) + # flatten dim + head_count + q = q.contiguous().view(hidden_size, -1) + k = k.contiguous().view(hidden_size, -1) + v = v.contiguous().view(hidden_size, -1) + z = z.contiguous().view(hidden_size, -1) + # stack back + qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous() + z = z.permute(1, 0).contiguous() + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("RND1") +class RND1Model(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.RND1 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # RND1 specific parameters + # RND1 uses bidirectional attention + self.gguf_writer.add_causal_attention(False) + + if (mask_token_id := self.hparams.get("mask_token_id")) is not None: + self.gguf_writer.add_mask_token_id(mask_token_id) + + +class _LinearAttentionVReorderBase(Qwen3NextModel): + model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses + """reorders V heads from grouped to tiled order for ggml broadcast + + see https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306 + + Linear attention may has num_k_heads < num_v_heads. The HF weights store + V heads grouped by K head: [G0_v0..v{r-1}, G1_v0..v{r-1}, ...]. + ggml binary ops use tiled broadcast: [K0, K1, ..., K0, K1, ...]. + We reorder V heads to tiled order so ggml_repeat can replace the expensive + interleaved repeat: [G0_v0, G1_v0, ..., G0_v1, G1_v1, ...]. + """ + + @staticmethod + def _reorder_v_heads(tensor: Tensor, dim: int, num_k_heads: int, num_v_per_k: int, head_dim: int) -> Tensor: + """Reorder V heads from grouped (by K head) to tiled order along the given dimension.""" + shape = list(tensor.shape) + if dim < 0: + dim += len(shape) + new_shape = shape[:dim] + [num_k_heads, num_v_per_k, head_dim] + shape[dim + 1:] + tensor = tensor.reshape(*new_shape) + perm = list(range(len(new_shape))) + perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim] + return tensor.permute(*perm).contiguous().reshape(*shape) + + def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]: + if not name.endswith(( + ".linear_attn.in_proj_qkv.weight", + ".linear_attn.in_proj_z.weight", + ".linear_attn.in_proj_a.weight", + ".linear_attn.in_proj_b.weight", + ".linear_attn.out_proj.weight", + )): + return weight, scale + + num_k_heads = self.hparams["linear_num_key_heads"] + num_v_heads = self.hparams["linear_num_value_heads"] + head_k_dim = self.hparams["linear_key_head_dim"] + head_v_dim = self.hparams["linear_value_head_dim"] + num_v_per_k = num_v_heads // num_k_heads + + def unpack_nibbles(qs: Tensor) -> Tensor: + lo = torch.bitwise_and(qs, 0x0F) + hi = torch.bitwise_right_shift(qs, 4) + return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2) + + def pack_nibbles(codes: Tensor) -> Tensor: + codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2) + lo = torch.bitwise_and(codes[..., 0], 0x0F) + hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4) + return torch.bitwise_or(lo, hi).contiguous() + + def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]: + assert qs.ndim >= 2 + assert scales.ndim >= 2 + + k = qs.shape[-1] * 2 + assert col_perm.numel() == k + assert k % 16 == 0 + + group_cols = col_perm.reshape(-1, 16) + group_starts = group_cols[:, 0] + expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype) + assert torch.equal(group_cols, expected) + assert torch.all(group_starts % 16 == 0) + + group_perm = (group_starts // 16).to(dtype=torch.long) + expected_groups = torch.arange(scales.shape[-1], dtype=torch.long) + assert group_perm.numel() == scales.shape[-1] + assert torch.equal(torch.sort(group_perm).values, expected_groups) + + codes = unpack_nibbles(qs) + codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long)) + qs = pack_nibbles(codes) + scales = scales.index_select(-1, group_perm.to(device=scales.device)) + return qs, scales + + def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]: + row_perm = self._reorder_v_heads( + torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1), + 0, num_k_heads, num_v_per_k, head_dim, + ).squeeze(-1) + return ( + qs.index_select(0, row_perm.to(device=qs.device)), + scales.index_select(0, row_perm.to(device=scales.device)), + ) + + if name.endswith(".linear_attn.in_proj_qkv.weight"): + q_dim = head_k_dim * num_k_heads + k_dim = head_k_dim * num_k_heads + q = weight[:q_dim] + k = weight[q_dim:q_dim + k_dim] + v = weight[q_dim + k_dim:] + q_scale = scale[:q_dim] + k_scale = scale[q_dim:q_dim + k_dim] + v_scale = scale[q_dim + k_dim:] + v, v_scale = reorder_rows(v, v_scale, head_v_dim) + return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0) + + if name.endswith(".linear_attn.in_proj_z.weight"): + weight, scale = reorder_rows(weight, scale, head_v_dim) + elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")): + weight, scale = reorder_rows(weight, scale, 1) + elif name.endswith(".linear_attn.out_proj.weight"): + col_perm = self._reorder_v_heads( + torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0), + 1, num_k_heads, num_v_per_k, head_v_dim, + ).squeeze(0) + weight, scale = apply_col_perm(weight, scale, col_perm) + + return weight, scale + + def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): + weight, scale = self._transform_nvfp4_weight(name, weight, scale) + super()._repack_nvfp4(name, weight, scale, scale2, input_scale) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_k_heads = self.hparams.get("linear_num_key_heads", 0) + num_v_heads = self.hparams.get("linear_num_value_heads", 0) + + if num_k_heads > 0 and num_v_heads > 0 and num_k_heads != num_v_heads and "linear_attn." in name: + head_k_dim = self.hparams["linear_key_head_dim"] + head_v_dim = self.hparams["linear_value_head_dim"] + num_v_per_k = num_v_heads // num_k_heads + + if ".in_proj_qkv." in name: + # QKV weight: reorder only the V rows + q_dim = head_k_dim * num_k_heads + k_dim = head_k_dim * num_k_heads + q = data_torch[:q_dim] + k = data_torch[q_dim:q_dim + k_dim] + v = data_torch[q_dim + k_dim:] + v = self._reorder_v_heads(v, 0, num_k_heads, num_v_per_k, head_v_dim) + data_torch = torch.cat([q, k, v], dim=0) + + elif ".in_proj_z." in name: + # Z gate weight: reorder rows (num_v_heads * head_v_dim) + data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, head_v_dim) + + elif ".in_proj_b." in name or ".in_proj_a." in name: + # Beta/Alpha weight: reorder rows (num_v_heads, head_dim=1) + data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, 1) + + elif ".A_log" in name or ".dt_bias" in name or ".dt_proj" in name: + # A_log / dt_bias: 1D parameters with num_v_heads elements + if data_torch.ndim == 1: + data_torch = self._reorder_v_heads( + data_torch.unsqueeze(-1), 0, num_k_heads, num_v_per_k, 1 + ).squeeze(-1) + else: + data_torch = self._reorder_v_heads(data_torch, -1, num_k_heads, num_v_per_k, 1) + + elif ".conv1d" in name: + # Conv1d kernel: reorder only the V channel portion + data = data_torch.squeeze() + qk_channels = head_k_dim * num_k_heads * 2 + qk_part = data[:qk_channels] + v_part = data[qk_channels:] + v_part = self._reorder_v_heads(v_part, 0, num_k_heads, num_v_per_k, head_v_dim) + data_torch = torch.cat([qk_part, v_part], dim=0) + + elif ".out_proj." in name: + # Out projection weight: reorder columns (input dimension) + data_torch = self._reorder_v_heads(data_torch, 1, num_k_heads, num_v_per_k, head_v_dim) + + yield from super().modify_tensors(data_torch, name, bid) + + +class _Qwen35MRopeMixin: + # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers); + # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE + # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always + # written even when a particular checkpoint omits the field in `rope_parameters`. + _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0] + + gguf_writer: gguf.GGUFWriter + rope_parameters: dict + + def set_gguf_parameters(self): + super().set_gguf_parameters() # ty: ignore[unresolved-attribute] + if "mrope_section" not in self.rope_parameters: + self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION) + + +class _Qwen35MtpMixin: + """Shared MTP wiring for Qwen3.5/3.6 text variants. The HF config carries + the MTP block under `mtp_num_hidden_layers` and the tensors under + `mtp.*`; we extend block_count, emit the nextn metadata key, and remap + `mtp.*` to the standard layer-indexed nextn naming so the existing + tensor_map handles them.""" + + hparams: dict[str, Any] + model_arch: gguf.MODEL_ARCH + gguf_writer: gguf.GGUFWriter + block_count: int + tensor_map: gguf.TensorNameMap + no_mtp: bool + mtp_only: bool + _original_block_count: int | None = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + if not self.no_mtp: + self.block_count += self.hparams.get("mtp_num_hidden_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + hparams = {**self.hparams, **self.hparams.get("text_config", {})} + key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None) + type(self)._original_block_count = hparams.get(key) + return super().index_tensors(remote_hf_model_id=remote_hf_model_id) # ty: ignore[unresolved-attribute] + + @classmethod + def filter_tensors(cls, item): + assert cls._original_block_count is not None + # TODO: change TextModel to super() + if (titem := TextModel.filter_tensors(item)) is None: + return None + name, gen = titem + if name.startswith("model.mtp."): + name = name.replace("model.", "", 1) + if name.startswith("mtp."): + if cls.no_mtp: + return None + remapper = { + "fc": "eh_proj", + "pre_fc_norm_embedding": "enorm", + "pre_fc_norm_hidden": "hnorm", + "norm": "shared_head.norm", + } + parts = name.split(".", 3) + if len(parts) == 4 and parts[1] == "layers" and parts[2].isdecimal(): + mtp_idx = int(parts[2]) + name = f"model.layers.{cls._original_block_count + mtp_idx}.{parts[3]}" + elif len(parts) == 3 and parts[1] in remapper: + name = f"model.layers.{cls._original_block_count}.{remapper[parts[1]]}.{parts[2]}" + elif cls.mtp_only: + keep = name in ( + "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight", + "embed_tokens.weight", "norm.weight", + ) + if not keep: + return None + return name, gen + + def set_gguf_parameters(self): + super().set_gguf_parameters() # ty: ignore[unresolved-attribute] + if self.no_mtp: + return + if (n := self.hparams.get("mtp_num_hidden_layers", 0)) > 0: + self.gguf_writer.add_nextn_predict_layers(n) + + def prepare_metadata(self, vocab_only: bool): + from_dir = self.fname_out.is_dir() + super().prepare_metadata(vocab_only=vocab_only) # ty: ignore[unresolved-attribute] + + if not self.mtp_only or not from_dir: + return + + output_type: str = self.ftype.name.partition("_")[2] # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + fname_default: str = gguf.naming_convention( + self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf" + + +@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM") +class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase): + model_arch = gguf.MODEL_ARCH.QWEN35 + + +@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM") +class Qwen3_5MoeTextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase): + model_arch = gguf.MODEL_ARCH.QWEN35MOE diff --git a/conversion/qwen3vl.py b/conversion/qwen3vl.py new file mode 100644 index 00000000000..9f11757697f --- /dev/null +++ b/conversion/qwen3vl.py @@ -0,0 +1,360 @@ +from __future__ import annotations + +import json + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + +from .qwen import Qwen3Model, Qwen3MoeModel +from .qwenvl import Qwen25AudioModel + + +@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration") +class Qwen3VLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams_vision is None: + logger.info("No vision config found, skipping vision tensor processing") + return + + # Compute image_size if not present + if "image_size" not in self.hparams_vision: + # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings + num_pos = self.hparams_vision.get("num_position_embeddings", 2304) + patch_size = self.hparams_vision.get("patch_size", 16) + # num_position_embeddings = (image_size / patch_size) ** 2 + # So image_size = sqrt(num_position_embeddings) * patch_size + image_size = int(num_pos**0.5 * patch_size) + self.hparams_vision["image_size"] = image_size + + # Rename config values for compatibility + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") + self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") + + self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0) + for idx in self.hparams_vision.get("deepstack_visual_indexes", []): + self.is_deepstack_layers[idx] = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + # in case mixed modalities, the arch will be handled by subclass + if not self.has_audio_encoder: + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL) + self.gguf_writer.add_vision_use_gelu(True) + + if self.hparams_vision is not None: + merge_size = self.hparams_vision.get("spatial_merge_size") + if merge_size is not None: + self.gguf_writer.add_vision_spatial_merge_size(int(merge_size)) + + # Use text config's rms_norm_eps for vision attention layernorm eps + rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + + if self.is_deepstack_layers: + self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip text model tensors + if name.startswith("lm_head."): + return None + + # Skip MTP tensors + if name.startswith("mtp."): + return None + + if name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.", 1) + + if not name.startswith("visual."): + return None + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + assert self.hparams_vision is not None + + if name.startswith("visual.deepstack_merger_list."): + prefix, rest = name.split(".", maxsplit=3)[2:] + # prefix is the layer index, convert to absolute clip layer index! + idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)] + target = rest + + tensor_type: gguf.MODEL_TENSOR + if target.startswith("norm."): + tensor_type = gguf.MODEL_TENSOR.V_DS_NORM + suffix = target.split(".", 1)[1] + elif target.startswith("linear_fc1."): + tensor_type = gguf.MODEL_TENSOR.V_DS_FC1 + suffix = target.split(".", 1)[1] + elif target.startswith("linear_fc2."): + tensor_type = gguf.MODEL_TENSOR.V_DS_FC2 + suffix = target.split(".", 1)[1] + else: + raise ValueError(f"Unexpected deepstack tensor: {name}") + + new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}") + yield from super().modify_tensors(data_torch, new_name, bid) + return + + if name.startswith("visual.merger."): + suffix = name.split(".", 2)[2] + if suffix.startswith("linear_fc"): + fc_idx_str, tail = suffix.split(".", 1) + fc_num = int(fc_idx_str.replace("linear_fc", "")) + # Qwen3VL has linear_fc1 and linear_fc2 + # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2) + if fc_num == 1: + fc_idx = 0 + elif fc_num == 2: + fc_idx = 2 + else: + raise ValueError(f"unexpected fc index {fc_num} in {name}") + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}") + elif suffix.startswith("norm."): + new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}") + else: + raise ValueError(f"Unexpected merger tensor: {name}") + yield (new_name, data_torch) + return + + if name == "visual.patch_embed.proj.weight": + # split Conv3D into Conv2Ds along temporal dimension + c1, c2, kt, _, _ = data_torch.shape + del c1, c2 + if kt != 2: + raise ValueError("Current implementation only supports temporal_patch_size of 2") + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) + return + + if name == "visual.patch_embed.proj.bias": + # Include the bias - it's used by the C++ code + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch) + return + + yield from MmprojModel.modify_tensors(self, data_torch, name, bid) + + +@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") +class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel): + has_audio_encoder = True + has_vision_encoder = True + + def get_vision_config(self) -> dict[str, Any] | None: + if self.has_vision_encoder: + return self.global_config["thinker_config"].get("vision_config") + else: + return None + + def get_audio_config(self) -> dict[str, Any] | None: + if self.has_audio_encoder: + return self.global_config["thinker_config"].get("audio_config") + else: + return None + + def set_gguf_parameters(self): + if self.has_vision_encoder: + Qwen3VLVisionModel.set_gguf_parameters(self) + self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL) + if self.has_audio_encoder: + Qwen25AudioModel.set_gguf_parameters(self) + self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip text model tensors + if name.startswith("lm_head."): + return None + + # Skip MTP tensors + if name.startswith("mtp."): + return None + + if name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.", 1) + + if name.startswith("thinker.audio_tower."): + name = name.replace("thinker.audio_tower.", "audio_tower.", 1) + + if "visual." not in name and "audio_tower." not in name: + return None + + return MmprojModel.filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "visual." in name: + if not self.has_vision_encoder: + raise ValueError(f"Model does not have vision encoder, but found tensor {name}") + # need to transform vision tensor naming, so that modify_tensors() logic can be used correctly + name = name.replace("thinker.visual.", "model.visual.") + if ".merger_list." in name: + name = name.replace(".merger_list.", ".deepstack_merger_list.") + name = name.replace(".ln_q", ".norm") + name = name.replace(".mlp.0", ".linear_fc1") + name = name.replace(".mlp.2", ".linear_fc2") + elif ".merger." in name: + name = name.replace(".ln_q", ".norm") + name = name.replace(".mlp.0", ".linear_fc1") + name = name.replace(".mlp.2", ".linear_fc2") + yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid) + elif "audio_tower." in name: + if not self.has_audio_encoder: + raise ValueError(f"Model does not have audio encoder, but found tensor {name}") + if "conv2d" in name and name.endswith(".bias"): + # transform conv2d bias [n_embd] --> [1, 1, n_embd] + data_torch = data_torch.unsqueeze(-1).unsqueeze(-1) + yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) + + +@ModelBase.register("Qwen3ASRForConditionalGeneration") +class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel): + has_audio_encoder = True + has_vision_encoder = False + + +@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration") +class Glm4VVisionModel(Qwen3VLVisionModel): + def set_gguf_parameters(self): + MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters + assert self.hparams_vision is not None + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + + hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() + if hidden_act == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + + rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("visual.merger."): + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3VLForConditionalGeneration") +class Qwen3VLTextModel(Qwen3Model): + model_arch = gguf.MODEL_ARCH.QWEN3VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if "thinker_config" in self.hparams: + vision_config = self.hparams["thinker_config"].get("vision_config", {}) + else: + vision_config = self.hparams.get("vision_config", {}) + deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) + self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("thinker.", "") + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("Qwen3VLMoeForConditionalGeneration") +class Qwen3VLMoeTextModel(Qwen3MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3VLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + vision_config = self.hparams.get("vision_config", {}) + deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) + self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + name = name.replace("thinker.", "") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors + if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): + mapped = f"{name}.weight" if not name.endswith(".weight") else name + permuted = data_torch.permute(0, 2, 1).contiguous() + yield from ModelBase.modify_tensors(self, permuted, mapped, bid) + return + + if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): + if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: + raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") + split_dim = data_torch.shape[-1] // 2 + gate = data_torch[..., :split_dim].contiguous() + up = data_torch[..., split_dim:].contiguous() + # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) + # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} + # Need PyTorch: (128, 768, 2048) [reversed of GGML] + # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) + base_name = name.removesuffix(".weight") + base = base_name.rsplit('.', 1)[0] + mapped_gate = f"{base}.gate_proj.weight" + mapped_up = f"{base}.up_proj.weight" + perm_gate = gate.permute(0, 2, 1).contiguous() + perm_up = up.permute(0, 2, 1).contiguous() + yield from ModelBase.modify_tensors(self, perm_gate, mapped_gate, bid) + yield from ModelBase.modify_tensors(self, perm_up, mapped_up, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") +class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel): + model_arch = gguf.MODEL_ARCH.QWEN3VLMOE + + def set_vocab(self): + super().set_vocab() + # correct BOS/EOS tokens + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + added_tokens = tokenizer_config.get("added_tokens_decoder", {}) + for token_id, data in added_tokens.items(): + if data.get("content") == "<|im_end|>": + self.gguf_writer.add_bos_token_id(int(token_id)) + self.gguf_writer.add_eos_token_id(int(token_id)) + break + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_num_deepstack_layers(0) + + +@ModelBase.register("Qwen3ASRForConditionalGeneration") +class Qwen3ASRTextModel(Qwen3VLTextModel): + model_arch = gguf.MODEL_ARCH.QWEN3VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_num_deepstack_layers(0) + + def set_vocab(self): + super().set_vocab() + # fix chat template, use correct chatml format + self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}") + # correct BOS/EOS tokens + with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_config = json.load(f) + added_tokens = tokenizer_config.get("added_tokens_decoder", {}) + for token_id, data in added_tokens.items(): + if data.get("content") == "<|im_end|>": + self.gguf_writer.add_bos_token_id(int(token_id)) + self.gguf_writer.add_eos_token_id(int(token_id)) + break diff --git a/conversion/qwenvl.py b/conversion/qwenvl.py new file mode 100644 index 00000000000..7befd0c8d81 --- /dev/null +++ b/conversion/qwenvl.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +import numpy as np +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register( + "Qwen2VLModel", + "Qwen2VLForConditionalGeneration", + "Qwen2_5_VLForConditionalGeneration", + "Qwen2_5OmniModel", +) +class Qwen2VLModel(TextModel): + model_arch = gguf.MODEL_ARCH.QWEN2VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("thinker."): + name = name.replace("thinker.", "") + + return super().filter_tensors((name, gen)) + + +@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +class Qwen2VLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) + # rename config.json values + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") + self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") + if "embed_dim" in self.hparams_vision: # qwen2vl + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size") + self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + hparams = self.hparams_vision + model_type = self.global_config['model_type'] + if model_type == 'qwen2_vl': + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL) + elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni': + if model_type == 'qwen2_5_omni': + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) + else: + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) + self.gguf_writer.add_vision_use_silu(True) + # find n_wa_pattern (window attention pattern) + fullatt_block_indexes = hparams.get("fullatt_block_indexes") + assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl" + n_wa_pattern = fullatt_block_indexes[0] + 1 + # validate n_wa_pattern + for i in range(1, len(fullatt_block_indexes)): + if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: + raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}") + self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) + else: + raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("visual."): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # split QKV tensors if needed + if ".qkv." in name: + if data_torch.ndim == 2: # weight + c3, _ = data_torch.shape + else: # bias + c3 = data_torch.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = data_torch[:c] + wk = data_torch[c: c * 2] + wv = data_torch[c * 2:] + yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid) + yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid) + yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid) + elif 'patch_embed.proj.weight' in name: + # split Conv3D into Conv2Ds + c1, c2, kt, kh, kw = data_torch.shape + del c1, c2, kh, kw # unused + assert kt == 2, "Current implementation only support temporal_patch_size of 2" + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) + else: + yield from super().modify_tensors(data_torch, name, bid) + + +class Qwen25AudioModel(MmprojModel): + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_audio is not None + self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] + self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_audio is not None + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5)) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # SinusoidsPositionEmbedding + assert self.hparams_audio is not None + max_timescale = 10000 + length = 1500 + channels = self.hparams_audio["hidden_size"] + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float()) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] + pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32) + yield ("audio_tower.embed_positions.weight", pos_embd) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + yield from MmprojModel.modify_tensors(self, data_torch, name, bid) + + +@ModelBase.register("Qwen2_5OmniModel") +class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel): + has_audio_encoder = True + has_vision_encoder = True + + def get_vision_config(self) -> dict[str, Any] | None: + return self.global_config["thinker_config"].get("vision_config") + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config["thinker_config"].get("audio_config") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if not name.startswith("visual.") and not name.startswith("audio_tower."): + return None + + if name.startswith("thinker."): + name = name.replace("thinker.", "") + + if "audio_bos_eos_token" in name: + # this tensor is left unused in transformers code + # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 + return None + + return MmprojModel.filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "visual." in name: + yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid) + elif "audio_tower." in name: + yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) + return # skip other tensors diff --git a/conversion/refact.py b/conversion/refact.py new file mode 100644 index 00000000000..1170cddeb2c --- /dev/null +++ b/conversion/refact.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("GPTRefactForCausalLM") +class RefactModel(TextModel): + model_arch = gguf.MODEL_ARCH.REFACT + + def set_vocab(self): + super().set_vocab() + + # TODO: how to determine special FIM tokens automatically? + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot']) + special_vocab._set_special_token("prefix", 1) + special_vocab._set_special_token("suffix", 3) + special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + # refact uses Alibi. So this is from config.json which might be used by training. + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + + if bid is not None: + if name == f"transformer.h.{bid}.attn.kv.weight": + yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return + if name == f"transformer.h.{bid}.attn.q.weight": + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + return + if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/rwkv.py b/conversion/rwkv.py new file mode 100644 index 00000000000..2de0aa5346e --- /dev/null +++ b/conversion/rwkv.py @@ -0,0 +1,302 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("Rwkv6ForCausalLM") +class Rwkv6Model(TextModel): + model_arch = gguf.MODEL_ARCH.RWKV6 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def set_gguf_parameters(self): + head_size = self.hparams["head_size"] + hidden_size = self.hparams["hidden_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + rescale_every_n_layers = self.hparams["rescale_every"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) + time_mix_extra_dim = 64 if hidden_size == 4096 else 32 + time_decay_extra_dim = 128 if hidden_size == 4096 else 64 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): + data_torch = data_torch.transpose(0, 1) + + if new_name.endswith("time_mix_w2.weight"): + data_torch = data_torch.permute(0, 2, 1) + + if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: + data_torch = data_torch.squeeze() + + try: + rescale_every_n_layers = self.hparams["rescale_every"] + if rescale_every_n_layers > 0: + if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): + data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + except KeyError: + pass + + # concat time_mix_lerp weights to reduce some cpu overhead + # also reduces the number of tensors in the model + if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: + try: + self.lerp_weights[bid][new_name] = data_torch + except KeyError: + self.lerp_weights[bid] = {new_name: data_torch} + if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) + yield (new_name, data) + return + + yield (new_name, data_torch) + + +@ModelBase.register("RWKV6Qwen2ForCausalLM") +class RWKV6Qwen2Model(Rwkv6Model): + model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + num_attention_heads = self.hparams["num_attention_heads"] + num_key_value_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + head_size = hidden_size // num_attention_heads + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32) + time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # special parameters for time_mixing in RWKV6QWEN2 + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_token_shift_count(1) + # RWKV6QWEN2 use grouped key/value like GQA + self.gguf_writer.add_head_count_kv(num_key_value_heads) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + for new_name, data in super().modify_tensors(data_torch, name, bid): + if "time_mix_w1" in new_name or "time_mix_w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg + # permute them here to avoid code changes + data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) + if "w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + yield (new_name, data) + continue + yield (new_name, data) + + +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(TextModel): + model_arch = gguf.MODEL_ARCH.RWKV7 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def calc_lora_rank(self, hidden_size, exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + + def set_gguf_parameters(self): + try: + head_size = self.hparams["head_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + except KeyError: + head_size = self.hparams["head_dim"] + layer_norm_eps = self.hparams["norm_eps"] + hidden_size = self.hparams["hidden_size"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) + + # ICLR: In-Context-Learning-Rate + try: + lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + except KeyError: + lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + lora_needs_transpose: bool = True + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # unify tensor names here to make life easier + name = name.replace("blocks", "layers").replace("ffn", "feed_forward") + name = name.replace("self_attn", "attention").replace("attn", "attention") + name = name.replace("time_mixer.", "") + + name = name.replace("feed_forward_norm", "ln2") + name = name.replace("g_norm", "ln_x") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # lora layer names in fla-hub's impl + if "_lora.lora" in name: + self.lora_needs_transpose = False + name = name.replace("_lora.lora.0.weight", "1.weight") + name = name.replace("_lora.lora.2.weight", "2.weight") + name = name.replace("_lora.lora.2.bias", "0.weight") + + if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: + # some models have dummy v0/v1/v2 on first layer while others don't + # ignore them all since they are not used + return + + wkv_has_gate = self.hparams.get("wkv_has_gate", True) + lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] + + if bid is not None and "attention.x_" in name: + if "attention.x_x" in name: + # already concatenated + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = data_torch.reshape(len(lerp_list), 1, 1, -1) + yield (new_name, data) + else: + try: + self.lerp_weights[bid][name] = data_torch + except KeyError: + self.lerp_weights[bid] = {name: data_torch} + if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) + yield (new_name, data) + return + else: + data_torch = data_torch.squeeze() + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if self.lora_needs_transpose and any( + new_name.endswith(t) for t in [ + "time_mix_w1.weight", "time_mix_w2.weight", + "time_mix_a1.weight", "time_mix_a2.weight", + "time_mix_v1.weight", "time_mix_v2.weight", + "time_mix_g1.weight", "time_mix_g2.weight", + ] + ): + data_torch = data_torch.transpose(0, 1) + + if 'r_k' in new_name: + data_torch = data_torch.flatten() + + if bid == 0 and "time_mix_a" in new_name: + # dummy v0/v1/v2 on first layer + # easiest way to make llama happy + yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) + + yield (new_name, data_torch) + + +@ModelBase.register("RwkvHybridForCausalLM") +class ARwkv7Model(Rwkv7Model): + model_arch = gguf.MODEL_ARCH.ARWKV7 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + hidden_size = self.hparams["hidden_size"] + head_size = self.hparams["head_size"] + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + wkv_has_gate = self.hparams["wkv_has_gate"] + assert self.hparams["wkv_version"] == 7 + + # ICLR: In-Context-Learning-Rate + lora_rank_decay = 64 + lora_rank_iclr = 64 + lora_rank_value_residual_mix = 32 + lora_rank_gate = 128 if wkv_has_gate else 0 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_token_shift_count(1) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) diff --git a/conversion/sarashina2.py b/conversion/sarashina2.py new file mode 100644 index 00000000000..05448db812e --- /dev/null +++ b/conversion/sarashina2.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, gguf + +from .llama import LlamaModel +from .qwenvl import Qwen2VLVisionModel + + +@ModelBase.register("Sarashina2VisionForCausalLM") +class Sarashina2VLTextModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + if name.startswith("llm."): + name = name.replace("llm.", "", 1) + elif name.startswith("norm."): + return None + return super().filter_tensors((name, gen)) + + +@ModelBase.register("Sarashina2VisionForCausalLM") +class Sarashina2VLVisionModel(Qwen2VLVisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.global_config['model_type'] = "qwen2_vl" diff --git a/conversion/smallthinker.py b/conversion/smallthinker.py new file mode 100644 index 00000000000..1b0f79aa3ea --- /dev/null +++ b/conversion/smallthinker.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("SmallThinkerForCausalLM") +class SmallThinkerModel(TextModel): + model_arch = gguf.MODEL_ARCH.SMALLTHINKER + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("moe_num_primary_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (n_experts_used := self.hparams.get("moe_num_active_primary_experts")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + self.gguf_writer.add_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + if (self.hparams.get('moe_primary_router_apply_softmax')): + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + sliding_window_layout = self.hparams.get("sliding_window_layout") + if sliding_window_layout: + for i in sliding_window_layout: + if i != 0: + sliding_window = self.hparams.get("sliding_window_size") + if sliding_window: + self.gguf_writer.add_sliding_window(sliding_window) + break + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams.get("moe_num_primary_experts") or self.find_hparam(["num_local_experts", "num_experts"]) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down", "gate", "up"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + return + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/smolvlm.py b/conversion/smolvlm.py new file mode 100644 index 00000000000..30e9dca329b --- /dev/null +++ b/conversion/smolvlm.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf + + +@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") +class SmolVLMModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams["model_type"] == "smolvlm_vision": + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + + # Add the preprocessor longest edge size + preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size) + self.gguf_writer.add_vision_preproc_image_size(preproc_image_size) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".embeddings." in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if not is_vision_tensor: + return None + + return super().filter_tensors(item) diff --git a/conversion/stablelm.py b/conversion/stablelm.py new file mode 100644 index 00000000000..ba5e9aa6ca9 --- /dev/null +++ b/conversion/stablelm.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(TextModel): + model_arch = gguf.MODEL_ARCH.STABLELM + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + + def set_gguf_parameters(self): + hparams = self.hparams + + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + self.gguf_writer.add_file_type(self.ftype) + + _q_norms: list[dict[str, Tensor]] | None = None + _k_norms: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + + if name.find("q_layernorm.norms") != -1: + assert bid is not None + + if self._q_norms is None: + self._q_norms = [{} for _ in range(self.block_count)] + + self._q_norms[bid][name] = data_torch + + if len(self._q_norms[bid]) >= n_head: + return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + else: + return + + if name.find("k_layernorm.norms") != -1: + assert bid is not None + + if self._k_norms is None: + self._k_norms = [{} for _ in range(self.block_count)] + + self._k_norms[bid][name] = data_torch + + if len(self._k_norms[bid]) >= n_kv_head: + return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + else: + return + + yield from super().modify_tensors(data_torch, name, bid) + + def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): + datas: list[Tensor] = [] + # extract the norms in order + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + datas.append(norms[ename]) + del norms[ename] + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + if self._q_norms is not None or self._k_norms is not None: + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` + norms = ( + [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] + ) + ( + [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] + ) + if len(norms) > 0: + raise ValueError(f"Unprocessed norms: {norms}") diff --git a/conversion/starcoder.py b/conversion/starcoder.py new file mode 100644 index 00000000000..0b4ffd84702 --- /dev/null +++ b/conversion/starcoder.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("GPTBigCodeForCausalLM") +class StarCoderModel(TextModel): + model_arch = gguf.MODEL_ARCH.STARCODER + + def set_gguf_parameters(self): + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +@ModelBase.register("Starcoder2ForCausalLM") +class StarCoder2Model(TextModel): + model_arch = gguf.MODEL_ARCH.STARCODER2 diff --git a/conversion/step3.py b/conversion/step3.py new file mode 100644 index 00000000000..ba867fb831b --- /dev/null +++ b/conversion/step3.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +import math +import re + +from typing import Callable, Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, _MISTRAL_COMMON_DATASET_MEAN, _MISTRAL_COMMON_DATASET_STD, gguf + +from .qwen import Qwen3Model + + +@ModelBase.register("StepVLForConditionalGeneration") +class Step3VLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + + if not self.hparams_vision.get("intermediate_size"): + hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0 + assert hidden_size > 0 + mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536)) + self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) + + self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN)) + self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + projector_stride = int(self.global_config.get("understand_projector_stride", -1)) + hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1))) + num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1))) + assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), ( + "current Step3-VL conversion path is only validated for Step3-VL-10B" + ) + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL) + self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5))) + self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2) + # 3024 max resize comes from step3-vl-10b processing_step3.py. + self.gguf_writer.add_vision_preproc_image_size(3024) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + if ("mm.0." in new_name or "mm.1." in new_name) and new_name.endswith(".weight"): + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("model.", "lm_head.")): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("vision_model.vit_downsampler"): + match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name) + if match is None: + raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}") + + proj_id = int(match.group(1)) - 1 + suffix = f".{match.group(2)}" + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch) + return + + if name == "vit_large_projector.weight": + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch) + return + + if name.startswith("vision_model."): + if name == "vision_model.positional_embedding": + name += ".weight" + elif name.endswith(".gamma") and ".ls_" in name: + name = name.removesuffix(".gamma") + ".weight" + + name = name.replace("attn.in_proj_weight", "attn.in_proj.weight") + name = name.replace("attn.in_proj_bias", "attn.in_proj.bias") + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("StepVLForConditionalGeneration") +class Step3VLTextModel(Qwen3Model): + model_arch = gguf.MODEL_ARCH.QWEN3 + + +@ModelBase.register("Step3p5ForCausalLM") +class Step35Model(TextModel): + model_arch = gguf.MODEL_ARCH.STEP35 + + def set_gguf_parameters(self): + rope_theta = self.hparams.get("rope_theta") + if isinstance(rope_theta, list): + self.hparams["rope_theta"] = float(rope_theta[0]) + self.hparams["local_rope_theta"] = float(rope_theta[1]) + self.rope_parameters["rope_theta"] = self.hparams["rope_theta"] + self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]} + + super().set_gguf_parameters() + + layer_types = self.hparams.get("layer_types") or [] + partial_rotary_factors = self.hparams.get("partial_rotary_factors") or [] + attn_other = self.hparams.get("attention_other_setting") or {} + + n_head_base = self.hparams["num_attention_heads"] + n_kv_base = self.hparams["num_attention_groups"] + + n_head_swa = attn_other.get("num_attention_heads", n_head_base) + n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) + + layer_types = layer_types[: self.block_count] + partial_rotary_factors = partial_rotary_factors[: self.block_count] + assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors + head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] + kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] + swa_pat = [lt == "sliding_attention" for lt in layer_types] + + self.gguf_writer.add_head_count(head_arr) + self.gguf_writer.add_head_count_kv(kv_arr) + + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_sliding_window_pattern(swa_pat) + + self.gguf_writer.add_value_length(self.hparams["head_dim"]) + + # MoE params + self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"]) + + if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor) + if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None: + self.gguf_writer.add_expert_weights_norm(norm_expert_weight) + + # leading dense blocks + leading_dense = 0 + moe_layers_enum = self.hparams.get("moe_layers_enum") + if isinstance(moe_layers_enum, str) and moe_layers_enum.strip(): + moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(",")) + if moe_layers: + leading_dense = max(0, moe_layers[0]) + self.gguf_writer.add_leading_dense_block_count(leading_dense) + self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1))) + + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) + + # Optional per-layer SwiGLU clamps. + if (limits := self.hparams.get("swiglu_limits")) is not None: + limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]] + self.gguf_writer.add_swiglu_clamp_exp(limits_f) + if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None: + limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]] + self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Map router bias (expert selection bias) to a GGUF bias tensor + if name.endswith(".moe.router_bias"): + name += ".bias" + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # remove mtp layers + if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None: + il = int(m.group(1)) + n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) + if il >= n_main: + return + if name.endswith("norm.weight"): + data_torch += 1.0 + + if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): + data_torch = data_torch.squeeze().contiguous() + + yield from super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3"). + # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS). + rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) + rope_type = rope_params.get("rope_type") or "" + if rope_type.lower() != "llama3": + return + + # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value. + rope_theta = self.hparams.get("rope_theta", 10000.0) + if isinstance(rope_theta, list): + rope_theta = rope_theta[0] + base = float(rope_theta) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + dim = int(dim) + + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = float(rope_params.get("factor", 8.0)) + low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) + high_freq_factor = float(rope_params.get("high_freq_factor", 4.0)) + old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192))) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + rope_factors: list[float] = [] + for freq in freqs: + wavelen = 2 * math.pi / float(freq) + if wavelen < high_freq_wavelen: + rope_factors.append(1.0) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) diff --git a/conversion/t5.py b/conversion/t5.py new file mode 100644 index 00000000000..73dcfd1a2ce --- /dev/null +++ b/conversion/t5.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import json +import os + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger + + +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") +@ModelBase.register("UMT5Model") +class T5Model(TextModel): + model_arch = gguf.MODEL_ARCH.T5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.block_count) + if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None: + self.gguf_writer.add_decoder_block_count(dec_n_layer) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("T5EncoderModel") +class T5EncoderModel(TextModel): + model_arch = gguf.MODEL_ARCH.T5ENCODER + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.shared_token_embeddings_found = False + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + # many older models use spiece.model tokenizer model filename + if not tokenizer_path.is_file(): + tokenizer_path = self.dir_model / 'spiece.model' + + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct + assert tokenizer_path.name == 'tokenizer.model' + return self._set_vocab_sentencepiece() + else: + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + for key in added_tokens_json: + token_id = added_tokens_json[key] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: + logger.warning("Couldn't find context length in config.json, assuming default value of 512") + n_ctx = 512 + self.gguf_writer.add_context_length(n_ctx) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_head_count(self.hparams["num_heads"]) + self.gguf_writer.add_key_length(self.hparams["d_kv"]) + self.gguf_writer.add_value_length(self.hparams["d_kv"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. + if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: + if not self.shared_token_embeddings_found: + name = "shared.weight" + self.shared_token_embeddings_found = True + else: + logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") + return + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/talkie.py b/conversion/talkie.py new file mode 100644 index 00000000000..a970b32d3bf --- /dev/null +++ b/conversion/talkie.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from typing import Iterable, TYPE_CHECKING + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +from .base import LazyTorchTensor, ModelBase, TextModel, gguf + + +@ModelBase.register("TalkieForCausalLM") +class TalkieModel(TextModel): + model_arch = gguf.MODEL_ARCH.TALKIE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + # Talkie used F.rms_norm without an explicit eps + self.gguf_writer.add_layer_norm_rms_eps(torch.finfo(torch.float32).eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + prefix = f"model.blocks.{bid}." if bid is not None else "" + suffix = name.removeprefix(prefix) + + if suffix == "attn_gain.a_g": + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, ".scale"), data_torch + return + elif suffix == "mlp_gain.a_g": + yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, ".scale"), data_torch + return + elif suffix == "lm_head_gain.w_g": + self.gguf_writer.add_logit_scale(LazyTorchTensor.to_eager(data_torch).item()) + return + elif suffix in ("attn.attn_query.weight", "attn.attn_key.weight"): + # absorb inverse rope + head_dim = self.hparams["head_dim"] + shape = data_torch.shape + data_torch = torch.reshape(data_torch, (-1, head_dim, shape[-1])) + signs = torch.ones((1, head_dim, 1), dtype=data_torch.dtype) + signs[:, head_dim // 2 :, :] = -1 + if self.lazy: + signs = LazyTorchTensor.from_eager(signs) + # (n_head, head_dim, n_in) -> (n_out, n_in) + data_torch = torch.reshape(data_torch * signs, shape) + elif suffix == "attn.head_gain.head_g": + # allow head gain to broadcast + data_torch = data_torch.unsqueeze(-1) + + if not name.endswith(".weight"): + name += ".weight" + + yield from super().modify_tensors(data_torch, name, bid) diff --git a/conversion/ultravox.py b/conversion/ultravox.py new file mode 100644 index 00000000000..347188733a5 --- /dev/null +++ b/conversion/ultravox.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, TextModel, gguf + + +@ModelBase.register("UltravoxModel") +class UltravoxModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLAMA # dummy + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument") + + +@ModelBase.register("GlmasrModel") +class GlmASRWhisperEncoderModel(MmprojModel): + has_vision_encoder = False + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: + self.hparams["hidden_size"] = self.hparams["d_model"] + self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] + self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA) + self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith(("model.", "lm_head.")): + # skip language model tensors + return None + + if name.startswith("audio_encoder.whisper."): + name = name.replace("audio_encoder.whisper.","audio_tower.") + if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name: + name = name.replace("audio_encoder.", "audio_encoder.adapting.") + if name.startswith("audio_encoder.adapting."): + name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.") + if ".layer_norm." in name: + name = name.replace(".layer_norm.", ".ln_pre.") + if ".0." in name: + name = name.replace(".0.", ".linear_1.") + if ".2." in name: + name = name.replace(".2.", ".linear_2.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("audio_encoder.audio_bos_eos_token."): + yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid) + yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid) + return + + if name.startswith("audio_encoder.adapting."): + if ".proj." in name: + return + + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen2AudioForConditionalGeneration") +class WhisperEncoderModel(MmprojModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: + self.hparams["hidden_size"] = self.hparams["d_model"] + self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] + self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # prevent clash naming with vision tensors + if name.startswith("multi_modal_projector"): + name = "audio." + name + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("UltravoxModel") +class UltravoxWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX) + self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) + + +@ModelBase.register("MERaLiON2ForConditionalGeneration") +class MERaLiONWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False + has_audio_encoder = True + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("speech_config") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION) + self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15)) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if name.startswith("text_decoder."): + return None + + if name.startswith("speech_encoder."): + name = name.replace("speech_encoder.", "audio_tower.") + + return super().filter_tensors((name, gen)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + suffix = "." + name.rsplit(".", 1)[-1] + + if name.startswith("ln_speech."): + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch) + return + + if name.startswith("speech_audio_adapter."): + if ".mlp_adapter.0." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch) + elif ".gate_proj." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch) + elif ".pool_proj." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch) + elif ".out_proj." in name: + yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch) + return + + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("VoxtralForConditionalGeneration") +class VoxtralWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL) + self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size + + +@ModelBase.register("AudioFlamingo3ForConditionalGeneration") +class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + # Was trained in BF16, being safe, avoiding quantizing to FP16 + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) diff --git a/conversion/wavtokenizer.py b/conversion/wavtokenizer.py new file mode 100644 index 00000000000..7d25447be88 --- /dev/null +++ b/conversion/wavtokenizer.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Callable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf, logger + + +@ModelBase.register("WavTokenizerDec") +class WavTokenizerDecModel(TextModel): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + if \ + name.endswith("codebook.cluster_size") or \ + name.endswith("codebook.embed_avg") or \ + name.endswith("codebook.inited"): + logger.debug(f"Skipping {name!r}") + return None + + return super().filter_tensors(item) + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) + self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) + + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + + self.gguf_writer.add_causal_attention(False) diff --git a/conversion/xverse.py b/conversion/xverse.py new file mode 100644 index 00000000000..fa8a31a133f --- /dev/null +++ b/conversion/xverse.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import re + +from typing import Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import ModelBase, TextModel, gguf + + +@ModelBase.register("XverseForCausalLM") +class XverseModel(TextModel): + model_arch = gguf.MODEL_ARCH.XVERSE + + def set_vocab(self): + assert (self.dir_model / "tokenizer.json").is_file() + dir_model = self.dir_model + hparams = self.hparams + + tokens: list[bytes] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. + max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute] + if max_vocab_index >= vocab_size: + raise ValueError("Vocabulary size exceeds expected maximum size.") + + reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + + for token_id in range(vocab_size): + token_text = reverse_vocab[token_id].encode('utf-8') + # replace "\x00" to string with length > 0 + if token_text == b"\x00": + toktype = gguf.TokenType.BYTE # special + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + toktype = gguf.TokenType.BYTE # special + elif reverse_vocab[token_id] in added_vocab: + if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute] + toktype = gguf.TokenType.CONTROL + else: + toktype = gguf.TokenType.USER_DEFINED + else: + toktype = gguf.TokenType.NORMAL + + tokens.append(token_text) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith("q_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + if name.endswith("k_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + + yield from super().modify_tensors(data_torch, name, bid) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) diff --git a/conversion/youtuvl.py b/conversion/youtuvl.py new file mode 100644 index 00000000000..cabc44445f3 --- /dev/null +++ b/conversion/youtuvl.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from typing import Callable, Iterable, TYPE_CHECKING + +if TYPE_CHECKING: + from torch import Tensor + +from .base import MmprojModel, ModelBase, gguf, logger + + +@ModelBase.register("YoutuVLForConditionalGeneration") +class YoutuVLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + + # Handle activation function + hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower() + if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"): + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + else: + raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}") + + self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2)) + + window_size = self.hparams.get("window_size") + if window_size is not None: + self.gguf_writer.add_vision_window_size(window_size) + # fullatt_block_indexes contains explicit layer indices that use full attention + # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention + # All other layers use window attention + fullatt_block_indexes = self.hparams.get("fullatt_block_indexes") + assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl" + # Store the explicit layer indices for YoutuVL (irregular pattern approach) + self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + + # Skip language model tensors + skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.') + if name.startswith(skip_prefixes): + return None + + return super().filter_tensors(item) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Try to map the tensor using TensorNameMap (handles vision encoder and projector) + try: + yield from super().modify_tensors(data_torch, name, bid) + except ValueError: + # If mapping fails, log warning and skip + logger.warning(f"Cannot map tensor: {name}") + return diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8835491af0f..85527553563 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3,14064 +3,46 @@ from __future__ import annotations -import ast -import logging -import argparse -import contextlib -import json -import os -import re -import sys -from enum import IntEnum -from pathlib import Path -from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast -from itertools import chain -from transformers import AutoConfig - -import math -import numpy as np -import torch - -if TYPE_CHECKING: - from torch import Tensor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf -from gguf.vocab import MistralTokenizerType, MistralVocab - -try: - from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import] - from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import] - from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import] - SentencePieceTokenizer, - ) - - _mistral_common_installed = True - _mistral_import_error_msg = "" -except ImportError: - _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) - _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) - - _mistral_common_installed = False - TokenizerVersion: Any = None - Tekkenizer: Any = None - SentencePieceTokenizer: Any = None - _mistral_import_error_msg = ( - "Mistral format requires `mistral-common` to be installed. Please run " - "`pip install mistral-common[image,audio]` to install it." - ) - - -logger = logging.getLogger("hf-to-gguf") - - -###### MODEL DEFINITIONS ###### - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class ModelType(IntEnum): - TEXT = 1 - MMPROJ = 2 - - -AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") - - -class ModelBase: - _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { - ModelType.TEXT: {}, - ModelType.MMPROJ: {}, - } - - dir_model: Path - ftype: gguf.LlamaFileType - fname_out: Path - is_big_endian: bool - endianess: gguf.GGUFEndian - use_temp_file: bool - lazy: bool - dry_run: bool - hparams: dict[str, Any] - model_tensors: dict[str, Callable[[], Tensor]] - gguf_writer: gguf.GGUFWriter - model_name: str | None - metadata_override: Path | None - dir_model_card: Path - remote_hf_model_id: str | None - - # subclasses should define this! - model_arch: gguf.MODEL_ARCH - - # subclasses should initialize this! - block_count: int - tensor_map: gguf.TensorNameMap - - # Mistral format specifics - is_mistral_format: bool = False - disable_mistral_community_chat_template: bool = False - sentence_transformers_dense_modules: bool = False - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, - use_temp_file: bool = False, eager: bool = False, - metadata_override: Path | None = None, model_name: str | None = None, - split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, - disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False, - fuse_gate_up_exps: bool = False): - if type(self) is ModelBase or \ - type(self) is TextModel or \ - type(self) is MmprojModel: - raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") - - if self.is_mistral_format and not _mistral_common_installed: - raise ImportError(_mistral_import_error_msg) - - self.dir_model = dir_model - self.ftype = ftype - self.fname_out = fname_out - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.use_temp_file = use_temp_file - self.lazy = not eager or (remote_hf_model_id is not None) - self.dry_run = dry_run - self.remote_hf_model_id = remote_hf_model_id - self.sentence_transformers_dense_modules = sentence_transformers_dense_modules - self.fuse_gate_up_exps = fuse_gate_up_exps - self._gate_exp_buffer: dict[int, Tensor] = {} - self._up_exp_buffer: dict[int, Tensor] = {} - self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams - self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) - self.metadata_override = metadata_override - self.model_name = model_name - self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py - self._is_nvfp4 = False - self._is_mxfp4 = False - - # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype - # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. - if self.ftype == gguf.LlamaFileType.GUESSED: - for _, tensor in self.get_tensors(): - if tensor.dim() < 2: - continue - - if tensor.dtype == torch.bfloat16: - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16") - break - elif tensor.dtype == torch.float16: - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - logger.info("heuristics detected float16 tensor dtype, setting --outtype f16") - break - else: - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16") - - # Configure GGUF Writer - self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, - split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) - - # Mistral specific - self.disable_mistral_community_chat_template = disable_mistral_community_chat_template - - @classmethod - def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: - stem, suffix = path.stem, path.suffix - new_name = f"{prefix}{stem}{suffix}" - return path.with_name(new_name) - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in self.hparams), None) - if key is not None: - return self.hparams[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: - tensors: dict[str, Callable[[], Tensor]] = {} - - if remote_hf_model_id is not None: - is_safetensors = True - - logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) - for name, remote_tensor in remote_tensors.items(): - data_gen = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r) # noqa: E731 - if titem := self.filter_tensors((name, data_gen)): - tname, tgen = titem - tensors[tname] = tgen - - return tensors - - prefix = "model" if not self.is_mistral_format else "consolidated" - part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") - is_safetensors: bool = len(part_names) > 0 - if not is_safetensors: - part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - - tensor_names_from_index: set[str] = set() - tensor_names_from_parts: set[str] = set() - - if not self.is_mistral_format: - index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin" - index_name += ".index.json" - index_file = self.dir_model / index_name - - if index_file.is_file(): - logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(index_file, "r", encoding="utf-8") as f: - index: dict[str, Any] = json.load(f) - weight_map = index.get("weight_map") - if weight_map is None or not isinstance(weight_map, dict): - raise ValueError(f"Can't load 'weight_map' from {index_name!r}") - tensor_names_from_index.update(weight_map.keys()) - part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment] - part_names = sorted(part_dict.keys()) - else: - weight_map = {} - else: - weight_map = {} - - for part_name in part_names: - logger.info(f"gguf: indexing model part '{part_name}'") - ctx: ContextManager[Any] - if is_safetensors: - ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name)) - else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) - - with ctx as model_part: - assert model_part is not None - - for name in model_part.keys(): - tensor_names_from_parts.add(name) - if is_safetensors: - data: gguf.utility.LocalTensor = model_part[name] - if self.lazy: - data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731 - else: - dtype = LazyTorchTensor._dtype_str_map[data.dtype] - data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731 - else: - data_torch: Tensor = model_part[name] - if self.lazy: - data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731 - else: - data_gen = lambda data=data_torch: data # noqa: E731 - if titem := self.filter_tensors((name, data_gen)): - tname, tgen = titem - tensors[tname] = tgen - - # verify tensor name presence and identify potentially missing files - if len(tensor_names_from_index) > 0: - if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0: - missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts)) - extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index)) - missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) - if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}\n" - f"Missing tensors: {missing}") - else: - raise ValueError("Mismatch between weight map and model parts for tensor names:\n" - f"Missing tensors: {missing}\n" - f"Extra tensors: {extra}") - - return tensors - - @staticmethod - def _scale_is_trivial(scale: Tensor) -> bool: - return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6 - - def _write_scale_tensor(self, scale_name: str, scale: Tensor): - if not self._scale_is_trivial(scale): - scale_f32 = scale.float().numpy().flatten() - logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])") - self.gguf_writer.add_tensor(scale_name, scale_f32) - - def _write_scales_tensor(self, scale_name: str, scales: list[float]): - if not np.allclose(scales, 1.0, atol=1e-6): - scale_vals = np.array(scales, dtype=np.float32) - logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])") - self.gguf_writer.add_tensor(scale_name, scale_vals) - - def dequant_model(self): - # If all quantized tensors were already handled (e.g. pure NVFP4), skip - if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors): - return - - tensors_to_remove: list[str] = [] - new_tensors: dict[str, Callable[[], Tensor]] = {} - - if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict): - quant_method = quant_config.get("quant_method") - - def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor: - weight = weight.view(torch.uint8) - orig_shape = weight.shape - - shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape))))) - data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift - data = data & 3 - data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:])) - - # The scale is inverted - return data / scale.float() - - def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor: - scale = scale.float() - - if block_size is not None: - dim_offset = scale.ndim - len(block_size) - for i, size in enumerate(block_size): - scale = scale.repeat_interleave(size, dim_offset + i) - # unpad the scale (e.g. when the tensor size isn't a multiple of the block size) - scale = scale[tuple(slice(0, size) for size in weight.shape)] - - # align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1]) - while scale.ndim < weight.ndim: - scale = scale.unsqueeze(-1) - - return weight.float() * scale - - # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476 - def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor: - bits = quant_config["bits"] - assert bits in (2, 3, 4, 8) - assert qweight.dtype == qzeros.dtype - maxq = (2 ** bits) - 1 - weight = None - zeros = None - pack_dtype_bits = qweight.dtype.itemsize * 8 - - if bits in [2, 4, 8]: - pack_factor = pack_dtype_bits // bits - wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0) - if self.lazy: - wf = LazyTorchTensor.from_eager(wf) - - zeros = torch.bitwise_right_shift( - qzeros.unsqueeze(2).expand(-1, -1, pack_factor), - wf.unsqueeze(0) - ).to(torch.int16 if bits == 8 else torch.int8) - zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape) - - weight = torch.bitwise_and( - torch.bitwise_right_shift( - qweight.unsqueeze(1).expand(-1, pack_factor, -1), - wf.unsqueeze(-1) - ).to(torch.int16 if bits == 8 else torch.int8), - maxq - ) - elif bits == 3: - raise NotImplementedError("3-bit gptq dequantization is not yet implemented") - - assert weight is not None - assert zeros is not None - - weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) - - # gptq_v2 doesn't need to offset zeros - if quant_config.get("checkpoint_format", "gptq") == "gptq": - zeros += 1 - - return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T - - def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int): - assert w.dtype == torch.int32 - shape = tuple(shape_tensor.tolist()) - assert len(shape) == 2 - mask = (1 << num_bits) - 1 - - shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32) - if self.lazy: - shifts = LazyTorchTensor.from_eager(shifts) - - if zero_point is None: - offset = 1 << (num_bits - 1) - else: - assert len(zero_point.shape) == 2 - offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask - offset = offset.reshape(-1, zero_point.shape[1]) - # trim padding, and prepare for broadcast - # NOTE: the zero-point is packed along dim 0 - offset = offset[:shape[0], :].unsqueeze(-1) - - # extract values - # NOTE: the weights are packed along dim 1 - unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask - unpacked = unpacked.reshape(shape[0], -1) - - # trim padding - unpacked = unpacked[:, :shape[1]] - - # prepare for broadcast of the scale - unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size) - unpacked = unpacked - offset - - return (unpacked * scale.unsqueeze(-1).float()).reshape(shape) - - if quant_method == "bitnet": - for name in self.model_tensors.keys(): - if name.endswith(".weight_scale"): - weight_name = name.removesuffix("_scale") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s()) - tensors_to_remove.append(name) - elif quant_method == "fp8": - block_size = quant_config.get("weight_block_size") - for name in self.model_tensors.keys(): - if name.endswith("_scale_inv"): - weight_name = name.removesuffix("_scale_inv") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) - tensors_to_remove.append(name) - if name.endswith(".activation_scale"): # unused - tensors_to_remove.append(name) - if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused - tensors_to_remove.append(name) - # mistral format - if name.endswith(".qscale_weight"): - weight_name = name.removesuffix("qscale_weight") + "weight" - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs) - tensors_to_remove.append(name) - if name.endswith(".qscale_act"): - tensors_to_remove.append(name) - elif quant_method == "gptq": - for name in self.model_tensors.keys(): - if name.endswith(".qweight"): - base_name = name.removesuffix(".qweight") - g_idx = self.model_tensors[base_name + ".g_idx"] - qweight = self.model_tensors[base_name + ".qweight"] - qzeros = self.model_tensors[base_name + ".qzeros"] - scales = self.model_tensors[base_name + ".scales"] - new_tensors[base_name + ".weight"] = ( - lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq( - g(), w(), z(), s() - ) - ) - tensors_to_remove += [ - base_name + n - for n in ( - ".g_idx", - ".qzeros", - ".qweight", - ".scales", - ) - ] - elif quant_method == "compressed-tensors": - quant_format = quant_config["format"] - groups = quant_config["config_groups"] - if len(groups) > 1: - raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet") - weight_config = tuple(groups.values())[0]["weights"] - - if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized": - block_size = weight_config.get("block_structure", None) - strategy = weight_config.get("strategy") - assert strategy == "channel" or strategy == "block" - assert weight_config.get("group_size") is None # didn't find a model using this yet - for name in self.model_tensors.keys(): - if name.endswith(".weight_scale"): - weight_name = name.removesuffix("_scale") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size) - tensors_to_remove.append(name) - elif quant_format == "pack-quantized": - assert weight_config.get("strategy") == "group" - assert weight_config.get("type", "int") == "int" - num_bits = weight_config.get("num_bits") - group_size = weight_config.get("group_size") - assert isinstance(num_bits, int) - assert isinstance(group_size, int) - for name in self.model_tensors.keys(): - if name.endswith(".weight_packed"): - base_name = name.removesuffix("_packed") - w = self.model_tensors[name] - scale = self.model_tensors[base_name + "_scale"] - shape = self.model_tensors[base_name + "_shape"] - zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None) - new_tensors[base_name] = ( - lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed( - w(), scale(), shape(), zero_point(), num_bits, group_size, - ) - ) - tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")] - if (base_name + "_zero_point") in self.model_tensors: - tensors_to_remove.append(base_name + "_zero_point") - else: - raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported") - elif quant_method == "modelopt": - # Mixed-precision ModelOpt models: NVFP4 tensors are handled by - # _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and - # are dequantized here. k/v scale tensors are unused. - for name in self.model_tensors.keys(): - if name.endswith(".weight_scale"): - weight_name = name.removesuffix("_scale") - w = self.model_tensors[weight_name] - s = self.model_tensors[name] - self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None) - tensors_to_remove.append(name) - if name.endswith((".input_scale", ".k_scale", ".v_scale")): - tensors_to_remove.append(name) - elif quant_method is not None: - raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") - - for name in tensors_to_remove: - if name in self.model_tensors: - del self.model_tensors[name] - - for name, value in new_tensors.items(): - self.model_tensors[name] = value - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith("e_score_correction_bias"): - name = name.replace("e_score_correction_bias", "e_score_correction.bias") - - if "language_model." in name: - name = name.replace("language_model.", "") - - return name, gen - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, gen in self.model_tensors.items(): - yield name, gen() - - def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") - name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in name: - assert bid is not None - name = name.format(bid=bid) - return name + suffix - - def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - return False - key_name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in key_name: - if bid is None: - return False - key_name = key_name.format(bid=bid) - else: - if bid is not None: - return False - return name == (key_name + suffix) - - def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: - new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) - if new_name is None: - raise ValueError(f"Can not map tensor {name!r}") - return new_name - - def set_gguf_parameters(self): - raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - # Handle gate/up expert tensor fusion if enabled - if self.fuse_gate_up_exps and bid is not None: - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid): - self._gate_exp_buffer[bid] = data_torch - elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): - self._up_exp_buffer[bid] = data_torch - - # Check if both gate and up are buffered for this layer - if bid in self._gate_exp_buffer and bid in self._up_exp_buffer: - gate_data = self._gate_exp_buffer.pop(bid) - up_data = self._up_exp_buffer.pop(bid) - # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd) - fused_data = torch.cat([gate_data, up_data], dim=1) - fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid) - logger.info(f"Fused gate_exps and up_exps for layer {bid}") - return [(fused_name, fused_data)] - - # If we buffered a gate/up tensor, wait for the other - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \ - self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): - return [] - - return [(new_name, data_torch)] - - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid, n_dims # unused - - return False - - # some models need extra generated tensors (like rope_freqs) - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - return () - - @staticmethod - def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]: - """Repack NVFP4 ModelOpt tensors into ggml super-block layout. - Preserves original E4M3 scale bits as UE4M3 (strip sign bit). - The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul(). - Returns (raw_data, logical_shape).""" - - out_features = weight.shape[0] - n_blocks = scale.shape[1] - - # Unpack ModelOpt nibble-packed weights - w = weight.reshape(out_features, n_blocks, 8) - vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16) - - # Preserve original E4M3 scale bits as UE4M3 (strip sign bit) - d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F - qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy() - - # Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements - n_super = n_blocks // 4 - d_grouped = d_ue.reshape(out_features, n_super, 4) - qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32) - raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36) - return raw, [out_features, n_super * 64] - - def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): - new_name = self.map_tensor_name(name) - - raw, shape = self._nvfp4_pack(weight, scale) - logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4") - self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4) - - self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2) - self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale) - - def _generate_nvfp4_tensors(self): - # Per-layer expert merging to avoid holding all experts in memory - expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {} - expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} - expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {} - expert_shapes: dict[tuple[int, str], list[int]] = {} - n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 - consumed: list[str] = [] - - for name in self.model_tensors.keys(): - if not name.endswith(".weight"): - continue - scale_name = name.replace(".weight", ".weight_scale") - scale2_name = name.replace(".weight", ".weight_scale_2") - input_scale_name = name.replace(".weight", ".input_scale") - if scale_name not in self.model_tensors: - continue - # Force eager materialization of lazy tensors - weight = LazyTorchTensor.to_eager(self.model_tensors[name]()) - scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]()) - - # Skip non-NVFP4 tensors (e.g. FP8 with per-channel 1D scales) - if scale.ndim < 2: - continue - - scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))()) - input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))()) - - # Mark tensors for removal from model_tensors (already written to gguf) - consumed.extend([name, scale_name]) - if scale2_name in self.model_tensors: - consumed.append(scale2_name) - if input_scale_name in self.model_tensors: - consumed.append(input_scale_name) - - # Check if this is a per-expert tensor - m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name) - if m: - expert_id = int(m.group(1)) - proj_type = m.group(2) - bid_m = re.search(r'\.layers\.(\d+)\.', name) - bid = int(bid_m.group(1)) if bid_m else 0 - key = (bid, proj_type) - - raw, shape = self._nvfp4_pack(weight, scale) - - if key not in expert_blocks: - expert_blocks[key] = [] - expert_scales[key] = [] - expert_input_scales[key] = [] - expert_shapes[key] = shape - expert_blocks[key].append((expert_id, raw.copy())) - # Collect per-expert scale2 (scalar per expert) - expert_scales[key].append((expert_id, float(scale2.float().sum()))) - # Collect per-expert input_scale (scalar per expert) - expert_input_scales[key].append((expert_id, float(input_scale.float().sum()))) - - # Flush when all experts for this (layer, proj) are collected - if n_experts > 0 and len(expert_blocks[key]) >= n_experts: - self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) - else: - self._repack_nvfp4(name, weight, scale, scale2, input_scale) - - # Flush any remaining experts (fallback if n_experts was unknown) - for bid, proj_type in list(expert_blocks.keys()): - self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type) - - # Remove consumed tensors so get_tensors/modify_tensors won't see them - for name in consumed: - self.model_tensors.pop(name, None) - - # Remove any remaining unused auxiliary tensors - for name in list(self.model_tensors.keys()): - if name.endswith((".k_scale", ".v_scale")): - del self.model_tensors[name] - - def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type): - experts = expert_blocks.pop(key) - scales = expert_scales.pop(key) - input_scales = expert_input_scales.pop(key) - shape = expert_shapes.pop(key) - - experts.sort(key=lambda x: x[0]) - merged = np.stack([e[1] for e in experts], axis=0) - merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight" - new_name = self.map_tensor_name(merged_name) - logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4") - self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4) - - scales.sort(key=lambda x: x[0]) - self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales]) - - input_scales.sort(key=lambda x: x[0]) - self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales]) - - del experts, merged - - def prepare_tensors(self): - # detect NVFP4 quantization (ModelOpt format) - quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") - quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method") - quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} - quant_config_file = self.dir_model / "hf_quant_config.json" - - if (not quant_algo or not quant_layers) and quant_config_file.is_file(): - with open(quant_config_file, "r", encoding="utf-8") as f: - hf_quant_config = json.load(f) - quant_config = hf_quant_config.get("quantization") or {} - producer = hf_quant_config.get("producer") or {} - producer_name = (producer.get("name") or "").lower() - if quant_method is None: - self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name - quant_algo = quant_config.get("quant_algo", quant_algo) - quant_layers = quant_config.get("quantized_layers", quant_layers) or {} - - # Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with - # per-layer NVFP4/FP8) instead of a single global "NVFP4" value. - if quant_algo != "NVFP4": - if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)): - quant_algo = "NVFP4" - - self._is_nvfp4 = quant_algo == "NVFP4" - self._is_mxfp4 = quant_method == "mxfp4" - - # NVFP4 weights are repacked and written directly to gguf_writer. - # This must run before dequant_model so NVFP4 tensors are removed - # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant. - if self._is_nvfp4: - self._generate_nvfp4_tensors() - - self.dequant_model() - - # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) - if self.tensor_map.mapping: - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - else: - max_name_len = len("vision_encoder.weight,") # Default reasonable length - - for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # use the first number-like part of the tensor name as the block id - bid = None - for part in name.split("."): - if part.isdecimal(): - bid = int(part) - break - - for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - # TODO: why do we squeeze here? - # data = data_torch.squeeze().numpy() - data = data_torch.numpy() - - n_dims = len(data.shape) - data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) - - # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - if n_dims <= 1 or new_name.endswith("_norm.weight"): - data_qtype = gguf.GGMLQuantizationType.F32 - - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - # Some tensor types are always in float32 - if data_qtype is False and ( - any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - gguf.MODEL_TENSOR.SSM_CONV1D, - gguf.MODEL_TENSOR.SHORTCONV_CONV, - gguf.MODEL_TENSOR.TIME_MIX_FIRST, - gguf.MODEL_TENSOR.TIME_MIX_W1, - gguf.MODEL_TENSOR.TIME_MIX_W2, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, - gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, - gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, - gguf.MODEL_TENSOR.POSNET_NORM1, - gguf.MODEL_TENSOR.POSNET_NORM2, - gguf.MODEL_TENSOR.V_ENC_EMBD_POS, - gguf.MODEL_TENSOR.A_ENC_EMBD_POS, - gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, - gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, - # Kimi KDA conv weights should be F32 - gguf.MODEL_TENSOR.SSM_CONV1D_Q, - gguf.MODEL_TENSOR.SSM_CONV1D_K, - gguf.MODEL_TENSOR.SSM_CONV1D_V, - ) - ) - or new_name[-7:] not in (".weight", ".lora_a", ".lora_b") - ): - data_qtype = gguf.GGMLQuantizationType.F32 - - if data_qtype is False and any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.TOKEN_EMBD, - gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, - gguf.MODEL_TENSOR.OUTPUT, - gguf.MODEL_TENSOR.ALTUP_ROUTER, - gguf.MODEL_TENSOR.LAUREL_L, - gguf.MODEL_TENSOR.LAUREL_R, - ) - ): - if self.ftype in ( - gguf.LlamaFileType.MOSTLY_TQ1_0, - gguf.LlamaFileType.MOSTLY_TQ2_0, - ): - # TODO: use Q4_K and Q6_K - data_qtype = gguf.GGMLQuantizationType.F16 - - # No override (data_qtype is False), or wants to be quantized (data_qtype is True) - if isinstance(data_qtype, bool): - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: - data_qtype = gguf.GGMLQuantizationType.TQ1_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: - data_qtype = gguf.GGMLQuantizationType.TQ2_0 - else: - raise ValueError(f"Unknown file type: {self.ftype.name}") - - try: - data = gguf.quants.quantize(data, data_qtype) - except gguf.QuantError as e: - logger.warning("%s, %s", e, "falling back to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" - - # n_dims is implicit in the shape - logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - - def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.MODEL) - - def prepare_metadata(self, vocab_only: bool): - - total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() - - self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) - - # If we are using HF model id, set the metadata name to the model id - if self.remote_hf_model_id: - self.metadata.name = self.remote_hf_model_id - - # Fallback to model directory name if metadata name is still missing - if self.metadata.name is None: - self.metadata.name = self.dir_model.name - - if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): - if self._is_nvfp4: - self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 - elif self._is_mxfp4: - self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE - - # Generate parameter weight class (useful for leader boards) if not yet determined - if self.metadata.size_label is None and total_params > 0: - self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) - - self.set_type() - - logger.info("Set meta model") - self.metadata.set_gguf_meta_model(self.gguf_writer) - - logger.info("Set model parameters") - self.set_gguf_parameters() - - logger.info("Set model quantization version") - self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - - def write_vocab(self): - raise NotImplementedError("write_vocab() must be implemented in subclasses") - - def write(self): - self.prepare_tensors() - self.prepare_metadata(vocab_only=False) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - - @staticmethod - def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.startswith(prefix) and filename.endswith(suffix): - part_names.append(filename) - - part_names.sort() - - return part_names - - @staticmethod - def load_hparams(dir_model: Path, is_mistral_format: bool): - if is_mistral_format: - with open(dir_model / "params.json", "r", encoding="utf-8") as f: - config = json.load(f) - return config - - try: - # for security reason, we don't allow loading remote code by default - # if a model need remote code, we will fallback to config.json - config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() - except Exception as e: - logger.warning(f"Failed to load model config from {dir_model}: {e}") - logger.warning("Trying to load config.json instead") - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - config = json.load(f) - if "llm_config" in config: - # rename for InternVL - config["text_config"] = config["llm_config"] - if "lm_config" in config: - # rename for GlmASR - config["text_config"] = config["lm_config"] - if "thinker_config" in config: - # rename for Qwen2.5-Omni - config["text_config"] = config["thinker_config"]["text_config"] - if "language_config" in config: - # rename for DeepSeekOCR - config["text_config"] = config["language_config"] - if "lfm" in config: - # rename for LFM2-Audio - config["text_config"] = config["lfm"] - return config - - @classmethod - def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: - assert names - - def func(modelcls: AnyModel) -> AnyModel: - model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT - for name in names: - cls._model_classes[model_type][name] = modelcls - return modelcls - return func - - @classmethod - def print_registered_models(cls): - for model_type, model_classes in cls._model_classes.items(): - logger.error(f"{model_type.name} models:") - for name in sorted(model_classes.keys()): - logger.error(f" - {name}") - - @classmethod - def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: - try: - return cls._model_classes[model_type][arch] - except KeyError: - raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - - -class TextModel(ModelBase): - model_type = ModelType.TEXT - hf_arch: str - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if not self.is_mistral_format: - self.hf_arch = get_model_architecture(self.hparams, self.model_type) - else: - self.hf_arch = "" - - if "text_config" in self.hparams: - # move the text_config to the root level - self.hparams = {**self.hparams, **self.hparams["text_config"]} - - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} - - rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True) - local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True) - - # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters - if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: - if local_rope_theta is not None: - self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} - if "rope_theta" not in self.rope_parameters and rope_theta is not None: - self.rope_parameters["rope_theta"] = rope_theta - if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: - self.rope_parameters["rope_type"] = rope_type - - @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip multimodal tensors - if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \ - or "visual." in name or "vision." in name or "audio." in name or "talker." in name \ - or "vision_" in name or "audio_" in name or "sam_model" in name \ - or "token2wav." in name or "code2wav." in name \ - or "projector." in name or "pre_mm_projector_norm" in name \ - or "image_newline" in name or "view_seperator" in name \ - or "patch_embed" in name or "patch_embedding" in name \ - or "patch_merger." in name or "model.connector." in name: - return None - - return super().filter_tensors(item) - - def set_vocab(self): - self._set_vocab_gpt2() - - def prepare_metadata(self, vocab_only: bool): - super().prepare_metadata(vocab_only=vocab_only) - - total_params = self.gguf_writer.get_total_parameter_count()[0] - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' - output_type: str = self.ftype.name.partition("_")[2] - - # Filename Output - if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata - if not vocab_only: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) - else: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") - - # Use the default filename - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - - # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - - logger.info("Set model tokenizer") - self.set_vocab() - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if self.hparams.get("is_causal") is False: - self.gguf_writer.add_causal_attention(False) - logger.info("gguf: causal attention = False") - - # TODO: Handle "sliding_attention" similarly when models start implementing it - rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) - if (rope_type := rope_params.get("rope_type")) is not None: - rope_factor = rope_params.get("factor") - rope_gguf_type = gguf.RopeScalingType.NONE - if rope_type == "linear" and rope_factor is not None: - rope_gguf_type = gguf.RopeScalingType.LINEAR - self.gguf_writer.add_rope_scaling_type(rope_gguf_type) - self.gguf_writer.add_rope_scaling_factor(rope_factor) - elif rope_type == "yarn" and rope_factor is not None: - rope_gguf_type = gguf.RopeScalingType.YARN - self.gguf_writer.add_rope_scaling_type(rope_gguf_type) - self.gguf_writer.add_rope_scaling_factor(rope_factor) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) - if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None: - self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor) - if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None: - self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor) - if (yarn_beta_fast := rope_params.get("beta_fast")) is not None: - self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast) - if (yarn_beta_slow := rope_params.get("beta_slow")) is not None: - self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow) - # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) - elif rope_type == "su" or rope_type == "longrope": - rope_gguf_type = gguf.RopeScalingType.LONGROPE - self.gguf_writer.add_rope_scaling_type(rope_gguf_type) - elif rope_type == "dynamic": - # HunYuan, handled in model class - pass - elif rope_type.lower() == "llama3": - # Handled in generate_extra_tensors - pass - else: - logger.warning(f"Unknown RoPE type: {rope_type}") - logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") - - if "mrope_section" in self.rope_parameters: - mrope_section = self.rope_parameters["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - logger.info(f"gguf: mrope sections: {mrope_section[:4]}") - - if (rope_theta := rope_params.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) - logger.info(f"gguf: rope theta swa = {local_rope_theta}") - if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - if (n_expert_groups := self.hparams.get("n_group")) is not None: - self.gguf_writer.add_expert_group_count(n_expert_groups) - logger.info(f"gguf: expert groups count = {n_expert_groups}") - if (n_group_used := self.hparams.get("topk_group")) is not None: - self.gguf_writer.add_expert_group_used_count(n_group_used) - logger.info(f"gguf: expert groups used count = {n_group_used}") - - if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None: - if score_func == "sigmoid": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - elif score_func == "softmax": - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - raise ValueError(f"Unsupported expert score gating function value: {score_func}") - logger.info(f"gguf: expert score gating function = {score_func}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def write_vocab(self): - if len(self.gguf_writer.tensors) != 1: - raise ValueError('Splitting the vocabulary is not supported') - - self.prepare_metadata(vocab_only=True) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - def does_token_look_special(self, token: str | bytes) -> bool: - if isinstance(token, (bytes, bytearray)): - token_text = token.decode(encoding="utf-8") - elif isinstance(token, memoryview): - token_text = token.tobytes().decode(encoding="utf-8") - else: - token_text = token - - # Some models mark some added tokens which ought to be control tokens as not special. - # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) - seems_special = token_text in ( - "<pad>", # deepseek-coder - "<mask>", "<2mass>", "[@BOS@]", # gemma{,-2} - ) - - seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>")) - seems_special = seems_special or (token_text.startswith("<๏ฝœ") and token_text.endswith("๏ฝœ>")) # deepseek-coder - - # TODO: should these be marked as UNUSED instead? (maybe not) - seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">")) # gemma{,-2} - - return seems_special - - def load_hf_tokenizer(self, *, trust_remote_code: bool = False): - from transformers import AutoTokenizer, PreTrainedTokenizerFast - - try: - return AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=trust_remote_code) - except ValueError as exc: - tokenizer_config_path = self.dir_model / "tokenizer_config.json" - tokenizer_class = None - - if tokenizer_config_path.is_file(): - with open(tokenizer_config_path, encoding="utf-8") as f: - tokenizer_class = json.load(f).get("tokenizer_class") - - if tokenizer_class != "TokenizersBackend": - raise - - tokenizer_json_path = self.dir_model / "tokenizer.json" - tokenizer_config = {} - if tokenizer_config_path.is_file(): - with open(tokenizer_config_path, encoding="utf-8") as f: - tokenizer_config = json.load(f) - - if tokenizer_json_path.is_file(): - logger.warning( - "AutoTokenizer failed for %s with tokenizer_class=%s; " - "falling back to tokenizer.json", - self.dir_model, - tokenizer_class, - ) - return PreTrainedTokenizerFast( - tokenizer_file=str(tokenizer_json_path), - bos_token=tokenizer_config.get("bos_token"), - eos_token=tokenizer_config.get("eos_token"), - unk_token=tokenizer_config.get("unk_token"), - pad_token=tokenizer_config.get("pad_token"), - clean_up_tokenization_spaces=tokenizer_config.get("clean_up_tokenization_spaces", False), - split_special_tokens=tokenizer_config.get("split_special_tokens", False), - ) - - logger.warning( - "AutoTokenizer failed for %s with tokenizer_class=%s; " - "falling back to AutoProcessor.tokenizer", - self.dir_model, - tokenizer_class, - ) - - from transformers import AutoProcessor - - processor = AutoProcessor.from_pretrained( - self.dir_model, - trust_remote_code=trust_remote_code, - ) - tokenizer = getattr(processor, "tokenizer", None) - if tokenizer is None: - raise ValueError("AutoProcessor fallback did not provide a tokenizer") from exc - - return tokenizer - - # used for GPT-2 BPE and WordPiece vocabs - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - tokenizer = self.load_hf_tokenizer() - vocab = tokenizer.get_vocab() - vocab_size = self.hparams.get("vocab_size", len(vocab)) - assert max(vocab.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - # NOTE: this was added for Gemma. - # Encoding and decoding the tokens above isn't sufficient for this case. - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - return tokens, toktypes, tokpre - - # NOTE: this function is generated by convert_hf_to_gguf_update.py - # do not modify it manually! - # ref: https://github.com/ggml-org/llama.cpp/pull/6920 - # Marker: Start get_vocab_base_pre - def get_vocab_base_pre(self, tokenizer) -> str: - # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that - # is specific for the BPE pre-tokenizer used by the model - # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can - # use in llama.cpp to implement the same pre-tokenizer - - chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n๐Ÿš€ (normal) ๐Ÿ˜ถ\u200d๐ŸŒซ๏ธ (multiple emojis concatenated) โœ… ๐Ÿฆ™๐Ÿฆ™ 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 แž€แžถแž“แŸ‹แžแŸ‚แž–แžทแžŸแŸแžŸแžขแžถแž…๐Ÿ˜ ?ๆˆ‘ๆƒณๅœจappleๅทฅไฝœ1314151ๅคฉ๏ฝž ------======= ะฝะตั‰ะพ ะฝะฐ ะ‘ัŠะปะณะฐั€ัะบะธ \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' - - chktok = tokenizer.encode(chktxt) - chkhsh = sha256(str(chktok).encode()).hexdigest() - - logger.debug(f"chktok: {chktok}") - logger.debug(f"chkhsh: {chkhsh}") - - res = None - - # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script - # or pull the latest version of the model from Huggingface - # don't edit the hashes manually! - if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": - # ref: https://huggingface.co/THUDM/glm-4-9b-chat - res = "chatglm-bpe" - if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": - # ref: https://huggingface.co/THUDM/glm-4-9b-chat - res = "chatglm-bpe" - if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": - # ref: https://huggingface.co/THUDM/glm-4-9b-hf - res = "glm4" - if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902": - # ref: https://huggingface.co/zai-org/GLM-4.5-Air - res = "glm4" - if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267": - # ref: https://huggingface.co/zai-org/GLM-4.7-Flash - res = "glm4" - if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": - # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 - res = "minerva-7b" - if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664": - # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct - res = "hunyuan" - if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6": - # ref: https://huggingface.co/tencent/Hunyuan-4B-Instruct - res = "hunyuan-dense" - if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6": - # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base - res = "falcon-h1" - if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": - # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base - res = "falcon-h1" - if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896": - # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base - res = "falcon-h1" - if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": - # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base - res = "falcon-h1" - if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890": - # ref: https://huggingface.co/moonshotai/Kimi-K2-Base - res = "kimi-k2" - if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": - # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B - res = "qwen2" - if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f": - # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6 - res = "qwen35" - if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273": - # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer - res = "grok-2" - if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df": - # ref: https://huggingface.co/aari1995/German_Semantic_V3 - res = "jina-v2-de" - if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4": - # ref: https://huggingface.co/evilfreelancer/ruGPT3XL - res = "gpt-2" - if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B - res = "llama-bpe" - if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base - res = "deepseek-llm" - if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base - res = "deepseek-coder" - if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - # ref: https://huggingface.co/tiiuae/falcon-7b - res = "falcon" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 - res = "bert-bge" - if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": - # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base - res = "falcon3" - if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": - # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 - res = "bert-bge-large" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/mosaicml/mpt-7b - res = "mpt" - if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - # ref: https://huggingface.co/bigcode/starcoder2-3b - res = "starcoder" - if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - # ref: https://huggingface.co/openai-community/gpt2 - res = "gpt-2" - if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b - res = "stablelm2" - if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - # ref: https://huggingface.co/smallcloudai/Refact-1_6-base - res = "refact" - if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 - res = "command-r" - if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1": - # ref: https://huggingface.co/CohereLabs/tiny-aya-base - res = "tiny_aya" - if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - # ref: https://huggingface.co/Qwen/Qwen1.5-7B - res = "qwen2" - if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf - res = "olmo" - if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - # ref: https://huggingface.co/databricks/dbrx-base - res = "dbrx" - if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": - # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - res = "jina-v1-en" - if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en - res = "jina-v2-en" - if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es - res = "jina-v2-es" - if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de - res = "jina-v2-de" - if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d": - # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano - res = "jina-v5-nano" - if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct - res = "smaug-bpe" - if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": - # ref: https://huggingface.co/LumiOpen/Poro-34B-chat - res = "poro-chat" - if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": - # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code - res = "jina-v2-code" - if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": - # ref: https://huggingface.co/LumiOpen/Viking-7B - res = "viking" - if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": - # ref: https://huggingface.co/core42/jais-13b - res = "jais" - if chkhsh == "bc5108ee1eb6a3d600cadd065f63190fbd0554dbc9e4bbd6a0d977970afc8d2a": - # ref: https://huggingface.co/inceptionai/Jais-2-8B-Chat - res = "jais-2" - if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": - # ref: https://huggingface.co/WisdomShell/CodeShell-7B - res = "codeshell" - if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": - # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 - res = "tekken" - if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": - # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M - res = "smollm" - if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": - # ref: https://huggingface.co/bigscience/bloom - res = "bloom" - if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": - # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small - res = "gpt3-finnish" - if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": - # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct - res = "exaone" - if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": - # ref: https://huggingface.co/microsoft/phi-2 - res = "phi-2" - if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": - # ref: https://huggingface.co/facebook/chameleon-7b - res = "chameleon" - if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": - # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base - res = "roberta-bpe" - if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": - # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct - res = "gigachat" - if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": - # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct - res = "megrez" - if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": - # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 - res = "deepseek-v3" - if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": - # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B - res = "deepseek-r1-qwen" - if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": - # ref: https://huggingface.co/Xenova/gpt-4o - res = "gpt-4o" - if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f": - # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k - res = "superbpe" - if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15": - # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview - res = "trillion" - if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224": - # ref: https://huggingface.co/inclusionAI/Ling-lite - res = "bailingmoe" - if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": - # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct - res = "llama4" - if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": - # ref: https://huggingface.co/mistral-community/pixtral-12b - res = "pixtral" - if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": - # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base - res = "seed-coder" - if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf": - # ref: https://huggingface.co/skt/A.X-4.0 - res = "a.x-4.0" - if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4": - # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct - res = "midm-2.0" - if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51": - # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer - res = "lfm2" - if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb": - # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B - res = "exaone4" - if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": - # ref: https://huggingface.co/JetBrains/Mellum-4b-base - res = "mellum" - if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152": - # ref: https://huggingface.co/answerdotai/ModernBERT-base - res = "modern-bert" - if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df": - # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer - res = "afmoe" - if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": - # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0 - res = "bailingmoe2" - if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e": - # ref: https://huggingface.co/ibm-granite/granite-docling-258M - res = "granite-docling" - if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95": - # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2 - res = "minimax-m2" - if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665": - # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer - res = "kormo" - if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": - # ref: https://huggingface.co/tencent/Youtu-LLM-2B - res = "youtu" - if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91": - # ref: https://huggingface.co/upstage/Solar-Open-100B - res = "solar-open" - if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f": - # ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B - res = "exaone-moe" - if chkhsh == "d30d75d9059f1aa2c19359de71047b3ae408c70875e8a3ccf8c5fba56c9d8af4": - # ref: https://huggingface.co/Qwen/Qwen3.5-9B-Instruct - res = "qwen35" - if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d": - # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash - res = "joyai-llm" - if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": - # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 - res = "kanana2" - if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015": - # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B - res = "f2llmv2" - if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57": - # ref: https://huggingface.co/sarvamai/sarvam-30b - res = "sarvam-moe" - - if res is None: - logger.warning("\n") - logger.warning("**************************************************************************************") - logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") - logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") - logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") - logger.warning("**") - logger.warning(f"** chkhsh: {chkhsh}") - logger.warning("**************************************************************************************") - logger.warning("\n") - raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - - logger.debug(f"tokenizer.ggml.pre: {repr(res)}") - logger.debug(f"chkhsh: {chkhsh}") - - return res - # Marker: End get_vocab_base_pre - - def _set_vocab_none(self) -> None: - self.gguf_writer.add_tokenizer_model("none") - - def _set_vocab_gpt2(self) -> None: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_qwen(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] - - tokpre = self.get_vocab_base_pre(tokenizer) - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined - added_vocab = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()} - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - # only add special tokens when they were not already loaded from config.json - if len(special_vocab.special_token_ids) == 0: - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_sentencepiece(self, add_to_gguf=True): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _create_vocab_sentencepiece(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.find_hparam([ - "vocab_size_per_layer_input", # gemma3n - "vocab_size", - ], optional=True) or tokenizer.vocab_size() - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - if token_id >= vocab_size: - logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') - break - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, token_data in added_tokens_decoder.items(): - token_id = int(token_id) - token: str = token_data["content"] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token.encode("utf-8"): - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') - if token_data.get("special") or self.does_token_look_special(token): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - scores[token_id] = -1000.0 - tokens[token_id] = token.encode("utf-8") - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - return tokens, scores, toktypes - - def _set_vocab_llama_hf(self): - vocab = gguf.LlamaHfVocab(self.dir_model) - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_rwkv_world(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = ['<s>'.encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: - lines = f.readlines() - for line in lines: - parts = line.split(' ') - assert len(parts) >= 3 - token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - if special_vocab.chat_template is None: - template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja" - if template_path.is_file(): - with open(template_path, "r", encoding="utf-8") as f: - template = f.read() - else: - template = "rwkv-world" - special_vocab.chat_template = template - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - # hack: Override these as they have already been set (incorrectly) - special_vocab.special_token_ids["bos"] = 0 - special_vocab.special_token_ids["eos"] = 0 - - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): - tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" - logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") - vocab_reader = gguf.GGUFReader(tokenizer_path, "r") - - default_pre = "mpt" if model_name == "gpt-neox" else "default" - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) - assert field # tokenizer model - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) - self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field # token list - self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) - - if model_name == "llama-spm": - field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) - assert field # token scores - self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field # token types - self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) - - if model_name != "llama-spm": - field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field # token merges - self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) - - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None: - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None: - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None: - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None: - self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None: - self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0]) - if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None: - self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) - - def _try_set_pooling_type(self) -> None: - # get pooling path - pooling_path = None - module_path = self.dir_model / "modules.json" - if module_path.is_file(): - with open(module_path, encoding="utf-8") as f: - modules = json.load(f) - for mod in modules: - if mod["type"].endswith("Pooling"): - pooling_path = mod["path"] - break - - mode_mapping = { - "mean": gguf.PoolingType.MEAN, - "cls": gguf.PoolingType.CLS, - "lasttoken": gguf.PoolingType.LAST, - } - - # get pooling type - if pooling_path is not None: - with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: - pooling = json.load(f) - if pooling.get("pooling_mode_mean_tokens"): - pooling_type = gguf.PoolingType.MEAN - elif pooling.get("pooling_mode_cls_token"): - pooling_type = gguf.PoolingType.CLS - elif pooling.get("pooling_mode_lasttoken"): - pooling_type = gguf.PoolingType.LAST - elif (pooling_mode := pooling.get("pooling_mode")) in mode_mapping: - pooling_type = mode_mapping[pooling_mode] - else: - raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported") - self.gguf_writer.add_pooling_type(pooling_type) - - def _set_vocab_glmedge(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_glm(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - # Special tokens - # Note: Using <|endoftext|> (151329) for eot causes endless generation - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # ty: ignore[unresolved-attribute] # 151331 - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] # 151336 - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] # 151329 - special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # ty: ignore[unresolved-attribute] # 151338 - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_interns1(self): - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab()) # ty: ignore[unresolved-attribute] - vocab_size = self.hparams.get("vocab_size", len(vocab)) - assert max(vocab.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab._set_special_token("bos", 151643) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_mistral(self): - if not _mistral_common_installed: - raise ImportError(_mistral_import_error_msg) - - vocab = MistralVocab(self.dir_model) - logger.info( - f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." - ) - - self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) - - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size, ( - f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" - ) - - if vocab.tokenizer_type == MistralTokenizerType.tekken: - self.gguf_writer.add_tokenizer_pre("tekken") - self.gguf_writer.add_token_merges( - vocab.extract_vocab_merges_from_model() - ) - - logger.info( - f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." - ) - - self.gguf_writer.add_bos_token_id(vocab.bos_id) - self.gguf_writer.add_eos_token_id(vocab.eos_id) - self.gguf_writer.add_unk_token_id(vocab.unk_id) - self.gguf_writer.add_pad_token_id(vocab.pad_id) - - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_vocab_size(vocab.vocab_size) - - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(False) - - local_template_file_path = self.dir_model / "chat_template.jinja" - - if self.is_mistral_format and local_template_file_path.is_file(): - # Ministral-3 and other new Mistral models come with chat templates. - # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main - logger.info("Using an existing Mistral local chat template.") - - with open(local_template_file_path, "r", encoding="utf-8") as f: - template = f.read() - elif not self.is_mistral_format or not self.disable_mistral_community_chat_template: - template_dir = Path(__file__).parent / "models/templates/" - - # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`. - if self.is_mistral_format: - logger.info( - "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. " - "Mistral recommends to use `mistral-common` to perform tokenization and detokenization." - ) - template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format) - else: - logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.") - template = None - - if template is not None: - self.gguf_writer.add_chat_template(template) - - def _set_vocab_plamo(self): - # PLaMo models use a custom tokenizer with a .jsonl file - tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl" - tokenizer_config_path = self.dir_model / "tokenizer_config.json" - - if not tokenizer_jsonl_path.is_file(): - raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}") - - # Load tokenizer config - with open(tokenizer_config_path, "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - - # Load tokens from JSONL file (actually a list format) - tokens = [] - scores = [] - toktypes = [] - - with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f: - for line_num, line in enumerate(f): - if line.strip(): - token_data = json.loads(line) - # Format: [token, score, type, ?, ?, ?, ?] - token = token_data[0].encode("utf-8") - score = float(token_data[1]) - token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL" - - tokens.append(token) - scores.append(score) - - if token_type_str == "UNKNOWN": - toktypes.append(gguf.TokenType.UNKNOWN) - elif token_type_str == "CONTROL": - toktypes.append(gguf.TokenType.CONTROL) - elif token_type_str == "BYTE": - toktypes.append(gguf.TokenType.BYTE) - else: - token_str = token_data[0] - if token_str.startswith("<|plamo:") and token_str.endswith("|>"): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - vocab_size = self.hparams["vocab_size"] - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("plamo2") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None: - token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8")) - self.gguf_writer.add_bos_token_id(token_id) - if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None: - token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8")) - self.gguf_writer.add_eos_token_id(token_id) - if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None: - token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8")) - self.gguf_writer.add_pad_token_id(token_id) - if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None: - token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8")) - self.gguf_writer.add_sep_token_id(token_id) - if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None: - token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8")) - self.gguf_writer.add_unk_token_id(token_id) - - # Add <|plamo:op|> as EOT to ensure appropriate end of generation - self.gguf_writer.add_eot_token_id(4) - - self.gguf_writer.add_add_space_prefix(False) - - -class MmprojModel(ModelBase): - model_type = ModelType.MMPROJ - model_arch = gguf.MODEL_ARCH.MMPROJ - preprocessor_config: dict[str, Any] - global_config: dict[str, Any] - - n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"] - - has_vision_encoder: bool = True # by default - has_audio_encoder: bool = False - - # for models having multiple encoders, we need to separate their hparams - hparams_vision: dict[str, Any] | None = None - hparams_audio: dict[str, Any] | None = None - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if self.model_arch != gguf.MODEL_ARCH.MMPROJ: - raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ") - - # get n_embd of the text model - if not self.is_mistral_format: - if "text_config" not in self.hparams: - self.hparams["text_config"] = {} - if "audio_config" not in self.hparams: - self.hparams["audio_config"] = {} - text_config = {**self.hparams, **self.hparams["text_config"]} - self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) - else: - text_config = { - k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"] - } - # mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size) - self.n_embd_text = text_config.get("dim", 0) - - assert self.n_embd_text > 0, "n_embd not found in hparams" - - # move vision config to the top level, while preserving the original hparams in global_config - import copy - self.global_config = copy.deepcopy(self.hparams) - self.hparams_vision = self.get_vision_config() - self.hparams_audio = self.get_audio_config() - - if self.hparams_vision is None and self.hparams_audio is None: - raise ValueError("vision_config / audio_config not found in hparams") - - # for compat with vision-only models - self.hparams = self.hparams_vision or self.hparams_audio or self.hparams - - # TODO @ngxson : this is a hack to support both vision and audio encoders - have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder - self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True) - self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) - - # load preprocessor config - self.preprocessor_config = {} - - # prefer preprocessor_config.json if possible - preprocessor_config_path = self.dir_model / "preprocessor_config.json" - if preprocessor_config_path.is_file(): - with open(preprocessor_config_path, "r", encoding="utf-8") as f: - cfg = json.load(f) - # move media_proc_cfg to root level for compat - if "media_proc_cfg" in cfg: - cfg = { - **cfg, - **cfg["media_proc_cfg"], - } - # merge configs - self.preprocessor_config = {**self.preprocessor_config, **cfg} - - # prefer processor_config.json if possible - processor_config_path = self.dir_model / "processor_config.json" - if processor_config_path.is_file(): - with open(processor_config_path, "r", encoding="utf-8") as f: - cfg = json.load(f) - # move image_processor to root level for compat - if "image_processor" in cfg: - cfg = { - **cfg, - **cfg["image_processor"], - } - # merge configs - self.preprocessor_config = {**self.preprocessor_config, **cfg} - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip non-multimodal tensors - if "language_model." in name: - return None - - return super().filter_tensors(item) - - def get_vision_config(self) -> dict[str, Any] | None: - config_name = "vision_config" if not self.is_mistral_format else "vision_encoder" - return self.global_config.get(config_name) - - def get_audio_config(self) -> dict[str, Any] | None: - mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config" - return self.global_config.get(mm_config_key) - - def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) - - def prepare_metadata(self, vocab_only: bool): - super().prepare_metadata(vocab_only=vocab_only) - - output_type: str = self.ftype.name.partition("_")[2] - - if self.fname_out.is_dir(): - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None) - self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf" - else: - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - - def set_gguf_parameters(self): - self.gguf_writer.add_file_type(self.ftype) - - if self.has_vision_encoder: - self.gguf_writer.add_clip_has_vision_encoder(True) - self.gguf_writer.add_vision_projection_dim(self.n_embd_text) - - # vision config - self.image_size = self.find_vparam(["image_size"]) - self.gguf_writer.add_vision_image_size(self.image_size) - self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) - self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"])) - self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"])) - self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) - self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"])) - - # preprocessor config - image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] - image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] - - self.gguf_writer.add_vision_image_mean(image_mean) - self.gguf_writer.add_vision_image_std(image_std) - - if self.has_audio_encoder: - self.gguf_writer.add_clip_has_audio_encoder(True) - self.gguf_writer.add_audio_projection_dim(self.n_embd_text) - - # audio config - self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"])) - self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"])) - self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys)) - self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"])) - - if not self.has_vision_encoder and not self.has_audio_encoder: - raise ValueError("MmprojModel must have either vision or audio encoder") - - def write_vocab(self): - raise ValueError("MmprojModel does not support vocab writing") - - def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any: - assert self.hparams_vision is not None - return self._find_param(self.hparams_vision, keys, optional) - - def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any: - assert self.hparams_audio is not None - return self._find_param(self.hparams_audio, keys, optional) - - def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in obj), None) - if key is not None: - return obj[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, name, n_dims # unused - if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name: - return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 - return False - - -@ModelBase.register("GPTNeoXForCausalLM") -class GPTNeoXModel(TextModel): - model_arch = gguf.MODEL_ARCH.GPTNEOX - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - assert n_head is not None - assert n_embed is not None - - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("BloomForCausalLM", "BloomModel") -class BloomModel(TextModel): - model_arch = gguf.MODEL_ARCH.BLOOM - - def set_gguf_parameters(self): - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - assert n_head is not None - assert n_embed is not None - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - assert n_head is not None - assert n_embed is not None - - name = re.sub(r'transformer\.', '', name) - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) - data_torch = torch.cat( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) - data_torch = torch.cat( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - dim=0, - ) - logger.info("re-format attention.linear_qkv.bias") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("MPTForCausalLM") -class MPTModel(TextModel): - model_arch = gguf.MODEL_ARCH.MPT - - def set_vocab(self): - try: - self._set_vocab_gpt2() - except Exception: - # Fallback for SEA-LION model - self._set_vocab_sentencepiece() - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_pad_token_id(3) - self.gguf_writer.add_eos_token_id(1) - self.gguf_writer.add_unk_token_id(0) - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - if self.hparams["attn_config"]["alibi"]: - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - else: - self.gguf_writer.add_max_alibi_bias(0.0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "scales" in name: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") - else: - new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - - yield from super().modify_tensors(data_torch, new_name, bid) - - -@ModelBase.register("OrionForCausalLM") -class OrionModel(TextModel): - model_arch = gguf.MODEL_ARCH.ORION - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - # note: config provides rms norm but it is actually layer norm - # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 - self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - - -@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(TextModel): - model_arch = gguf.MODEL_ARCH.BAICHUAN - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": - logger.info(f"Unpacking and permuting layer {bid}") - yield from [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), - self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), - self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), - self._reverse_hf_part(data_torch, 2)), - ] - else: - yield from self.modify_tensors(data_torch, self.map_tensor_name(name), bid) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] - - -@ModelBase.register("XverseForCausalLM") -class XverseModel(TextModel): - model_arch = gguf.MODEL_ARCH.XVERSE - - def set_vocab(self): - assert (self.dir_model / "tokenizer.json").is_file() - dir_model = self.dir_model - hparams = self.hparams - - tokens: list[bytes] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] - # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, - # because vocab_size is the count of items, and indexes start at 0. - max_vocab_index = max(tokenizer.get_vocab().values()) # ty: ignore[unresolved-attribute] - if max_vocab_index >= vocab_size: - raise ValueError("Vocabulary size exceeds expected maximum size.") - - reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - for token_id in range(vocab_size): - token_text = reverse_vocab[token_id].encode('utf-8') - # replace "\x00" to string with length > 0 - if token_text == b"\x00": - toktype = gguf.TokenType.BYTE # special - token_text = f"<{token_text}>".encode('utf-8') - elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE # special - elif reverse_vocab[token_id] in added_vocab: - if tokenizer.added_tokens_decoder[token_id].special: # ty: ignore[unresolved-attribute] - toktype = gguf.TokenType.CONTROL - else: - toktype = gguf.TokenType.USER_DEFINED - else: - toktype = gguf.TokenType.NORMAL - - tokens.append(token_text) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith("q_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) - if name.endswith("k_proj.weight"): - data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - - yield from super().modify_tensors(data_torch, name, bid) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@ModelBase.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(TextModel): - model_arch = gguf.MODEL_ARCH.FALCON - - def set_gguf_parameters(self): - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 - head_dim = self.hparams["hidden_size"] // n_head - - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GPTBigCodeForCausalLM") -class StarCoderModel(TextModel): - model_arch = gguf.MODEL_ARCH.STARCODER - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -@ModelBase.register("GPTRefactForCausalLM") -class RefactModel(TextModel): - model_arch = gguf.MODEL_ARCH.REFACT - - def set_vocab(self): - super().set_vocab() - - # TODO: how to determine special FIM tokens automatically? - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot']) - special_vocab._set_special_token("prefix", 1) - special_vocab._set_special_token("suffix", 3) - special_vocab._set_special_token("middle", 2) - special_vocab.chat_template = None # do not add it twice - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - # refact uses Alibi. So this is from config.json which might be used by training. - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - - if bid is not None: - if name == f"transformer.h.{bid}.attn.kv.weight": - yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) - yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) - return - if name == f"transformer.h.{bid}.attn.q.weight": - yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) - return - if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) - yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(TextModel): - model_arch = gguf.MODEL_ARCH.STABLELM - - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab - self._set_vocab_qwen() - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) - self.gguf_writer.add_file_type(self.ftype) - - _q_norms: list[dict[str, Tensor]] | None = None - _k_norms: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams["num_key_value_heads"] - - if name.find("q_layernorm.norms") != -1: - assert bid is not None - - if self._q_norms is None: - self._q_norms = [{} for _ in range(self.block_count)] - - self._q_norms[bid][name] = data_torch - - if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") - else: - return - - if name.find("k_layernorm.norms") != -1: - assert bid is not None - - if self._k_norms is None: - self._k_norms = [{} for _ in range(self.block_count)] - - self._k_norms[bid][name] = data_torch - - if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): - datas: list[Tensor] = [] - # extract the norms in order - for xid in range(n_head): - ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" - datas.append(norms[ename]) - del norms[ename] - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._q_norms is not None or self._k_norms is not None: - # flatten two `list[dict[str, Tensor]]` into a single `list[str]` - norms = ( - [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] - ) + ( - [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] - ) - if len(norms) > 0: - raise ValueError(f"Unprocessed norms: {norms}") - - -@ModelBase.register( - "LLaMAForCausalLM", - "LlamaForCausalLM", - "MistralForCausalLM", - "MixtralForCausalLM", - "VLlama3ForCausalLM", - "LlavaForConditionalGeneration", - "VoxtralForConditionalGeneration", - "IQuestCoderForCausalLM", - "LlamaModel") -class LlamaModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLAMA - undo_permute = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # fix for SmolVLM2, missing `num_attention_heads` in config.json - if self.hf_arch == "VLlama3ForCausalLM": - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) - # Mistral consolidated format has no config.json; origin_hf_arch is HF-only. - if self.is_mistral_format: - self.origin_hf_arch = None - else: - hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - def set_vocab(self): - if self.origin_hf_arch == "GlmasrModel": - return self._set_vocab_glmedge() - - if self.is_mistral_format: - return self._set_vocab_mistral() - - path_tekken_json = self.dir_model / "tekken.json" - path_tokenizer_json = self.dir_model / "tokenizer.json" - if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): - self._set_vocab_mistral() - - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() - - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) - if self.hparams.get("vocab_size", 32000) == 32016: - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] - ) - special_vocab._set_special_token("prefix", 32007) - special_vocab._set_special_token("suffix", 32008) - special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) - special_vocab.add_to_gguf(self.gguf_writer) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - if not self.is_mistral_format: - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): - # Mirror the BF16 Q/K RoPE permutation site in modify_tensors; the NVFP4 path bypasses it. - if self.undo_permute: - n_head = self.find_hparam(["n_heads", "num_attention_heads"], optional=True) - n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"], optional=True) - if n_head is not None: - if name.endswith("q_proj.weight"): - weight = LlamaModel.permute(weight, n_head, n_head) - scale = LlamaModel.permute(scale, n_head, n_head) - elif name.endswith("k_proj.weight"): - weight = LlamaModel.permute(weight, n_head, n_kv_head) - scale = LlamaModel.permute(scale, n_head, n_kv_head) - super()._repack_nvfp4(name, weight, scale, scale2, input_scale) - - _experts: list[dict[str, Tensor]] | None = None - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "text_model." in name: - name = name.replace("text_model.", "") # for SmolVLM - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.find_hparam(["n_heads", "num_attention_heads"]) - n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) - - if self.hf_arch == "LlamaModel": - name = "model." + name - - if self.undo_permute: - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = rope_params.get("rope_theta", 10000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 8.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4 - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("ArceeForCausalLM") -class ArceeModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.ARCEE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - -@ModelBase.register("AfmoeForCausalLM") -class AfmoeModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.AFMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # MoE parameters - if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None: - self.gguf_writer.add_expert_shared_count(n_shared_experts) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None: - self.gguf_writer.add_leading_dense_block_count(n_dense_layers) - - # Route normalization and scaling - if (route_norm := self.hparams.get("route_norm")) is not None: - self.gguf_writer.add_expert_weights_norm(route_norm) - if (route_scale := self.hparams.get("route_scale")) is not None: - self.gguf_writer.add_expert_weights_scale(route_scale) - - # Sliding window attention - if (sliding_window := self.hparams.get("sliding_window")) is not None: - self.gguf_writer.add_sliding_window(sliding_window) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".expert_bias"): - name = name.replace(".expert_bias", ".expert_bias.bias") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle expert weights - they're already merged in the HF format - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["gate_proj", "up_proj", "down_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename_to_retrieve]) - del self._experts[bid][ename_to_retrieve] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) - - return - else: - return - - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register( - "LlavaForConditionalGeneration", # pixtral - "Mistral3ForConditionalGeneration", # mistral small 3.1 -) -class LlavaVisionModel(MmprojModel): - img_break_tok_id = -1 - use_break_tok = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams.get("model_type") == "pixtral": - # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py - self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) - if self.use_break_tok: - self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") - elif self.is_mistral_format: - # hparams is already vision config here so norm_eps is only defined in global_config. - self.hparams["norm_eps"] = self.global_config.get("norm_eps", None) - assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json" - if self.use_break_tok: - self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) - - # params.json may ship -1 placeholders (Mistral Medium 3.5) - # resolve the real id from the bundled tokenizer in that case - if self.img_break_tok_id < 0: - self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]") - else: - raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") - logger.info(f"Image break token id: {self.img_break_tok_id}") - - def get_token_id(self, token: str) -> int: - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {} - for id_, token_data in added_tokens_decoder.items(): - if token_data.get("content") == token: - return int(id_) - # fallthrough to tokenizer.json - with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - for token_data in tokenizer_json["added_tokens"]: - if token_data["content"] == token: - return int(token_data["id"]) - raise ValueError(f"Token '{token}' not found in tokenizer config.") - - def get_mistral_token_id(self, token: str) -> int: - # mistral native format ships tekken.json or a versioned spm tokenizer - tekken_file = self.dir_model / "tekken.json" - if tekken_file.is_file(): - with open(tekken_file, "r", encoding="utf-8") as f: - data = json.load(f) - for entry in data.get("special_tokens", []): - if entry.get("token_str") == token: - return int(entry["rank"]) - tokenizer_json_file = self.dir_model / "tokenizer.json" - if tokenizer_json_file.is_file(): - with open(tokenizer_json_file, "r", encoding="utf-8") as f: - data = json.load(f) - for entry in data.get("added_tokens", []): - if entry.get("content") == token: - return int(entry["id"]) - raise ValueError(f"Token '{token}' not found in mistral tokenizer files.") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if hparams.get("model_type") == "pixtral": - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) - - # hidden_act - if hparams["hidden_act"] == "silu": - self.gguf_writer.add_vision_use_silu(True) - elif hparams["hidden_act"] == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - else: - raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") - - # spatial_merge_size - if "spatial_merge_size" in self.global_config: - self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = ( - self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) - ) - n_kv_head = n_head - - valid_prefixes = ( - "multi_modal_projector.", - "vision_tower.", - "vision_encoder.", - "vision_language_adapter.", - "patch_merger.", - "pre_mm_projector_norm", - ) - - if any(name.startswith(prefix) for prefix in valid_prefixes): - # process vision tensors - if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format: - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - yield from super().modify_tensors(data_torch, name, bid) - return - - embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" - if self.img_break_tok_id > 0 and embed_key in name: - logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") - # for pixtral model, we need to extract the [IMG_BREAK] token embedding - img_break_embd = data_torch[self.img_break_tok_id] - name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] - yield from super().modify_tensors(img_break_embd, name, bid) - - return # skip other tensors - - -@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") -class SmolVLMModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams["model_type"] == "smolvlm_vision": - # fix for SmolVLM2, missing some keys in config.json - # default values are taken from transformers code - self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) - self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) - self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) - self.gguf_writer.add_vision_use_gelu(True) - - # Add the preprocessor longest edge size - preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size) - self.gguf_writer.add_vision_preproc_image_size(preproc_image_size) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".embeddings." in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name - - if not is_vision_tensor: - return None - - return super().filter_tensors(item) - - -@ModelBase.register( - "Llama4ForConditionalGeneration", - "Llama4ForCausalLM", -) -class Llama4Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA4 - undo_permute = False - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this - self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] - self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) - if "layer_types" in self.hparams: - if all(lt == "full_attention" for lt in self.hparams["layer_types"]): - # all layers are full attention (for MobileLLM), disable swa - self.gguf_writer.add_sliding_window(0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - # split the gate_up into gate and up - if "gate_up_proj" in name: - name_up = name.replace("gate_up_proj", "up_proj.weight") - name_gate = name.replace("gate_up_proj", "gate_proj.weight") - dim_half = data_torch.shape[-1] // 2 - gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) - yield from super().modify_tensors(gate_proj_weight, name_gate, bid) - yield from super().modify_tensors(up_proj_weight, name_up, bid) - return - - if name.endswith("down_proj"): - name += ".weight" - data_torch = data_torch.transpose(-1, -2) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Llama4ForConditionalGeneration") -class Llama4VisionModel(MmprojModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"]) - self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"])) - assert self.hparams["hidden_act"] == "gelu" - self.gguf_writer.add_vision_use_gelu(True) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "multi_modal_projector" not in name and "vision_model" not in name: - return None - - if "positional_embedding_vlm" in name and ".weight" not in name: - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "multi_modal_projector.linear_1" in name: - # despite the name with number postfix, this is a single fully connected layer - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("DeciLMForCausalLM") -class DeciModel(TextModel): - model_arch = gguf.MODEL_ARCH.DECI - - @staticmethod - def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: - # DeciLM-specific code - intermediate_size = int(2 * ffn_mult * n_embd / 3) - return DeciModel._find_multiple(intermediate_size, 256) - - @staticmethod - def _find_multiple(n: int, k: int) -> int: - # DeciLM-specific code - if n % k == 0: - return n - return n + k - (n % k) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] - assert self.block_count == len(_block_configs) - self._num_kv_heads = list() - self._num_heads = list() - _ffn_multipliers = list() - # ***linear attention layer*** - # if n_heads_in_group is None and replace_with_linear is True - # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads - # ***attention-free layer*** - # if n_heads_in_group is None and replace_with_linear is False - # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 - # ***normal attention-layer*** - # if n_heads_in_group is not None, then - # _num_kv_heads[il] is num_attention_head // n_heads_in_group and - # _num_heads[il] is num_attention_head - # ***dummy layer*** for nemotron 253B - # if n_heads_in_group is None and ffn_mult is None - # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0 - for il in range(len(_block_configs)): - if _block_configs[il]["attention"]["n_heads_in_group"] is None: - if _block_configs[il]["attention"]["replace_with_linear"] is True: - self._num_kv_heads.append(0) - self._num_heads.append(self.hparams["num_attention_heads"]) - else: - self._num_kv_heads.append(0) - self._num_heads.append(0) - else: - self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) - self._num_heads.append(self.hparams["num_attention_heads"]) - if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer - _ffn_multipliers.append(0.0) - else: - _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(_ffn_multipliers) - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) - assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) - self._ffn_dims: list[int] = [ - DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) - for multiplier in _ffn_multipliers - ] - - def set_vocab(self): - # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's - # eos_token from '|eot_id|' to '|end_of_text|' - if self.hparams.get("vocab_size", 128256) == 128256: - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - else: - # DeciLM-7B - self._set_vocab_llama_hf() - - def set_gguf_parameters(self): - if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_heads) - assert self.block_count == len(self._ffn_dims) - if (rope_theta := self.rope_parameters.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - self.gguf_writer.add_head_count(self._num_heads) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_file_type(self.ftype) - else: # DeciLM-7B - super().set_gguf_parameters() - if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B - self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] - assert self.block_count == len(self._num_kv_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - if bid is not None: - if "num_key_value_heads_per_layer" in self.hparams: - n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] - elif "block_configs" in self.hparams: - n_kv_head = self._num_kv_heads[bid] - n_head = self._num_heads[bid] - else: - n_kv_head = self.hparams.get("num_key_value_heads") - else: - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = rope_params.get("rope_theta", 10000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 8.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - def prepare_tensors(self): - super().prepare_tensors() - - -@ModelBase.register("BitnetForCausalLM") -class BitnetModel(TextModel): - model_arch = gguf.MODEL_ARCH.BITNET - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - def weight_quant(self, weight: Tensor) -> Tensor: - dtype = weight.dtype - weight = weight.float() - scale = weight.abs().mean().clamp(min=1e-5) - iscale = 1 / scale - # TODO: multiply by the scale directly instead of inverting it twice - # (this is also unnecessarily doubly inverted upstream) - # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 - result = (weight * iscale).round().clamp(-1, 1) / iscale - return result.type(dtype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if any(self.match_model_tensor_name(new_name, key, bid) for key in [ - gguf.MODEL_TENSOR.ATTN_Q, - gguf.MODEL_TENSOR.ATTN_K, - gguf.MODEL_TENSOR.ATTN_V, - gguf.MODEL_TENSOR.ATTN_OUT, - gguf.MODEL_TENSOR.FFN_UP, - gguf.MODEL_TENSOR.FFN_DOWN, - gguf.MODEL_TENSOR.FFN_GATE, - ]): - # transform weight into 1/0/-1 (in fp32) - data_torch = self.weight_quant(data_torch) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") -class GrokModel(TextModel): - model_arch = gguf.MODEL_ARCH.GROK - - def set_vocab(self): - if (self.dir_model / 'tokenizer.model').is_file(): - self._set_vocab_sentencepiece() - return - - if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file(): - logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer') - sys.exit(1) - - self._set_vocab_gpt2() - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0)) - self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0)) - if (final_logit_softcap := self.hparams.get("final_logit_softcapping")): - self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) - - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - - # Treat "original" as "yarn", seems to have been a mistake - if self.hparams.get("rope_type") in ("yarn", "original"): - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"]) - self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"]) - self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"]) - self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"]) - - if temp_len := self.hparams.get("attn_temperature_len"): - self.gguf_writer.add_attn_temperature_length(temp_len) - - self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5)) - self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"]) - self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"]) - - _experts: list[dict[str, list[Tensor]]] | None = None - _cur_expert = "" - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - deferred: list[tuple[Tensor, str, int | None]] = [] - is_expert = ".moe." in name or ".block_sparse_moe.experts." in name - - if not is_expert: - deferred.append((data_torch, name, bid)) - - # process the experts separately - if is_expert or self._cur_expert: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - # concatenate split tensors - if name in self._experts[bid]: - self._cur_expert = name - self._experts[bid][name].append(data_torch) - return - elif is_expert: - self._cur_expert = name - self._experts[bid][name] = [data_torch] - return - else: - self._cur_expert = "" - - for bid in range(self.block_count): - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight" - if ename not in self._experts[bid]: - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight" - tensor_list = self._experts[bid][ename] - datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - - for t in deferred: - yield from super().modify_tensors(*t) - - -@ModelBase.register("DbrxForCausalLM") -class DbrxModel(TextModel): - model_arch = gguf.MODEL_ARCH.DBRX - - def set_gguf_parameters(self): - ffn_config = self.hparams["ffn_config"] - attn_config = self.hparams["attn_config"] - self.gguf_writer.add_block_count(self.block_count) - - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) - - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) - - self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) - - self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) - - self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) - - self.gguf_writer.add_layer_norm_eps(1e-5) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_expert = self.hparams["ffn_config"]["moe_num_experts"] - n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] - n_embd = self.hparams["d_model"] - - # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose - # original implementation expects (n_expert, n_ff, n_embd) for all experts weights - # But llama.cpp moe graph works differently - # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions - # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor - exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} - "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} - experts = False - - for exp_tensor_name in exp_tensor_names.keys(): - if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: - experts = True - data_torch = data_torch.view(n_expert, n_ff, n_embd) - if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: - data_torch = data_torch.permute(*permute_tensor) - break - - # map tensor names - # In MoE models the ffn tensors are typically most of the model weights, - # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. - # Every other model has the weight names ending in .weight, - # let's assume that is the convention which is not the case for dbrx: - # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 - new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) - - yield from super().modify_tensors(data_torch, new_name, bid) - - def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid # unused - - return n_dims > 1 - - -@ModelBase.register("MiniCPMForCausalLM") -class MiniCPMModel(TextModel): - model_arch = gguf.MODEL_ARCH.MINICPM - - def set_gguf_parameters(self): - super().set_gguf_parameters() - embedding_scale = float(self.hparams["scale_emb"]) - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") - residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 - self.gguf_writer.add_residual_scale(residual_scale) - logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") - logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] - self.gguf_writer.add_logit_scale(logit_scale) - logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(TextModel): - model_arch = gguf.MODEL_ARCH.MINICPM3 - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - rope_dims = self.hparams["qk_rope_head_dim"] - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - -@ModelBase.register("QWenLMHeadModel") -class QwenModel(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - self._set_vocab_qwen() - - -@ModelBase.register( - "Qwen2Model", - "Qwen2ForCausalLM", - "Qwen2AudioForConditionalGeneration", - "KORMoForCausalLM", - "AudioFlamingo3ForConditionalGeneration", - "DotsOCRForCausalLM", -) -class Qwen2Model(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except (FileNotFoundError, ModuleNotFoundError, ImportError): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.hf_arch == "Qwen2Model": - name = f"model.{name}" # map to Qwen2ForCausalLM tensors - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("DreamModel") -class DreamModel(TextModel): - model_arch = gguf.MODEL_ARCH.DREAM - - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] - vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) - assert max(vocab_dict.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - # Check if it's a special token - treat special tokens as CONTROL tokens - if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - return tokens, toktypes, tokpre - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - # Dream models use non-causal attention for diffusion - self.gguf_writer.add_causal_attention(False) - - # Add Dream-specific parameters - mask_token_id = self.hparams.get("mask_token_id") - if mask_token_id is not None: - self.gguf_writer.add_mask_token_id(mask_token_id) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Dream model tensors should be mapped directly since it's the base model - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("LLaDAModelLM") -class LLaDAModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLADA - undo_permute = True - - def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - vocab_dict = tokenizer.get_vocab() # ty: ignore[unresolved-attribute] - vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) - assert max(vocab_dict.values()) < vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - # Check if it's a special token - treat special tokens as CONTROL tokens - if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - return tokens, toktypes, tokpre - - def set_vocab(self): - self._set_vocab_gpt2() - - # LLaDA specific parameters - self.gguf_writer.add_add_bos_token(True) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self._try_set_pooling_type() - - # Add parameters similar to LlamaModel - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - n_heads = hparams.get("num_attention_heads", hparams.get("n_heads")) - assert n_heads is not None - rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads - self.gguf_writer.add_rope_dimension_count(rope_dim) - - # Set context length for LLaDA - context_length = self.hparams.get("max_sequence_length", 4096) - self.gguf_writer.add_context_length(context_length) - - # Set embedding length (dimension size) - embedding_length = self.hparams.get("d_model", 4096) - self.gguf_writer.add_embedding_length(embedding_length) - - # Set feed forward length (MLP hidden size) - feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) - self.gguf_writer.add_feed_forward_length(feed_forward_length) - - # LLaDA models use non-causal attention for diffusion, similar to Dream - self.gguf_writer.add_causal_attention(False) - - # LLaDA models don't shift their logits - self.gguf_writer.add_diffusion_shift_logits(False) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads")) - assert n_head is not None - n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads")) - - if self.undo_permute: - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LLaDAModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head) - - # LLaDA model tensors should be mapped directly since it's the base model - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM") -class Ernie4_5Model(TextModel): - model_arch = gguf.MODEL_ARCH.ERNIE4_5 - - def set_vocab(self): - self._set_vocab_sentencepiece() - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "ernie." in name: - name = name.replace("ernie.", "model.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_heads = self.hparams["num_attention_heads"] - num_kv_heads = self.hparams["num_key_value_heads"] - if (head_dim := self.hparams.get("head_dim")) is None: - head_dim = self.hparams["hidden_size"] // num_heads - - # split the qkv weights - # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size] - if "qkv_proj" in name: - name_q = name.replace("qkv_proj.weight", "q_proj.weight") - name_k = name.replace("qkv_proj.weight", "k_proj.weight") - name_v = name.replace("qkv_proj.weight", "v_proj.weight") - total_q_dim = num_heads * head_dim - total_k_dim = num_kv_heads * head_dim - total_v_dim = num_kv_heads * head_dim - q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0) - yield from super().modify_tensors(q_proj_weight, name_q, bid) - yield from super().modify_tensors(k_proj_weight, name_k, bid) - yield from super().modify_tensors(v_proj_weight, name_v, bid) - # split the up_gate_proj into gate and up - # up_gate_proj shape: [2 * intermediate_size, hidden_size] - elif "up_gate_proj" in name: - name_up = name.replace("up_gate_proj.weight", "up_proj.weight") - name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight") - dim_half = data_torch.shape[0] // 2 - gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0) - yield from super().modify_tensors(gate_proj_weight, name_gate, bid) - yield from super().modify_tensors(up_proj_weight, name_up, bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Ernie4_5_MoeForCausalLM") -class Ernie4_5MoeModel(Ernie4_5Model): - model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE - _experts: list[dict[str, Tensor]] | None = None - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._experts = [{} for _ in range(self.block_count)] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(self.hparams["moe_k"]) - self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"]) - self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"]) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None: - self.gguf_writer.add_expert_shared_count(shared_expert_count) - if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None: - self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2) - match = re.match(r"model.mtp_block.(\d+)", name) - if match: - return None - - # skip all other MTP tensors for now - match = re.match(r"model.mtp_emb_norm.(\d+)", name) - if match: - return None - - match = re.match(r"model.mtp_hidden_norm.(\d+)", name) - if match: - return None - - match = re.match(r"model.mtp_linear_proj.(\d+)", name) - if match: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["moe_num_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["gate_proj", "up_proj", "down_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename_to_retrieve]) - del self._experts[bid][ename_to_retrieve] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - yield from super().modify_tensors(data_torch, merged_name, bid) - else: - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("PaddleOCRVLForConditionalGeneration") -class PaddleOCRModel(Ernie4_5Model): - model_arch = gguf.MODEL_ARCH.PADDLEOCR - - -@ModelBase.register("PaddleOCRVisionModel") -class PaddleOCRVisionModel(MmprojModel): - # PaddleOCR-VL uses a modified version of Siglip - min_pixels: int = 0 - max_pixels: int = 0 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.min_pixels = self.preprocessor_config["min_pixels"] - self.max_pixels = self.preprocessor_config["max_pixels"] - self.hparams_vision["image_size"] = int(math.sqrt(self.max_pixels)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - hparams = self.hparams_vision - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR) - self.gguf_writer.add_vision_max_pixels(self.max_pixels) - self.gguf_writer.add_vision_min_pixels(self.min_pixels) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-6)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "vision_model" not in name and "mlp_AR" not in name: - return None - name = name.replace("visual.", "model.") - if "packing_position_embedding" in name: - # unused - return None - if "vision_model.head" in name: - # we don't yet support image embeddings for this model - return None - - return super().filter_tensors((name, gen)) - - -@ModelBase.register( - "Qwen2VLModel", - "Qwen2VLForConditionalGeneration", - "Qwen2_5_VLForConditionalGeneration", - "Qwen2_5OmniModel", -) -class Qwen2VLModel(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN2VL - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("thinker."): - name = name.replace("thinker.", "") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") -class Qwen2VLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) - # rename config.json values - self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") - self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") - if "embed_dim" in self.hparams_vision: # qwen2vl - self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size") - self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - hparams = self.hparams_vision - model_type = self.global_config['model_type'] - if model_type == 'qwen2_vl': - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL) - elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni': - if model_type == 'qwen2_5_omni': - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) - else: - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) - self.gguf_writer.add_vision_use_silu(True) - # find n_wa_pattern (window attention pattern) - fullatt_block_indexes = hparams.get("fullatt_block_indexes") - assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl" - n_wa_pattern = fullatt_block_indexes[0] + 1 - # validate n_wa_pattern - for i in range(1, len(fullatt_block_indexes)): - if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: - raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}") - self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) - else: - raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") - # default values below are taken from HF tranformers code - self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6)) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("visual."): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # split QKV tensors if needed - if ".qkv." in name: - if data_torch.ndim == 2: # weight - c3, _ = data_torch.shape - else: # bias - c3 = data_torch.shape[0] - assert c3 % 3 == 0 - c = c3 // 3 - wq = data_torch[:c] - wk = data_torch[c: c * 2] - wv = data_torch[c * 2:] - yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid) - yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid) - yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid) - elif 'patch_embed.proj.weight' in name: - # split Conv3D into Conv2Ds - c1, c2, kt, kh, kw = data_torch.shape - del c1, c2, kh, kw # unused - assert kt == 2, "Current implementation only support temporal_patch_size of 2" - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -class Qwen25AudioModel(MmprojModel): - has_audio_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_audio is not None - self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] - self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"] - self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_audio is not None - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5)) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # SinusoidsPositionEmbedding - assert self.hparams_audio is not None - max_timescale = 10000 - length = 1500 - channels = self.hparams_audio["hidden_size"] - log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) - inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float()) - scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] - pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32) - yield ("audio_tower.embed_positions.weight", pos_embd) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F16 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "conv1.bias" in name or "conv2.bias" in name: - # transpose conv1 and conv2 bias - data_torch = data_torch.unsqueeze(-1) - - yield from MmprojModel.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register("Qwen2_5OmniModel") -class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel): - has_audio_encoder = True - has_vision_encoder = True - - def get_vision_config(self) -> dict[str, Any] | None: - return self.global_config["thinker_config"].get("vision_config") - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config["thinker_config"].get("audio_config") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("visual.") and not name.startswith("audio_tower."): - return None - - if name.startswith("thinker."): - name = name.replace("thinker.", "") - - if "audio_bos_eos_token" in name: - # this tensor is left unused in transformers code - # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 - return None - - return MmprojModel.filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "visual." in name: - yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid) - elif "audio_tower." in name: - yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) - return # skip other tensors - - -@ModelBase.register("InternVisionModel") -class InternVisionModel(MmprojModel): - - min_dynamic_tiles: int = 0 - max_dynamic_tiles: int = 0 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0) - self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0) - - def set_gguf_parameters(self): - assert self.hparams_vision is not None - if isinstance(self.hparams_vision['image_size'], list): - self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0] - if isinstance(self.hparams_vision['patch_size'], list): - self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0] - super().set_gguf_parameters() - - hparams = self.hparams - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) - # hidden_act - if hparams["hidden_act"] == "silu": - self.gguf_writer.add_vision_use_silu(True) - elif hparams["hidden_act"] == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - else: - raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}") - # downsample_ratio - downsample_ratio = self.global_config.get("downsample_ratio") - assert downsample_ratio is not None - self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) - # older models may not have min/max_dynamic_patch in config - if self.min_dynamic_tiles > 0: - self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles) - if self.max_dynamic_tiles > 0: - self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector'] - if not any([name.startswith(prefix) for prefix in vision_prefix]): - return None - # deal with intern-s1 special case - names_map = { - "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias", - "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight", - "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias", - "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight", - "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias", - "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight", - } - if name in names_map: - name = names_map[name] - # correct name - if name.startswith("vision_model"): - name = "vision_tower." + name - if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # split QKV tensors if needed - if ".qkv." in name: - if data_torch.ndim == 2: # weight - c3, _ = data_torch.shape - else: # bias - c3 = data_torch.shape[0] - assert c3 % 3 == 0 - c = c3 // 3 - wq = data_torch[:c] - wk = data_torch[c: c * 2] - wv = data_torch[c * 2:] - yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid) - yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid) - yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register( - "NemotronH_Nano_VL_V2", - "RADIOModel", -) -class NemotronNanoV2VLModel(MmprojModel): - # ViT-Huge architecture parameters for RADIO v2.5-h - _vit_hidden_size = 1280 - _vit_intermediate_size = 5120 - _vit_num_layers = 32 - _vit_num_heads = 16 - - def get_vision_config(self) -> dict[str, Any] | None: - # RADIO config doesn't have standard ViT parameters, so they need to be constructed manually - vision_config = self.global_config.get("vision_config") - if vision_config is None: - return None - # Add ViT-H parameters - vision_config = { - **vision_config, - "hidden_size": self._vit_hidden_size, - "intermediate_size": self._vit_intermediate_size, - "num_hidden_layers": self._vit_num_layers, - "num_attention_heads": self._vit_num_heads, - "image_size": self.global_config.get("force_image_size", 512), - } - return vision_config - - def set_gguf_parameters(self): - if "image_mean" not in self.preprocessor_config: - self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406] - if "image_std" not in self.preprocessor_config: - self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225] - - super().set_gguf_parameters() - hparams = self.global_config - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL) - self.gguf_writer.add_vision_attention_layernorm_eps(1e-6) - self.gguf_writer.add_vision_use_gelu(True) - downsample_ratio = hparams.get("downsample_ratio", 0.5) - self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name or "pos_embed" in new_name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "input_conditioner" in name: - return None - - # mtmd does not support video yet so skip tensors related to video. - if "radio_model.model.patch_generator.video_embedder" in name: - return None - - if not name.startswith("vision_model.radio_model.model.") and not name.startswith("mlp1."): - return None - - if "patch_generator.pos_embed" in name: - if not name.endswith(".weight"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it - if "patch_generator.pos_embed" in name: - # Downsample position embeddings for fixed 512x512 image size - import torch.nn.functional as F - n_embd = self.hparams["hidden_size"] - image_size = self.global_config.get("force_image_size", 512) - patch_size = self.hparams["patch_size"] - target_patches_per_side = image_size // patch_size # 32 - max_patches_per_side = int((data_torch.shape[1]) ** 0.5) # 128 - if target_patches_per_side != max_patches_per_side: - # Reshape to grid, interpolate, flatten back - data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd) - data_torch = data_torch.permute(0, 3, 1, 2).float() # [1, n_embd, 128, 128] - data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side), - mode='bilinear', align_corners=True) - data_torch = data_torch.permute(0, 2, 3, 1) # [1, 32, 32, n_embd] - data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd) - - # Reshape linear patch embedding to conv2d format for ggml_conv_2d - # From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size] - if "patch_generator.embedder" in name: - patch_size = self.hparams["patch_size"] - n_embd = self.hparams["hidden_size"] - data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("WavTokenizerDec") -class WavTokenizerDecModel(TextModel): - model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if \ - name.endswith("codebook.cluster_size") or \ - name.endswith("codebook.embed_avg") or \ - name.endswith("codebook.inited"): - logger.debug(f"Skipping {name!r}") - return None - - return super().filter_tensors(item) - - def set_vocab(self): - self._set_vocab_none() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) - self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) - self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) - self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) - - self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) - self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) - - self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) - self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) - - self.gguf_writer.add_causal_attention(False) - - -@ModelBase.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.QWEN2MOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: - self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) - logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # handle aggregated expert tensors - # GGUF stores dimensions reversed from PyTorch, so: - # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} - # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) - # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down - if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): - mapped = f"{name}.weight" if not name.endswith(".weight") else name - # HF: [n_expert, n_embd, n_ff] -> GGML: {n_ff, n_embd, n_expert} - yield from super().modify_tensors(data_torch, mapped, bid) - return - - if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - if data_torch.ndim < 3 or data_torch.shape[-2] % 2 != 0: - raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") - # HF: [n_expert, 2*n_ff, n_embd] -> split on dim=-2 - n_ff = data_torch.shape[-2] // 2 - gate = data_torch[..., :n_ff, :].contiguous() - up = data_torch[..., n_ff:, :].contiguous() - # gate/up: [n_expert, n_ff, n_embd] -> GGML: {n_embd, n_ff, n_expert} - base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") - mapped_gate = f"{base_name}.gate_proj.weight" - mapped_up = f"{base_name}.up_proj.weight" - yield from super().modify_tensors(gate, mapped_gate, bid) - yield from super().modify_tensors(up, mapped_up, bid) - return - - if name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model") -class Qwen3Model(Qwen2Model): - model_arch = gguf.MODEL_ARCH.QWEN3 - - # extra logic for rerank models - is_rerank: bool = False - is_tied_embeddings: bool = False - token_false_id: int | None = None - token_true_id: int | None = None - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # track for intern-s1-mini - hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - if self._is_qwen3_reranker(): - self._find_rerank_config() - - def _is_qwen3_reranker(self) -> bool: - readme_path = self.dir_model / "README.md" - readme_text = "" - if readme_path.exists(): - with readme_path.open("r", encoding="utf-8") as f: - readme_text = f.read() - - name_hints = [ - str(self.dir_model.name), - str(self.hparams.get("_name_or_path", "")), - str(self.hparams.get("model_type", "")), - str(self.origin_hf_arch or ""), - ] - name_hints = [hint.lower() for hint in name_hints if hint] - - if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower(): - return True - - if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints): - return True - - return "sequenceclassification" in (self.origin_hf_arch or "").lower() - - def set_vocab(self): - # deal with intern-s1-mini - if self.origin_hf_arch == 'InternS1ForConditionalGeneration': - self._set_vocab_interns1() - return - - super().set_vocab() - - def _find_rerank_config(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - - self.is_rerank = True - self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False) - self.token_false_id = tokenizer.convert_tokens_to_ids("no") # ty: ignore[unresolved-attribute, invalid-assignment] - self.token_true_id = tokenizer.convert_tokens_to_ids("yes") # ty: ignore[unresolved-attribute, invalid-assignment] - self.sep_token_id = tokenizer.convert_tokens_to_ids("|") # ty: ignore[unresolved-attribute] - - assert self.token_false_id is not None and self.token_true_id is not None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if self.is_rerank: - self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK) - self.gguf_writer.add_classifier_output_labels(["yes", "no"]) - self.gguf_writer.add_chat_template([{ - "name": "rerank", - "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" - "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n" - "<|im_start|>assistant\n<think>\n\n</think>\n\n" - }]) - - def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: - # extract "yes" and "no" tokens from the output lm_head tensor - false_row = data_torch[self.token_false_id] - true_row = data_torch[self.token_true_id] - return torch.stack([true_row, false_row], dim=0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.is_rerank: - is_tied_head = self.is_tied_embeddings and "embed_tokens" in name - is_real_head = not self.is_tied_embeddings and "lm_head" in name - if is_tied_head or is_real_head: - cls_out_head = ( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", - self._get_cls_out_tensor(data_torch), - ) - yield cls_out_head - if is_tied_head: - yield from super().modify_tensors(data_torch, name, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3ASRForConditionalGeneration") -class Qwen3ASRTextModel(Qwen3Model): - model_arch = gguf.MODEL_ARCH.QWEN3 - - def __init__(self, dir_model: Path, *args, **kwargs): - hparams = kwargs.pop("hparams", None) - if hparams is None: - hparams = ModelBase.load_hparams(dir_model, self.is_mistral_format) - - thinker_config = hparams.get("thinker_config", {}) - text_config = dict(thinker_config.get("text_config", {})) - if text_config and text_config.get("architectures") is None and text_config.get("model_type") == "qwen3": - text_config["architectures"] = ["Qwen3ForCausalLM"] - if text_config: - hparams = {**hparams, "text_config": text_config} - - super().__init__(dir_model, *args, hparams=hparams, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_num_deepstack_layers(0) - - def set_vocab(self): - super().set_vocab() - # fix chat template, use correct chatml format - self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}") - # correct BOS/EOS tokens - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - added_tokens = tokenizer_config.get("added_tokens_decoder", {}) - for token_id, data in added_tokens.items(): - if data.get("content") == "<|im_end|>": - self.gguf_writer.add_bos_token_id(int(token_id)) - self.gguf_writer.add_eos_token_id(int(token_id)) - break - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("thinker.audio_tower."): - return - - if name.startswith("thinker.model.") or name.startswith("thinker.lm_head."): - name = name.removeprefix("thinker.") - elif name.startswith("thinker."): - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3MoeForCausalLM") -class Qwen3MoeModel(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.QWEN3MOE - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - hparams = ModelBase.load_hparams(self.dir_model, False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - def set_vocab(self): - # deal with intern-s1 - if self.origin_hf_arch == 'InternS1ForConditionalGeneration': - self._set_vocab_interns1() - return - - super().set_vocab() - - -@ModelBase.register("Qwen3NextForCausalLM") -class Qwen3NextModel(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.QWEN3NEXT - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"]) - self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"]) - self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"]) - self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"]) - self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"]) - self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4)) - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25))) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("mtp"): - # ignore MTP layers for now - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(".A_log"): - data_torch = -torch.exp(data_torch) - elif name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - elif "conv1d" in name: - data_torch = data_torch.squeeze() - elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"): - data_torch = data_torch + 1 - - if "in_proj_qkvz.weight" in name: - # original order: [q, k, v, z] * head_count - # corrected order: [q * head_count, k * head_count, v * head_count, z * head_count] - head_k_dim = self.hparams["linear_key_head_dim"] - head_v_dim = self.hparams["linear_value_head_dim"] - num_v_heads = self.hparams["linear_num_value_heads"] - num_k_heads = self.hparams["linear_num_key_heads"] - hidden_size = self.hparams["hidden_size"] - split_arg_list_qkvz = [ - head_k_dim, # q partition - head_k_dim, # k partition - (num_v_heads // num_k_heads * head_v_dim), # v partition - (num_v_heads // num_k_heads * head_v_dim), # z partition - ] - # view as (n_embd, head_count, [q+k+v+z]) - data_torch = data_torch.permute(1, 0).contiguous() - data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz)) - # split into q, k, v, z - q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1) - # flatten dim + head_count - q = q.contiguous().view(hidden_size, -1) - k = k.contiguous().view(hidden_size, -1) - v = v.contiguous().view(hidden_size, -1) - z = z.contiguous().view(hidden_size, -1) - # stack back - qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous() - z = z.permute(1, 0).contiguous() - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("RND1") -class RND1Model(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.RND1 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # RND1 specific parameters - # RND1 uses bidirectional attention - self.gguf_writer.add_causal_attention(False) - - if (mask_token_id := self.hparams.get("mask_token_id")) is not None: - self.gguf_writer.add_mask_token_id(mask_token_id) - - -@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration", "Qwen3_5ForConditionalGeneration", "Qwen3_5MoeForConditionalGeneration") -class Qwen3VLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams_vision is None: - logger.info("No vision config found, skipping vision tensor processing") - return - - # Compute image_size if not present - if "image_size" not in self.hparams_vision: - # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings - num_pos = self.hparams_vision.get("num_position_embeddings", 2304) - patch_size = self.hparams_vision.get("patch_size", 16) - # num_position_embeddings = (image_size / patch_size) ** 2 - # So image_size = sqrt(num_position_embeddings) * patch_size - image_size = int(num_pos**0.5 * patch_size) - self.hparams_vision["image_size"] = image_size - - # Rename config values for compatibility - self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") - self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") - - self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0) - for idx in self.hparams_vision.get("deepstack_visual_indexes", []): - self.is_deepstack_layers[idx] = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - # in case mixed modalities, the arch will be handled by subclass - if not self.has_audio_encoder: - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL) - self.gguf_writer.add_vision_use_gelu(True) - - if self.hparams_vision is not None: - merge_size = self.hparams_vision.get("spatial_merge_size") - if merge_size is not None: - self.gguf_writer.add_vision_spatial_merge_size(int(merge_size)) - - # Use text config's rms_norm_eps for vision attention layernorm eps - rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6) - self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) - - if self.is_deepstack_layers: - self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip text model tensors - if name.startswith("lm_head."): - return None - - # Skip MTP tensors - if name.startswith("mtp."): - return None - - if name.startswith("model.visual."): - name = name.replace("model.visual.", "visual.", 1) - - if not name.startswith("visual."): - return None - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - assert self.hparams_vision is not None - - if name.startswith("visual.deepstack_merger_list."): - prefix, rest = name.split(".", maxsplit=3)[2:] - # prefix is the layer index, convert to absolute clip layer index! - idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)] - target = rest - - tensor_type: gguf.MODEL_TENSOR - if target.startswith("norm."): - tensor_type = gguf.MODEL_TENSOR.V_DS_NORM - suffix = target.split(".", 1)[1] - elif target.startswith("linear_fc1."): - tensor_type = gguf.MODEL_TENSOR.V_DS_FC1 - suffix = target.split(".", 1)[1] - elif target.startswith("linear_fc2."): - tensor_type = gguf.MODEL_TENSOR.V_DS_FC2 - suffix = target.split(".", 1)[1] - else: - raise ValueError(f"Unexpected deepstack tensor: {name}") - - new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}") - yield from super().modify_tensors(data_torch, new_name, bid) - return - - if name.startswith("visual.merger."): - suffix = name.split(".", 2)[2] - if suffix.startswith("linear_fc"): - fc_idx_str, tail = suffix.split(".", 1) - fc_num = int(fc_idx_str.replace("linear_fc", "")) - # Qwen3VL has linear_fc1 and linear_fc2 - # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2) - if fc_num == 1: - fc_idx = 0 - elif fc_num == 2: - fc_idx = 2 - else: - raise ValueError(f"unexpected fc index {fc_num} in {name}") - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}") - elif suffix.startswith("norm."): - new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}") - else: - raise ValueError(f"Unexpected merger tensor: {name}") - yield (new_name, data_torch) - return - - if name == "visual.patch_embed.proj.weight": - # split Conv3D into Conv2Ds along temporal dimension - c1, c2, kt, _, _ = data_torch.shape - del c1, c2 - if kt != 2: - raise ValueError("Current implementation only supports temporal_patch_size of 2") - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]) - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) - return - - if name == "visual.patch_embed.proj.bias": - # Include the bias - it's used by the C++ code - yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch) - return - - yield from MmprojModel.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") -class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel): - has_audio_encoder = True - has_vision_encoder = True - - def get_vision_config(self) -> dict[str, Any] | None: - if self.has_vision_encoder: - return self.global_config["thinker_config"].get("vision_config") - else: - return None - - def get_audio_config(self) -> dict[str, Any] | None: - if self.has_audio_encoder: - return self.global_config["thinker_config"].get("audio_config") - else: - return None - - def set_gguf_parameters(self): - if self.has_vision_encoder: - Qwen3VLVisionModel.set_gguf_parameters(self) - self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL) - if self.has_audio_encoder: - Qwen25AudioModel.set_gguf_parameters(self) - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip text model tensors - if name.startswith("lm_head."): - return None - - # Skip MTP tensors - if name.startswith("mtp."): - return None - - if name.startswith("model.visual."): - name = name.replace("model.visual.", "visual.", 1) - - if "visual." not in name and "audio_tower." not in name: - return None - - return MmprojModel.filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "visual." in name: - if not self.has_vision_encoder: - raise ValueError(f"Model does not have vision encoder, but found tensor {name}") - # need to transform vision tensor naming, so that modify_tensors() logic can be used correctly - name = name.replace("thinker.visual.", "model.visual.") - if ".merger_list." in name: - name = name.replace(".merger_list.", ".deepstack_merger_list.") - name = name.replace(".ln_q", ".norm") - name = name.replace(".mlp.0", ".linear_fc1") - name = name.replace(".mlp.2", ".linear_fc2") - elif ".merger." in name: - name = name.replace(".ln_q", ".norm") - name = name.replace(".mlp.0", ".linear_fc1") - name = name.replace(".mlp.2", ".linear_fc2") - yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid) - elif "audio_tower." in name: - if not self.has_audio_encoder: - raise ValueError(f"Model does not have audio encoder, but found tensor {name}") - if "conv2d" in name and name.endswith(".bias"): - # transform conv2d bias [n_embd] --> [1, 1, n_embd] - data_torch = data_torch.unsqueeze(-1).unsqueeze(-1) - yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid) - - -@ModelBase.register("Qwen3ASRForConditionalGeneration") -class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel): - has_audio_encoder = True - has_vision_encoder = False - - -@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration") -class Glm4VVisionModel(Qwen3VLVisionModel): - def set_gguf_parameters(self): - MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters - assert self.hparams_vision is not None - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) - - hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() - if hidden_act == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - elif hidden_act == "silu": - self.gguf_writer.add_vision_use_silu(True) - - rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5) - self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("visual.merger."): - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - return - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("StepVLForConditionalGeneration") -class Step3VLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - - if not self.hparams_vision.get("intermediate_size"): - hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0 - assert hidden_size > 0 - mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536)) - self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) - - self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN)) - self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - projector_stride = int(self.global_config.get("understand_projector_stride", -1)) - hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1))) - num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1))) - assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), ( - "current Step3-VL conversion path is only validated for Step3-VL-10B" - ) - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL) - self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5))) - self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2) - # 3024 max resize comes from step3-vl-10b processing_step3.py. - self.gguf_writer.add_vision_preproc_image_size(3024) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".position_embd." in new_name: - return gguf.GGMLQuantizationType.F32 - if ("mm.0." in new_name or "mm.1." in new_name) and new_name.endswith(".weight"): - return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("model.", "lm_head.")): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("vision_model.vit_downsampler"): - match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name) - if match is None: - raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}") - - proj_id = int(match.group(1)) - 1 - suffix = f".{match.group(2)}" - yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch) - return - - if name == "vit_large_projector.weight": - yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch) - return - - if name.startswith("vision_model."): - if name == "vision_model.positional_embedding": - name += ".weight" - elif name.endswith(".gamma") and ".ls_" in name: - name = name.removesuffix(".gamma") + ".weight" - - name = name.replace("attn.in_proj_weight", "attn.in_proj.weight") - name = name.replace("attn.in_proj_bias", "attn.in_proj.bias") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3VLForConditionalGeneration") -class Qwen3VLTextModel(Qwen3Model): - model_arch = gguf.MODEL_ARCH.QWEN3VL - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if "thinker_config" in self.hparams: - vision_config = self.hparams["thinker_config"].get("vision_config", {}) - else: - vision_config = self.hparams.get("vision_config", {}) - deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) - self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("thinker.", "") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("StepVLForConditionalGeneration") -class Step3VLTextModel(Qwen3Model): - model_arch = gguf.MODEL_ARCH.QWEN3 - - -@ModelBase.register("Qwen3VLMoeForConditionalGeneration") -class Qwen3VLMoeTextModel(Qwen3MoeModel): - model_arch = gguf.MODEL_ARCH.QWEN3VLMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - vision_config = self.hparams.get("vision_config", {}) - deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) - self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("thinker.", "") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Qwen3VL has transposed packed tensors, so we treat it differently from general Qwen2MoE packed tensors - if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): - mapped = f"{name}.weight" if not name.endswith(".weight") else name - permuted = data_torch.permute(0, 2, 1).contiguous() - yield from ModelBase.modify_tensors(self, permuted, mapped, bid) - return - - if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: - raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") - split_dim = data_torch.shape[-1] // 2 - gate = data_torch[..., :split_dim].contiguous() - up = data_torch[..., split_dim:].contiguous() - # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) - # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} - # Need PyTorch: (128, 768, 2048) [reversed of GGML] - # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) - base_name = name.removesuffix(".weight") - base = base_name.rsplit('.', 1)[0] - mapped_gate = f"{base}.gate_proj.weight" - mapped_up = f"{base}.up_proj.weight" - perm_gate = gate.permute(0, 2, 1).contiguous() - perm_up = up.permute(0, 2, 1).contiguous() - yield from ModelBase.modify_tensors(self, perm_gate, mapped_gate, bid) - yield from ModelBase.modify_tensors(self, perm_up, mapped_up, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3OmniMoeForConditionalGeneration") -class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel): - model_arch = gguf.MODEL_ARCH.QWEN3VLMOE - - def set_vocab(self): - super().set_vocab() - # correct BOS/EOS tokens - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - added_tokens = tokenizer_config.get("added_tokens_decoder", {}) - for token_id, data in added_tokens.items(): - if data.get("content") == "<|im_end|>": - self.gguf_writer.add_bos_token_id(int(token_id)) - self.gguf_writer.add_eos_token_id(int(token_id)) - break - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_num_deepstack_layers(0) - - -class _LinearAttentionVReorderBase(Qwen3NextModel): - model_arch = gguf.MODEL_ARCH.QWEN3NEXT # overridden by subclasses - """reorders V heads from grouped to tiled order for ggml broadcast - - see https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306 - - Linear attention may has num_k_heads < num_v_heads. The HF weights store - V heads grouped by K head: [G0_v0..v{r-1}, G1_v0..v{r-1}, ...]. - ggml binary ops use tiled broadcast: [K0, K1, ..., K0, K1, ...]. - We reorder V heads to tiled order so ggml_repeat can replace the expensive - interleaved repeat: [G0_v0, G1_v0, ..., G0_v1, G1_v1, ...]. - """ - - @staticmethod - def _reorder_v_heads(tensor: Tensor, dim: int, num_k_heads: int, num_v_per_k: int, head_dim: int) -> Tensor: - """Reorder V heads from grouped (by K head) to tiled order along the given dimension.""" - shape = list(tensor.shape) - if dim < 0: - dim += len(shape) - new_shape = shape[:dim] + [num_k_heads, num_v_per_k, head_dim] + shape[dim + 1:] - tensor = tensor.reshape(*new_shape) - perm = list(range(len(new_shape))) - perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim] - return tensor.permute(*perm).contiguous().reshape(*shape) - - def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]: - if not name.endswith(( - ".linear_attn.in_proj_qkv.weight", - ".linear_attn.in_proj_z.weight", - ".linear_attn.in_proj_a.weight", - ".linear_attn.in_proj_b.weight", - ".linear_attn.out_proj.weight", - )): - return weight, scale - - num_k_heads = self.hparams["linear_num_key_heads"] - num_v_heads = self.hparams["linear_num_value_heads"] - head_k_dim = self.hparams["linear_key_head_dim"] - head_v_dim = self.hparams["linear_value_head_dim"] - num_v_per_k = num_v_heads // num_k_heads - - def unpack_nibbles(qs: Tensor) -> Tensor: - lo = torch.bitwise_and(qs, 0x0F) - hi = torch.bitwise_right_shift(qs, 4) - return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2) - - def pack_nibbles(codes: Tensor) -> Tensor: - codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2) - lo = torch.bitwise_and(codes[..., 0], 0x0F) - hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4) - return torch.bitwise_or(lo, hi).contiguous() - - def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]: - assert qs.ndim >= 2 - assert scales.ndim >= 2 - - k = qs.shape[-1] * 2 - assert col_perm.numel() == k - assert k % 16 == 0 - - group_cols = col_perm.reshape(-1, 16) - group_starts = group_cols[:, 0] - expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype) - assert torch.equal(group_cols, expected) - assert torch.all(group_starts % 16 == 0) - - group_perm = (group_starts // 16).to(dtype=torch.long) - expected_groups = torch.arange(scales.shape[-1], dtype=torch.long) - assert group_perm.numel() == scales.shape[-1] - assert torch.equal(torch.sort(group_perm).values, expected_groups) - - codes = unpack_nibbles(qs) - codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long)) - qs = pack_nibbles(codes) - scales = scales.index_select(-1, group_perm.to(device=scales.device)) - return qs, scales - - def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]: - row_perm = self._reorder_v_heads( - torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1), - 0, num_k_heads, num_v_per_k, head_dim, - ).squeeze(-1) - return ( - qs.index_select(0, row_perm.to(device=qs.device)), - scales.index_select(0, row_perm.to(device=scales.device)), - ) - - if name.endswith(".linear_attn.in_proj_qkv.weight"): - q_dim = head_k_dim * num_k_heads - k_dim = head_k_dim * num_k_heads - q = weight[:q_dim] - k = weight[q_dim:q_dim + k_dim] - v = weight[q_dim + k_dim:] - q_scale = scale[:q_dim] - k_scale = scale[q_dim:q_dim + k_dim] - v_scale = scale[q_dim + k_dim:] - v, v_scale = reorder_rows(v, v_scale, head_v_dim) - return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0) - - if name.endswith(".linear_attn.in_proj_z.weight"): - weight, scale = reorder_rows(weight, scale, head_v_dim) - elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")): - weight, scale = reorder_rows(weight, scale, 1) - elif name.endswith(".linear_attn.out_proj.weight"): - col_perm = self._reorder_v_heads( - torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0), - 1, num_k_heads, num_v_per_k, head_v_dim, - ).squeeze(0) - weight, scale = apply_col_perm(weight, scale, col_perm) - - return weight, scale - - def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor): - weight, scale = self._transform_nvfp4_weight(name, weight, scale) - super()._repack_nvfp4(name, weight, scale, scale2, input_scale) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_k_heads = self.hparams.get("linear_num_key_heads", 0) - num_v_heads = self.hparams.get("linear_num_value_heads", 0) - - if num_k_heads > 0 and num_v_heads > 0 and num_k_heads != num_v_heads and "linear_attn." in name: - head_k_dim = self.hparams["linear_key_head_dim"] - head_v_dim = self.hparams["linear_value_head_dim"] - num_v_per_k = num_v_heads // num_k_heads - - if ".in_proj_qkv." in name: - # QKV weight: reorder only the V rows - q_dim = head_k_dim * num_k_heads - k_dim = head_k_dim * num_k_heads - q = data_torch[:q_dim] - k = data_torch[q_dim:q_dim + k_dim] - v = data_torch[q_dim + k_dim:] - v = self._reorder_v_heads(v, 0, num_k_heads, num_v_per_k, head_v_dim) - data_torch = torch.cat([q, k, v], dim=0) - - elif ".in_proj_z." in name: - # Z gate weight: reorder rows (num_v_heads * head_v_dim) - data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, head_v_dim) - - elif ".in_proj_b." in name or ".in_proj_a." in name: - # Beta/Alpha weight: reorder rows (num_v_heads, head_dim=1) - data_torch = self._reorder_v_heads(data_torch, 0, num_k_heads, num_v_per_k, 1) - - elif ".A_log" in name or ".dt_bias" in name or ".dt_proj" in name: - # A_log / dt_bias: 1D parameters with num_v_heads elements - if data_torch.ndim == 1: - data_torch = self._reorder_v_heads( - data_torch.unsqueeze(-1), 0, num_k_heads, num_v_per_k, 1 - ).squeeze(-1) - else: - data_torch = self._reorder_v_heads(data_torch, -1, num_k_heads, num_v_per_k, 1) - - elif ".conv1d" in name: - # Conv1d kernel: reorder only the V channel portion - data = data_torch.squeeze() - qk_channels = head_k_dim * num_k_heads * 2 - qk_part = data[:qk_channels] - v_part = data[qk_channels:] - v_part = self._reorder_v_heads(v_part, 0, num_k_heads, num_v_per_k, head_v_dim) - data_torch = torch.cat([qk_part, v_part], dim=0) - - elif ".out_proj." in name: - # Out projection weight: reorder columns (input dimension) - data_torch = self._reorder_v_heads(data_torch, 1, num_k_heads, num_v_per_k, head_v_dim) - - yield from super().modify_tensors(data_torch, name, bid) - - -class _Qwen35MRopeMixin: - # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers); - # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE - # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always - # written even when a particular checkpoint omits the field in `rope_parameters`. - _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0] - - gguf_writer: gguf.GGUFWriter - rope_parameters: dict - - def set_gguf_parameters(self): - super().set_gguf_parameters() # ty: ignore[unresolved-attribute] - if "mrope_section" not in self.rope_parameters: - self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION) - - -@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM") -class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase): - model_arch = gguf.MODEL_ARCH.QWEN35 - - -@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM") -class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase): - model_arch = gguf.MODEL_ARCH.QWEN35MOE - - -# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under -# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger -# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as -# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup. - -@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") -class MiniCPMV4_6TextModel(Qwen3_5TextModel): - model_arch = gguf.MODEL_ARCH.QWEN35 - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("model.merger."): - return None - # MTP tensors are not used at inference yet; align with Qwen3Next behaviour - if name.startswith("mtp"): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("MiniCPMV4_6ForConditionalGeneration") -class MiniCPMV4_6VisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams_vision is not None: - # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP - # positional embedding bucket grid (70 x 70), while the per-slice processing - # resolution is the preprocessor's `scale_resolution` (typically 448). - # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size` - # as the slice size and warmup resolution, so report `scale_resolution` there - # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules. - scale_resolution = self.preprocessor_config.get("scale_resolution") - if scale_resolution is not None: - self.hparams_vision["image_size"] = int(scale_resolution) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - # projector type string is consumed by clip_projector_type_from_string() in clip.cpp - # (mapped to PROJECTOR_TYPE_MINICPMV4_6). - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6) - - # ViT merger 2x2 + final merger 2x2 = 4x spatial merge per dimension; used for slice alignment - self.gguf_writer.add_vision_projector_scale_factor(4) - - # borrow wa_layer_indexes for vit_merger insertion point - insert_layer_id = int(self.global_config.get( - "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6))) - self.gguf_writer.add_vision_wa_layer_indexes([insert_layer_id]) - - # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx). - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps( - self.hparams_vision.get("layer_norm_eps", 1e-6)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # lm_head / MTP -> belong to the LM file - if name.startswith(("lm_head.", "mtp")): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("GPT2LMHeadModel") -class GPT2Model(TextModel): - model_arch = gguf.MODEL_ARCH.GPT2 - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["n_ctx"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # we don't need these - if name.endswith((".attn.bias", ".attn.masked_bias")): - yield from super().modify_tensors(data_torch, name, bid) - return - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - yield from super().modify_tensors(data_torch, new_name, bid) - - -@ModelBase.register("RuGPT3XLForCausalLM") -class RuGPT3XLModel(TextModel): - model_arch = gguf.MODEL_ARCH.GPT2 - - _qkv_parts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Fuse separate Q, K, V projections into a single QKV tensor - if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name: - suffix = "weight" if name.endswith(".weight") else "bias" - part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v") - key = f"{part}.{suffix}" - - assert bid is not None - if self._qkv_parts is None: - self._qkv_parts = [{} for _ in range(self.block_count)] - self._qkv_parts[bid][key] = data_torch - - q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}" - if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]): - q = self._qkv_parts[bid].pop(q_key) - k = self._qkv_parts[bid].pop(k_key) - v = self._qkv_parts[bid].pop(v_key) - data_torch = torch.cat([q, k, v], dim=0) - name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}") - logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}") - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._qkv_parts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()] - if len(parts) > 0: - raise ValueError(f"Unprocessed Q/K/V parts: {parts}") - - -@ModelBase.register("PhiForCausalLM") -class Phi2Model(TextModel): - model_arch = gguf.MODEL_ARCH.PHI2 - - def set_gguf_parameters(self): - rot_pct = self.find_hparam(["partial_rotary_factor"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - - self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) - - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(4 * n_embd) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_add_bos_token(False) - - -@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV") -class Phi3MiniModel(TextModel): - model_arch = gguf.MODEL_ARCH.PHI3 - - def set_vocab(self): - # Phi-4 model uses GPT2Tokenizer - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - tokenizer_class = tokenizer_config_json['tokenizer_class'] - if tokenizer_class == 'GPT2Tokenizer': - return self._set_vocab_gpt2() - - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - raise ValueError(f'Error: Missing {tokenizer_path}') - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"].encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) - rms_eps = self.find_hparam(["rms_norm_eps"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) - rope_dims = int(rot_pct * n_embd) // n_head - - self.gguf_writer.add_context_length(max_pos_embds) - self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(rms_eps) - self.gguf_writer.add_rope_dimension_count(rope_dims) - self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"]) - self.gguf_writer.add_file_type(self.ftype) - sliding_window = self.hparams.get("sliding_window") - # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models - if sliding_window is None: - sliding_window = 0 - self.gguf_writer.add_sliding_window(sliding_window) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) - rope_dims = int(rot_pct * n_embd) // n_head - - # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is None: - return - - scale = max_pos_embds / orig_max_pos_embds - - rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() - if len(rope_scaling_type) == 0: - raise KeyError('Missing the required key rope_scaling.type') - - if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': - attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 - elif rope_scaling_type == 'yarn': - attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 - else: - raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet') - - self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - - if long_factors is None or short_factors is None: - raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') - - if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.') - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) - - -@ModelBase.register("Phi4ForCausalLMV") -class Phi4VisionMmprojModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - - self.vision_total_layers = int(self.find_vparam(self.n_block_keys)) - if self.vision_total_layers < 2: - raise ValueError( - f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}" - ) - - # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and - # drop post-layernorm/head weights. This makes the GGUF runtime output match - # the feature map consumed by the patched siglip.cpp Phi-4 projector path. - self.vision_export_layers = self.vision_total_layers - 1 - self.vision_last_layer_idx = self.vision_total_layers - 1 - - for key in self.n_block_keys: - if key in self.hparams_vision: - self.hparams_vision[key] = self.vision_export_layers - break - - self.block_count = self.vision_export_layers - self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) - - patch_size = self.preprocessor_config.get("patch_size") - if patch_size is None: - raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json") - - self.hparams_vision["patch_size"] = patch_size - - pos_emb_name = next( - ( - name for name in self.model_tensors - if name.endswith("vision_model.embeddings.position_embedding.weight") - ), - None, - ) - if pos_emb_name is None: - raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight") - - pos_emb_shape = self.model_tensors[pos_emb_name]().shape - base_grid_tokens = int(pos_emb_shape[0]) - grid_side = math.isqrt(base_grid_tokens) - if grid_side * grid_side != base_grid_tokens: - raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}") - - self.hparams_vision["image_size"] = grid_side * patch_size - - min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches")) - max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches")) - if min_num_patches is None or max_num_patches is None: - raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches") - - self.min_pixels = int(min_num_patches) * patch_size * patch_size - self.max_pixels = int(max_num_patches) * patch_size * patch_size - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4) - self.gguf_writer.add_vision_min_pixels(self.min_pixels) - self.gguf_writer.add_vision_max_pixels(self.max_pixels) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("model.vision_tower.vision_tower.", "vision_tower.") - - if not name.startswith(("vision_tower.", "model.mm_projector.", "mm_projector.")): - return None - - if ".vision_model.head." in name: - return None - - if ".vision_model.post_layernorm." in name: - return None - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("vision_tower."): - if bid is not None and bid == self.vision_last_layer_idx: - return - - if name.endswith("vision_model.embeddings.patch_embedding.weight"): - assert self.hparams_vision is not None - if data_torch.ndim != 2: - raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}") - - patch_area = self.hparams_vision["patch_size"] ** 2 - in_features = data_torch.shape[1] - if in_features % patch_area != 0: - raise ValueError( - f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}" - ) - - num_channels = in_features // patch_area - patch_size = self.hparams_vision["patch_size"] - data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels) - data_torch = data_torch.permute(0, 3, 1, 2) - - yield from super().modify_tensors(data_torch, name, bid) - return - - if name.startswith(("model.mm_projector.", "mm_projector.")): - local_name = name - local_name = local_name.replace("model.mm_projector.", "") - local_name = local_name.replace("mm_projector.", "") - - if not (local_name.startswith("0.") or local_name.startswith("2.")): - return - - suffix = ".bias" if local_name.endswith(".bias") else ".weight" - mm_idx = int(local_name.split(".", maxsplit=1)[0]) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch) - return - - return - - -@ModelBase.register("PhiMoEForCausalLM") -class PhiMoeModel(Phi3MiniModel): - model_arch = gguf.MODEL_ARCH.PHIMOE - - _experts: list[dict[str, Tensor]] | None = None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) - self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("PlamoForCausalLM") -class PlamoModel(TextModel): - model_arch = gguf.MODEL_ARCH.PLAMO - - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(4096) # not in config.json - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong - self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - def shuffle_attn_q_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(8, 5, 128, 5120) - data_torch = torch.permute(data_torch, (1, 0, 2, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def shuffle_attn_output_weight(self, data_torch): - assert data_torch.size() == (5120, 5120) - data_torch = data_torch.reshape(5120, 8, 5, 128) - data_torch = torch.permute(data_torch, (0, 2, 1, 3)) - data_torch = torch.reshape(data_torch, (5120, 5120)) - return data_torch - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM") -class Plamo2Model(TextModel): - model_arch = gguf.MODEL_ARCH.PLAMO2 - - def set_vocab(self): - self._set_vocab_plamo() - - def set_gguf_parameters(self): - hparams = self.hparams - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - # Which layers are Mamba layers - # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer) - # This logic matches modeling_plamo.py's is_mamba function - mamba_step = hparams.get("mamba_step", 2) - mamba_enabled = hparams.get("mamba_enabled", True) - num_key_value_heads = [] - num_attention_heads = [] - - if mamba_enabled: - for i in range(self.block_count): - if self.block_count <= (mamba_step // 2): - # use attention in last layer - is_mamba = (i != self.block_count - 1) - else: - is_mamba = (i % mamba_step) != (mamba_step // 2) - if is_mamba: - num_key_value_heads.append(0) - num_attention_heads.append(0) - else: - num_key_value_heads.append(hparams.get("num_key_value_heads", 4)) - num_attention_heads.append(hparams.get("num_attention_heads", 32)) - - if num_key_value_heads and num_attention_heads: - self.gguf_writer.add_head_count_kv(num_key_value_heads) - self.gguf_writer.add_head_count(num_attention_heads) - - self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048)) - self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096)) - self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128)) - self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128)) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06)) - self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000)) - - # Mamba parameters - self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64)) - self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4)) - self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64)) - intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128) - self.gguf_writer.add_ssm_inner_size(intermediate_size) - self.gguf_writer.add_ssm_group_count(0) - - # MLP feed forward parameters (for attention layers) - self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312)) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(".A_log"): - data_torch = -torch.exp(data_torch) - elif name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - elif name.endswith(".dt_norm_weight"): - name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight" - elif name.endswith(".B_norm_weight"): - name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight" - elif name.endswith(".C_norm_weight"): - name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight" - elif name.endswith(".k_weight"): - name = name.rpartition(".k_weight")[0] + ".k.weight" - elif name.endswith(".q_weight"): - name = name.rpartition(".q_weight")[0] + ".q.weight" - elif name.endswith(".conv1d.weight"): - data_torch = torch.squeeze(data_torch) # remove (, 1, ) - assert data_torch.ndim == 2 - elif name.endswith(".pre_mixer_norm.weight"): - data_torch += 1.0 - elif name.endswith(".post_mixer_norm.weight"): - data_torch += 1.0 / 5 - elif name.endswith(".pre_mlp_norm.weight"): - data_torch += 1.0 - elif name.endswith(".post_mlp_norm.weight"): - data_torch += 1.0 / (5**1.5) - elif name.endswith(".norm.weight"): - data_torch += 1.0 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") -class Plamo3Model(TextModel): - model_arch = gguf.MODEL_ARCH.PLAMO3 - - def set_vocab(self): - self._set_vocab_plamo() - - tokenizer_config_path = self.dir_model / "tokenizer_config.json" - tokenizer_config = {} - - if tokenizer_config_path.is_file(): - with open(tokenizer_config_path, encoding="utf-8") as f: - tokenizer_config = json.load(f) - - chat_template = tokenizer_config.get("chat_template") - chat_template_jinja = self.dir_model / "chat_template.jinja" - - if chat_template_jinja.is_file(): - with open(chat_template_jinja, encoding="utf-8") as f: - chat_template = f.read() - - if chat_template: - self.gguf_writer.add_chat_template(chat_template) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None: - self.gguf_writer.add_sliding_window(sliding_window) - self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - if name.endswith(".pre_mixer_norm.weight"): - data_torch = data_torch + 1.0 - elif name.endswith(".post_mixer_norm.weight"): - data_torch = data_torch + 1.0 / 5 - elif name.endswith(".pre_mlp_norm.weight"): - data_torch = data_torch + 1.0 - elif name.endswith(".post_mlp_norm.weight"): - data_torch = data_torch + 1.0 / (5**1.5) - elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")): - data_torch = data_torch + 1.0 - elif name.endswith(".norm.weight"): - data_torch = data_torch + 1.0 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("CodeShellForCausalLM") -class CodeShellModel(TextModel): - model_arch = gguf.MODEL_ARCH.CODESHELL - - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(10000.0) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(1.0) - - -@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") -class KimiLinearModel(TextModel): - """Kimi-Linear model with hybrid MLA+KDA architecture""" - model_arch = gguf.MODEL_ARCH.KIMI_LINEAR - - _experts: list[dict[str, Tensor]] | None = None - - def set_vocab(self): - try: - self._set_vocab_gpt2() - return - except Exception: - pass - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - tokpre = self.get_vocab_base_pre(tokenizer) - - if tokpre == "kimi-k2": - # Build merges list using the approach similar to HunYuanMoE - merges = [] - vocab = {} - mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - # Build token list - vocab_size = self.hparams["vocab_size"] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - # override eos id in config.json with tiktoken eos id - self.gguf_writer.add_eos_token_id(tokenizer.eos_id) # ty: ignore[unresolved-attribute] - else: - raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - - def set_gguf_parameters(self): - # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group) - self.hparams["num_key_value_heads"] = 1 - - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - # KDA & MLA params - # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv - linear_attn_config = self.hparams["linear_attn_config"] - # n_head == 0 for KDA layers, n_head > 0 for MLA layers - # full_attention_layers list will be used to distinguish layer type - _num_kv_heads = list() - _full_attn_layers = linear_attn_config["full_attn_layers"] - for il in range(self.hparams["num_hidden_layers"]): - if il + 1 in _full_attn_layers: - _num_kv_heads.append(self.hparams["num_key_value_heads"]) - else: - _num_kv_heads.append(0) - assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] - self.gguf_writer.add_head_count_kv(_num_kv_heads) - - if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None: - self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) - if (kda_head_dim := linear_attn_config.get("head_dim")) is not None: - self.gguf_writer.add_kda_head_dim(kda_head_dim) - - # MLA params - use add_* methods that handle arch substitution - # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) - if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None: - self.gguf_writer.add_q_lora_rank(q_lora_rank) - # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA - kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False) - self.gguf_writer.add_kv_lora_rank(kv_lora_rank) - - # MLA head dimensions - # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim - qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") - # Rotation - use qk_rope_head_dim for Kimi - qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", "n_rot"], optional=False) - self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim) - self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim) - v_head_dim = self.hparams.get("v_head_dim") - - # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None: - self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) - elif qk_nope_head_dim is not None: - n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim - self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) - - # n_embd_head_v_mla = v_head_dim - if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None: - self.gguf_writer.add_value_length_mla(n_embd_head_v_mla) - elif v_head_dim is not None: - self.gguf_writer.add_value_length_mla(v_head_dim) - - # moe_intermediate_size (1024 for Kimi) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - # num_shared_experts (1 for Kimi) - self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"]) - # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) - self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) - # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") - - # Handle KDA conv1d weights - # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest - # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest - # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] - # Memory layouts match: both have conv_step (d_conv) changing fastest - if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")): - # HF shape: [d_inner, d_conv] e.g. [4096, 4] - # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] - if data_torch.ndim == 2: - d_inner, d_conv = data_torch.shape - # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest) - data_torch = data_torch.reshape(1, d_inner, 1, d_conv) - logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - elif data_torch.ndim == 3: - # Already 3D [d_inner, 1, d_conv] from unsqueeze - d_inner, _, d_conv = data_torch.shape - data_torch = data_torch.reshape(1, d_inner, 1, d_conv) - logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") - - # Handle A_log: iHF stores as [1, 1, num_heads, 1] - # llama.cpp expects ggml ne = [1, num_heads, 1, 1] - # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] - if name.endswith(".A_log"): - data_torch = -torch.exp(data_torch) - if name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - logger.info("Changed dt_bias to dt_proj.bias") - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - # w1: gate, w2: down, w3: up - for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), - ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), - ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: - datas: list[Tensor] = [] - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - data_torch = torch.stack(datas, dim=0) - new_name = self.format_tensor_name(tname, bid) - yield from super().modify_tensors(data_torch, new_name, bid) - return - - # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed - if name.endswith("kv_b_proj.weight"): - name_kb = name.replace("kv_b_proj", "k_b_proj") - name_vb = name.replace("kv_b_proj", "v_b_proj") - n_head_kv = self.hparams["num_key_value_heads"] - v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False) - qk_nope_head_dim = self.hparams["qk_nope_head_dim"] - logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) - k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) - k_b = k_b.transpose(1, 2) - yield from super().modify_tensors(k_b, name_kb, bid) - yield from super().modify_tensors(v_b, name_vb, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("InternLM2ForCausalLM") -class InternLM2Model(TextModel): - model_arch = gguf.MODEL_ARCH.INTERNLM2 - - def set_vocab(self): - # (TODO): Is there a better way? - # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character - # \x00 specially and convert it into an emoji character to prevent it from being mistakenly - # recognized as an empty string in C++. - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - if text == b"\x00": - # (TODO): fixme - # Hack here and replace the \x00 characters. - logger.warning(f"InternLM2 convert token '{text}' to '๐Ÿ‰'!") - text = "๐Ÿ‰".encode("utf-8") - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - # take care of ununsed raw token - if piece.startswith('[UNUSED'): - toktype = SentencePieceTokenTypes.UNUSED - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - chat_eos_token = '<|im_end|>' - chat_eos_token_id = None - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {}) - for token_id, foken_data in added_tokens_decoder.items(): - token_id = int(token_id) - token = foken_data["content"] - if token == chat_eos_token: - chat_eos_token_id = token_id - token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - tokenizer_file = self.dir_model / 'tokenizer.json' - if tokenizer_file.is_file(): - with open(tokenizer_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - added_tokens = tokenizer_json.get("added_tokens", []) - for foken_data in added_tokens: - token_id = int(foken_data["id"]) - token = foken_data["content"] - if token == chat_eos_token: - chat_eos_token_id = token_id - token = token.encode("utf-8") - if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - if tokens[token_id] != token: - logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') - tokens[token_id] = token - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - if foken_data.get("special"): - toktypes[token_id] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - old_eos = special_vocab.special_token_ids["eos"] - if chat_eos_token_id is not None: - # For the chat model, we replace the eos with '<|im_end|>'. - # TODO: this is a hack, should be fixed - # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 - special_vocab.special_token_ids["eos"] = chat_eos_token_id - logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" - " in chat mode so that the conversation can end normally.") - - special_vocab.add_to_gguf(self.gguf_writer) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - num_heads = self.hparams["num_attention_heads"] - num_kv_heads = self.hparams["num_key_value_heads"] - n_embd = self.hparams["hidden_size"] - q_per_kv = num_heads // num_kv_heads - head_dim = n_embd // num_heads - num_groups = num_heads // q_per_kv - - if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: - qkv = data_torch - - qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) - q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] - - # The model weights of q and k equire additional reshape. - q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) - k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) - v = v.reshape((-1, v.shape[-1])) - - yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) - yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) - yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("InternLM3ForCausalLM") -class InternLM3Model(TextModel): - model_arch = gguf.MODEL_ARCH.LLAMA - - def set_vocab(self): - tokens, scores, toktypes = self._create_vocab_sentencepiece() - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - if "added_tokens_decoder" in tokenizer_config_json: - for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items(): - if token_data.get("special"): - token_id = int(token_id) - token = token_data["content"] - special_vocab._set_special_token(token, token_id) - # update eos token - if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids: - special_vocab.special_token_ids["eos"] = token_id - - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("mlp", "vision_model")): - # skip visual tensors - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") -class BertModel(TextModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.vocab_size = None - - if cls_out_labels := self.hparams.get("id2label"): - if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0": - # Remove dummy labels added by AutoConfig - cls_out_labels = None - self.cls_out_labels = cls_out_labels - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_causal_attention(False) - self._try_set_pooling_type() - - if self.cls_out_labels: - self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())]) - - def set_vocab(self): - tokens, toktypes, tokpre = self.get_vocab_base() - self.vocab_size = len(tokens) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - # convert to phantom space vocab - def phantom(tok, toktype): - if toktype == gguf.TokenType.CONTROL: - return tok - if tok.startswith("##"): - return tok[2:] - return "\u2581" + tok - assert len(tokens) == len(toktypes) - tokens = list(map(phantom, tokens, toktypes)) - - # add vocab to gguf - self.gguf_writer.add_tokenizer_model("bert") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # handle special tokens - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("bert."): - name = name[5:] - - if name.endswith(".gamma"): - name = name[:-6] + ".weight" - - if name.endswith(".beta"): - name = name[:-5] + ".bias" - - # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return None - - if name.startswith("cls.predictions"): - return None - - if name.startswith("cls.seq_relationship"): - return None - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.cls_out_labels: - # For BertForSequenceClassification (direct projection layer) - if name == "classifier.weight": - name = "classifier.out_proj.weight" - - if name == "classifier.bias": - name = "classifier.out_proj.bias" - - yield from super().modify_tensors(data_torch, name, bid) - - def _xlmroberta_tokenizer_init(self) -> None: - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def _xlmroberta_set_vocab(self) -> None: - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' - - tokenizer_json = {} - tokenizer_config_json = {} - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'tokenizer.json' - tokenizer_config_path = self.dir_model / 'tokenizer_config.json' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - from base64 import b64decode - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - - with open(tokenizer_path, "r", encoding="utf-8") as fp: - tokenizer_json = json.load(fp) - - if tokenizer_config_path.is_file(): - with open(tokenizer_config_path, "r", encoding="utf-8") as fp: - tokenizer_config_json = json.load(fp) - - add_prefix = tokenizer.add_prefix_space # ty: ignore[unresolved-attribute] - remove_whitespaces = tokenizer.clean_up_tokenization_spaces # ty: ignore[unresolved-attribute] - precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) - - vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) # ty: ignore[unresolved-attribute] - else: - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - if isinstance(tokenizer, SentencePieceProcessor): - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - else: - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - unk_token = tokenizer_config_json.get("unk_token") - unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) # ty: ignore[no-matching-overload] - - for token_id in range(tokenizer.vocab_size): # ty: ignore[unresolved-attribute] - piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] - if (piece := tokenizer._convert_id_to_token(token_id)) is not None: # ty: ignore[unresolved-attribute] - text = piece.encode("utf-8") - score = tokenizer_json["model"]["vocab"][token_id][1] - - toktype = SentencePieceTokenTypes.NORMAL - if token_id == unk_token_id: - toktype = SentencePieceTokenTypes.UNKNOWN - elif token_id in tokenizer.all_special_ids: # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.CONTROL - elif token_id in added_vocab.values(): - toktype = SentencePieceTokenTypes.USER_DEFINED - # No reliable way to detect this, but jina doesn't have any - # elif tokenizer.IsByte(token_id): - # toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - if isinstance(tokenizer, SentencePieceProcessor): - # realign tokens (see HF tokenizer code) - tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1] - scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] - toktypes = [ - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.UNKNOWN, - ] + toktypes[3:-1] - - if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE: - # Add mask token missing from sentencepiece.bpe.model - tokens[250001] = b'<mask>' - scores[250001] = 0.0 - toktypes[250001] = SentencePieceTokenTypes.CONTROL - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - -@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification") -class DistilBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def set_gguf_parameters(self): - self.gguf_writer.add_layer_norm_eps(1e-12) - logger.info("gguf: layer norm epsilon = 1e-12") - super().set_gguf_parameters() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("distilbert."): - name = name[11:] - - # These layers act as MLM head, so we don't need them - if name.startswith("vocab_"): - return None - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("RobertaModel", "RobertaForSequenceClassification") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - else: - return super().set_vocab() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # if name starts with "roberta.", remove the prefix - # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): - hparams = kwargs.pop("hparams", None) - if hparams is None: - hparams = ModelBase.load_hparams(dir_model, False) - - self.is_moe = bool(hparams.get("moe_every_n_layers")) - self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT - - super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) - - self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() - if self._tokenizer_is_xlmroberta: - self._xlmroberta_tokenizer_init() - - npos, mtp = self.hparams["n_positions"], self.hparams.get("max_trained_positions", 2048) - if npos == 8192 and mtp == 2048: - self.hparams["n_positions"] = 2048 # nomic-embed-text v1 and v1.5 are trained for 2048 tokens. - elif npos == 2048 and mtp == 2048: - self.hparams["n_positions"] = 512 # nomic-embed-text-v2-moe is trained for 512 tokens. - else: - raise ValueError(f"unrecognized parameters: n_positions={npos}, max_trained_positions={mtp}") - - assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" - - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors unless MoE - assert self.hparams["qkv_proj_bias"] == self.is_moe - assert self.hparams["mlp_fc1_bias"] == self.is_moe - assert self.hparams["mlp_fc2_bias"] == self.is_moe - - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_vocab(self) -> None: - if self._tokenizer_is_xlmroberta: - return self._xlmroberta_set_vocab() - return super().set_vocab() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # If the tensor is an experts bias tensor, skip it. - if "mlp.experts.bias" in name: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - if "mlp.experts.mlp.w1" in name: - data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) - name += ".weight" - - if "mlp.experts.mlp.w2" in name: - data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"]) - data_torch = data_torch.transpose(1, 2) - name += ".weight" - - yield from super().modify_tensors(data_torch, name, bid) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if self.is_moe: - self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) - self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) - - def _is_tokenizer_xlmroberta(self) -> bool: - with open(self.dir_model / "tokenizer.json") as f: - tokenizer_json = json.load(f) - toktyp = tokenizer_json["model"]["type"] - if toktyp == "Unigram": - return True - if toktyp == "WordPiece": - return False - raise ValueError(f"unknown tokenizer: {toktyp}") - - -@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification") -class NeoBert(BertModel): - model_arch = gguf.MODEL_ARCH.NEO_BERT - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # NeoBERT uses 2/3 of the intermediate size as feed forward length - self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3)) - self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - - self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("decoder."): - return None - - if name.startswith("model."): - name = name[6:] - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model") -class EuroBertModel(TextModel): - model_arch = gguf.MODEL_ARCH.EUROBERT - - def set_vocab(self): - self.gguf_writer.add_add_bos_token(False) - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # EuroBert is bidirectional (encoder) - self.gguf_writer.add_causal_attention(False) - - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - self._try_set_pooling_type() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("model."): - name = name[6:] - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - _lora_files = {} - _lora_names = [] - - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): - hparams = kwargs.pop("hparams", None) - if hparams is None: - hparams = ModelBase.load_hparams(dir_model, False) - - if lora_names := hparams.get("lora_adaptations"): - self._lora_names = lora_names - self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 - - super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) - self._xlmroberta_tokenizer_init() - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if self._lora_names: - for name in self._lora_names: - fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-") - self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run) - - return super().generate_extra_tensors() - - def set_type(self): - for lora_writer in self._lora_files.values(): - lora_writer.add_type(gguf.GGUFType.ADAPTER) - lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") - super().set_type() - - def set_vocab(self): - self._xlmroberta_set_vocab() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # if name starts with "roberta.", remove the prefix - # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # jina-embeddings-v3 - if ".parametrizations." in name: - name = name.replace(".parametrizations.", ".") - if name.endswith(".original"): - name = name[:-9] - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): - if name.startswith("pooler.dense"): - return - - num_loras = data_torch.size(0) - assert num_loras == len(self._lora_names) - - # Split out each LoRA in their own GGUF - for i, lora_writer in enumerate(self._lora_files.values()): - new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower() - data = data_torch[i, :, :] - # Transpose/flip token_embd/types into correct shape - if new_name == "token_embd.weight.lora_b": - data = data.T - elif new_name.startswith("token_types.weight."): - new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") - lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32) - - return - - yield from super().modify_tensors(data_torch, name, bid) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # jina-embeddings-v3 - lora_alpha = self.hparams.get("lora_alpha") - if lora_prompt_prefixes := self.hparams.get("task_instructions"): - assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) - for lora_name, lora_writer in self._lora_files.items(): - lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0) - lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name) - if lora_prompt_prefixes: - lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name]) - - def write(self): - super().write() - for lora_writer in self._lora_files.values(): - lora_writer.write_header_to_file() - lora_writer.write_kv_data_to_file() - lora_writer.write_tensors_to_file(progress=True) - lora_writer.close() - - -@ModelBase.register("GemmaForCausalLM") -class GemmaModel(TextModel): - model_arch = gguf.MODEL_ARCH.GEMMA - - def set_vocab(self): - self._set_vocab_sentencepiece() - - # TODO: these special tokens should be exported only for the CodeGemma family - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) - special_vocab._set_special_token("prefix", 67) - special_vocab._set_special_token("suffix", 69) - special_vocab._set_special_token("middle", 68) - special_vocab._set_special_token("fsep", 70) - special_vocab._set_special_token("eot", 107) - special_vocab.chat_template = None # do not add it twice - special_vocab.add_to_gguf(self.gguf_writer) - - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. - if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma2ForCausalLM") -class Gemma2Model(TextModel): - model_arch = gguf.MODEL_ARCH.GEMMA2 - - def set_vocab(self): - self._set_vocab_sentencepiece() - - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - hparams = self.hparams - - self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_key_length(hparams["head_dim"]) - self.gguf_writer.add_value_length(hparams["head_dim"]) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_attn_logit_softcapping( - self.hparams["attn_logit_softcapping"] - ) - self.gguf_writer.add_final_logit_softcapping( - self.hparams["final_logit_softcapping"] - ) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model - # To prevent errors, skip loading lm_head.weight. - if name == "lm_head.weight": - logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") -class Gemma3Model(TextModel): - model_arch = gguf.MODEL_ARCH.GEMMA3 - - def norm_shift(self, name: str) -> float: - return 1.0 if name.endswith("norm.weight") else 0.0 # Gemma3RMSNorm adds 1.0 to the norm value - - def set_vocab(self): - if (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - self.gguf_writer.add_add_space_prefix(False) - else: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - # some default values are not specified in the hparams - self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) - self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) - self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) - self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) - self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers - # attn_logit_softcapping is removed in Gemma3 - assert hparams.get("attn_logit_softcapping") is None - if (final_logit_softcap := hparams.get("final_logit_softcapping")): - self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) - if hparams.get("sliding_window_pattern") != 1: - self.gguf_writer.add_sliding_window(hparams["sliding_window"]) - self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # remove OOV (out-of-vocabulary) rows in token_embd - if "embed_tokens.weight" in name: - n_vocab_real = -1 - if (self.dir_model / "tokenizer.model").is_file(): - tokens = self._create_vocab_sentencepiece()[0] - n_vocab_real = len(tokens) - else: - with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - n_vocab_real = len(tokenizer_json["model"]["vocab"]) + len(tokenizer_json["added_tokens"]) - data_torch = data_torch[:n_vocab_real] - - # ref code in Gemma3RMSNorm - # output = output * (1.0 + self.weight.float()) - # note: this is not the case on gemma3n - f_shift = self.norm_shift(name) - if f_shift != 0.0: - data_torch = data_torch + f_shift - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma3TextModel") -class EmbeddingGemma(Gemma3Model): - model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING - module_paths = [] - dense_features_dims = {} - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.sentence_transformers_dense_modules: - # read modules.json to determine if model has Dense layers - modules_file = self.dir_model / "modules.json" - if modules_file.is_file(): - with open(modules_file, encoding="utf-8") as modules_json_file: - mods = json.load(modules_json_file) - for mod in mods: - if mod["type"].endswith("Dense"): - mod_path = mod["path"] - # check if model.safetensors file for Dense layer exists - model_tensors_file = self.dir_model / mod_path / "model.safetensors" - if model_tensors_file.is_file(): - self.module_paths.append(mod_path) - # read config.json of the Dense layer to get in/out features - mod_conf_file = self.dir_model / mod_path / "config.json" - if mod_conf_file.is_file(): - with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file: - mod_conf = json.load(mod_conf_json_file) - # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights - prefix = self._get_dense_prefix(mod_path) - if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None: - self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"]) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - from safetensors.torch import load_file - module_paths = list(self.module_paths) - for i, module_path in enumerate(module_paths): - tensors_file = self.dir_model / module_path / "model.safetensors" - local_tensors = load_file(tensors_file) - tensor_name = self._get_dense_prefix(module_path) - for name, local_tensor in local_tensors.items(): - if not name.endswith(".weight"): - continue - orig_name = name.replace("linear", tensor_name) - name = self.map_tensor_name(orig_name) - yield name, local_tensor.clone() - - @staticmethod - def _get_dense_prefix(module_path) -> str: - """Get the tensor name prefix for the Dense layer from module path.""" - tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3" - return tensor_name - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # Override the sliding window size as it gets adjusted by the Gemma3TextConfig - # constructor. We want to use the value from the original model's config.json. - # ref: https://github.com/huggingface/transformers/pull/40700 - with open(self.dir_model / "config.json", "r", encoding="utf-8") as f: - config = json.load(f) - orig_sliding_window = config.get("sliding_window") - if orig_sliding_window is None: - raise ValueError("sliding_window not found in model config - this is required for the model") - - logger.info(f"Using original sliding_window from config: {orig_sliding_window} " - f"instead of {self.hparams['sliding_window']}") - self.gguf_writer.add_sliding_window(orig_sliding_window) - if self.sentence_transformers_dense_modules: - for dense, dims in self.dense_features_dims.items(): - logger.info(f"Setting dense layer {dense} in/out features to {dims}") - self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1]) - - self._try_set_pooling_type() - - -@ModelBase.register("Gemma3ForConditionalGeneration") -class Gemma3VisionModel(MmprojModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3) - # default values below are taken from HF transformers code - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_vision_use_gelu(True) - # calculate proj_scale_factor (used by tinygemma3 test model) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - n_per_side = int(image_seq_length ** 0.5) - image_size = self.hparams["image_size"] - patch_size = self.hparams["patch_size"] - proj_scale_factor = (image_size // patch_size) // n_per_side - if proj_scale_factor > 0 and proj_scale_factor != 4: - # we only need to write this if it's not the default value - # in this case, we are converting a test model - self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # related to https://github.com/ggml-org/llama.cpp/issues/13025 - if "input_projection" in name: - return gguf.GGMLQuantizationType.F16 - if ".embeddings." in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "vision_model.head." in name: - # skip redundant tensors for tinygemma3 - return None - - if not name.startswith(("multi_modal_projector.", "vision_tower.", "multimodal_projector.", "vision_model.")): - return None - - name = name.replace("_weight", ".weight") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector - # the other norm values are part of SigLIP model, and they are already correct - # ref code: Gemma3RMSNorm - if "soft_emb_norm.weight" in name: - logger.info(f"Correcting norm value for '{name}'") - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -class ConformerAudioModel(MmprojModel): - _batch_norm_tensors: list[dict[str, Tensor]] | None = None - - @staticmethod - def is_audio_tensor(name: str): - return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ConformerAudioModel.is_audio_tensor(name): - if ".conv" in name or "_conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # fold running_mean, running_var and eps into weight and bias for batch_norm - if "batch_norm" in name: - if self._batch_norm_tensors is None: - self._batch_norm_tensors = [{} for _ in range(self.block_count)] - assert bid is not None - self._batch_norm_tensors[bid][name] = data_torch - - if len(self._batch_norm_tensors[bid]) < 5: - return - - weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] - bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] - running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] - running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] - eps = 1e-5 # default value - - a = weight / torch.sqrt(running_var + eps) - b = bias - running_mean * a - yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid) - yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid) - return - - # reshape conv weights - if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): - data_torch = data_torch[:, None, None] - if "conv.depthwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[1] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) - if "conv.pointwise_conv" in name and name.endswith(".weight"): - assert data_torch.shape[2] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) - - mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) - yield (mapped_name, data_torch) - - -@ModelBase.register("DeepseekOCRForCausalLM") -class DeepseekOCRVisionModel(MmprojModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR) - # default values below are taken from HF tranformers code - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_vision_use_gelu(True) - # calculate proj_scale_factor (used by tinygemma3 test model) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - n_per_side = int(image_seq_length ** 0.5) - image_size = self.hparams["image_size"] - patch_size = self.hparams["patch_size"] - proj_scale_factor = (image_size // patch_size) // n_per_side - if proj_scale_factor > 0 and proj_scale_factor != 4: - # we only need to write this if it's not the default value - # in this case, we are converting a test model - self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) - # @bluebread: there's no window_size in config but just add it here anyway - self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14)) - - # SAM configuration - sam_hparams = hparams['sam'] - self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers']) - self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width']) - self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads']) - - def get_vision_config(self) -> dict[str, Any]: - vision_config: dict[str, Any] | None = self.global_config.get("vision_config") - - if not vision_config: - raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") - - vision_config['sam'] = vision_config['width']['sam_vit_b'] - vision_config.update(vision_config['width']['clip-l-14-224']) - vision_config['hidden_size'] = vision_config['width'] - vision_config['num_heads'] = vision_config['heads'] - vision_config['intermediate_size'] = vision_config['heads'] * 4 - - return vision_config - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".embeddings." in name or 'pos_embed' in name: - return gguf.GGMLQuantizationType.F32 - if ".rel_pos_h" in name or '.rel_pos_w' in name: - return gguf.GGMLQuantizationType.F32 - if ".neck." in name or ".net_" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Only process vision-related tensors, skip language model tensors - # Vision components: sam_model, vision_model, projector, image_newline, view_seperator - # Language model components to skip: lm_head, embed_tokens, layers, norm - if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")): - return None - - if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("Gemma3nForConditionalGeneration") -class Gemma3nVisionAudioModel(ConformerAudioModel): - has_audio_encoder = True - has_vision_encoder = True - - # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py) - # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py - block_tensor_mapping = { - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight", - "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight", - } - - def __init__(self, *args, **kwargs): - # Parent init will call find_hparam which now returns 0 for empty keys - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it - self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4 - self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8) - - # MobileNetV5 does not use image_mean/std - self.preprocessor_config["image_mean"] = [0.0 ,0.0 , 0.0] - self.preprocessor_config["image_std"] = [1.0 ,1.0 ,1.0] - self.hparams_vision["image_size"] = self.preprocessor_config.get( - "size", {"height": 768, "width": 768} - )["height"] - - # Image sequence length (256 tokens = 16x16 for Gemma3n) - image_seq_length = self.preprocessor_config.get("image_seq_length", 256) - image_size = self.hparams_vision["image_size"] - self.hparams_vision["patch_size"] = image_size // image_seq_length - - # remap audio hparams - assert self.hparams_audio is not None - self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"] - self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"] - self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"] - self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # vision params - self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - - # audio params - assert self.hparams_audio is not None - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # Force quantization settings for specific tensor types - if "input_projection" in name or "input_proj" in name: - return gguf.GGMLQuantizationType.F16 - if ".embeddings." in name or "stem" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def custom_map(self, name: str) -> str: - """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping.""" - parts = name.split(".") - # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix - if len(parts) >= 7: - bid, sid = parts[4], parts[5] - suffix = ".".join(parts[6:]) - template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}" - if template in self.block_tensor_mapping: - return self.block_tensor_mapping[template].format(bid=bid, sid=sid) - - raise ValueError(f"Unknown name: {name}") - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if (ConformerAudioModel.is_audio_tensor(name)): - name = name.replace("model.audio_tower.conformer.", "conformer.layers.") - yield from super().modify_tensors(data_torch, name, bid) - - # Gemma3n uses - # - model.embed_vision.* for projection layers - # - model.vision_tower.* for vision encoder - # Skip non-vision tensors - if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): - return - - if name.startswith("model.vision_tower.timm_model.blocks."): - # Double-indexed block tensors through custom logic - yield (self.custom_map(name), data_torch) - return - else: - # Route non-repeating (conv_stem, msfa, embedding, etc.) and un-catched through tensor_mapping.py - new_name = self.map_tensor_name(name) - - if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): - data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] - - yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) - - -@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") -class Gemma3NModel(Gemma3Model): - model_arch = gguf.MODEL_ARCH.GEMMA3N - - _altup_proj: list[Tensor] = [] - _altup_unembd: list[Tensor] = [] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs" - self._altup_proj = [ - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - ] - self._altup_unembd = [ - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - torch.Tensor(), # to be replaced - ] - - def norm_shift(self, name: str) -> float: - del name - return 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code - - def set_vocab(self): - # For Gemma3n multimodal models, we need the FULL vocab_size (262400) - # which includes special tokens from 262144-262399 for vision/audio. - # The vocab_size_per_layer_input (262144) is only the embedding size per layer. - # Temporarily override the hparams lookup order to prioritize vocab_size. - - # Store original vocab_size_per_layer_input if it exists - vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input") - - # Temporarily remove vocab_size_per_layer_input to force using vocab_size - if vocab_size_per_layer_input is not None: - del self.hparams["vocab_size_per_layer_input"] - - # Call parent set_vocab which will now use vocab_size (262400) - super().set_vocab() - - # Restore vocab_size_per_layer_input for later use - if vocab_size_per_layer_input is not None: - self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) - self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"]) - self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"]) - self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"]) - - activation_sparsity_scale = [] - for s in self.hparams["activation_sparsity_pattern"]: - normal_dist = torch.distributions.normal.Normal(0, 1) - std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32)) - activation_sparsity_scale.append(std_multiplier.item()) - self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale) - - sliding_window_pattern = [] - for t in self.hparams["layer_types"]: - sliding_window_pattern.append(t == "sliding_attention") - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - - def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None: - has_all = all(m.numel() > 0 for m in matrices) - if not has_all: - return None - else: - return torch.stack(matrices, dim=0) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith("_scale"): - name = name + ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # TODO: implement self.prediction_coefs.weight.clamp_(...) - - # Pad token embeddings for vision/audio special tokens (262144-262399) - if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: - # Move to CPU to avoid meta device issues during padding - data_torch = data_torch.to(device="cpu") - - vocab_size = self.hparams.get("vocab_size", 262400) - current_size = data_torch.shape[0] # First dimension is vocab_size - - if current_size < vocab_size: - # Pad with zeros for vision/audio tokens (they get embeddings from vision tower) - padding_size = vocab_size - current_size - tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings" - logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)") - - # Create padding with zeros (vision tokens won't use these embeddings) - padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device) - data_torch = torch.cat([data_torch, padding], dim=0) - - # Continue with normal processing - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - return - - if "altup_unembed_projections" in name: - data_torch = data_torch.to(device="cpu") - # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based - # They should NOT be padded - if ".0." in name: - self._altup_unembd[0] = data_torch - elif ".1." in name: - self._altup_unembd[1] = data_torch - elif ".2." in name: - self._altup_unembd[2] = data_torch - else: - raise ValueError(f"Unknown name: {name}") - out = self._stack_matrices(self._altup_unembd) - if out is not None: - yield from ModelBase.modify_tensors(self, out, "model.altup_unembed_projections.weight", bid) - return - else: - return - - if "altup_projections" in name: - data_torch = data_torch.to(device="cpu") - if ".0." in name: - self._altup_proj[0] = data_torch - elif ".1." in name: - self._altup_proj[1] = data_torch - elif ".2." in name: - self._altup_proj[2] = data_torch - else: - raise ValueError(f"Unknown name: {name}") - out = self._stack_matrices(self._altup_proj) - if out is not None: - yield from ModelBase.modify_tensors(self, out, "model.altup_projections.weight", bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma4ForConditionalGeneration") -class Gemma4Model(Gemma3Model): - model_arch = gguf.MODEL_ARCH.GEMMA4 - - def norm_shift(self, name: str) -> float: - del name # unused - return 0.0 - - def set_vocab(self): - vocab = gguf.LlamaHfVocab(self.dir_model) - tokens = [] - scores = [] - toktypes = [] - visible_tokens = {"<|channel>", "<channel|>", "<|tool_call>", "<tool_call|>", "<|tool_response>", "<tool_response|>", "<|\"|>"} - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - text_str = text.decode() - if text_str in visible_tokens: - # always render these tokens, so that the chat parser can read them - toktypes.append(gguf.TokenType.USER_DEFINED) - logger.info(f"Token '{text_str}' is set to USER_DEFINED") - else: - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size - - self.gguf_writer.add_tokenizer_model("gemma4") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_space_prefix(False) - self.gguf_writer.add_add_bos_token(True) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - num_kv_shared_layers = self.hparams["num_kv_shared_layers"] - self.gguf_writer.add_shared_kv_layers(num_kv_shared_layers) - - # per-layer embedding is optional - n_pl_embd = self.hparams.get("hidden_size_per_layer_input") or 0 - self.gguf_writer.add_embedding_length_per_layer_input(n_pl_embd) - - swa_layers = [t == "sliding_attention" for t in self.hparams["layer_types"]] - self.gguf_writer.add_sliding_window_pattern(swa_layers) - - head_dim_full = self.hparams["global_head_dim"] - head_dim_swa = self.hparams["head_dim"] - # correct the head dim for global/swa layers - self.gguf_writer.add_key_length(head_dim_full) - self.gguf_writer.add_value_length(head_dim_full) - self.gguf_writer.add_key_length_swa(head_dim_swa) - self.gguf_writer.add_value_length_swa(head_dim_swa) - - expert_intermediate_size = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"]) - if expert_intermediate_size is not None: - self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) - - # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers - use_double_wide_mlp = self.hparams.get("use_double_wide_mlp", False) - first_kv_shared_layer_idx = self.block_count - num_kv_shared_layers - if use_double_wide_mlp: - n_ff = self.hparams["intermediate_size"] - n_ff_arr = [n_ff if il < first_kv_shared_layer_idx else n_ff * 2 for il in range(self.block_count)] - self.gguf_writer.add_feed_forward_length(n_ff_arr) - - # handle num_global_key_value_heads - num_key_value_heads_full = self.hparams.get("num_global_key_value_heads") - num_key_value_heads_swa = self.hparams.get("num_key_value_heads") - if num_key_value_heads_full is not None and num_key_value_heads_swa is not None: - value_arr = [num_key_value_heads_swa if is_swa else num_key_value_heads_full for is_swa in swa_layers] - self.gguf_writer.add_head_count_kv(value_arr) - - # handle n_rot differently for global vs swa layers - partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0) - n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors - n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa) - self.gguf_writer.add_rope_dimension_count(n_rot_full) - self.gguf_writer.add_rope_dimension_count_swa(n_rot_swa) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # full layer uses "proportional" rope with partial_rotary_factor=0.25 - # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated), - # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention - # solution is to set specific freq_factors for the unrotated dims - - # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers - rope_params_full = self.hparams["rope_parameters"]["full_attention"] - assert rope_params_full["rope_type"] == "proportional" - head_dim_full = (self.hparams["global_head_dim"]) - partial_rotary_factor_full = rope_params_full["partial_rotary_factor"] - n_rot_full = int(head_dim_full * partial_rotary_factor_full / 2) - n_unrot_full = int(head_dim_full / 2) - n_rot_full - values = [1.0] * n_rot_full + [1e30] * n_unrot_full - rope_freqs_full = torch.tensor(values, dtype=torch.float32) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), rope_freqs_full) - - def _generate_nvfp4_tensors(self): - # Gemma-4 stores a per-layer router.per_expert_scale ([n_expert]) that scales - # each expert's contribution. It's mathematically equivalent to a per-expert - # scalar on the down_proj output, which is exactly where ffn_down_exps_s is - # applied at inference. Fold it into each expert's NVFP4 weight_scale_2 so the - # existing NVFP4 path produces the right scales. - n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0 - for name in [n for n in self.model_tensors if n.endswith(".router.per_expert_scale")]: - bid_match = re.search(r"\.layers\.(\d+)\.", name) - if bid_match is None: - continue - bid = bid_match.group(1) - prefix = name[: name.index(f".layers.{bid}.") + len(f".layers.{bid}.")] - w2_targets = [f"{prefix}experts.{e}.down_proj.weight_scale_2" for e in range(n_experts)] - present = [w2 in self.model_tensors for w2 in w2_targets] - if not any(present): - continue - assert all(present), f"layer {bid}: partial NVFP4 quantization across experts" - r = self.model_tensors.pop(name) - for e, w2 in enumerate(w2_targets): - s = self.model_tensors[w2] - self.model_tensors[w2] = lambda s=s, r=r, i=e: s() * r()[i] - super()._generate_nvfp4_tensors() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith("per_dim_scale") or name.endswith("layer_scalar"): - name = name + ".weight" - if ".experts." in name and not name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith("router.scale"): - name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale") - yield (name, data_torch) - return - if ".per_expert_scale" in name: - # convert per-expert scale to FFN down scale - name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale") - yield (name, data_torch) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Gemma4ForConditionalGeneration") -class Gemma4VisionAudioModel(MmprojModel): - has_audio_encoder = True - has_vision_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = 224 # unused, but set to avoid error - - # remap audio hparams - if self.hparams_audio: - self.hparams_audio["feat_in"] = self.hparams_audio.get("input_feat_size", 128) - self.hparams_audio["intermediate_size"] = self.hparams_audio["hidden_size"] * 4 - else: - self.has_audio_encoder = False - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # vision params - self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - - # audio params - if self.hparams_audio: - self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - - def is_audio_tensor(self, name: str) -> bool: - return "audio_tower" in name or "embed_audio" in name - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if self.is_audio_tensor(name): - if ".conv" in name or "_conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - if "position_embedding_table" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - if len(data_torch.shape) == 0: - # convert scalar tensors (input/output_mix/max) to 1D tensors - data_torch = data_torch.unsqueeze(0) - - if self.is_audio_tensor(name): - assert self.hparams_audio is not None - name = name.replace("model.audio_tower.", "conformer.") - name = name.replace(".linear.", ".") - if name.endswith("per_dim_key_scale") or name.endswith("per_dim_scale"): - name = name + ".weight" - data_torch = torch.nn.functional.softplus(data_torch) - if "lconv1d.depthwise_conv1d" in name and name.endswith(".weight"): - assert data_torch.shape[1] == 1 - data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) - mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) - yield (mapped_name, data_torch) - - else: - name = name.replace("model.vision_tower.encoder.", "vision_model.model.") - name = name.replace(".linear.weight", ".weight") - if name.endswith("layer_scalar") or name.endswith("position_embedding_table"): - name = name + ".weight" - if name.endswith("patch_embedder.input_proj.weight"): - n_embd, ksize_sq_c = data_torch.shape - patch_size = int((ksize_sq_c // 3) ** 0.5) - data_torch = data_torch.reshape(n_embd, patch_size, patch_size, 3) - data_torch = data_torch.permute(0, 3, 1, 2).contiguous() - mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min")) - yield (mapped_name, data_torch) - - -@ModelBase.register("Starcoder2ForCausalLM") -class StarCoder2Model(TextModel): - model_arch = gguf.MODEL_ARCH.STARCODER2 - - -@ModelBase.register("Rwkv6ForCausalLM") -class Rwkv6Model(TextModel): - model_arch = gguf.MODEL_ARCH.RWKV6 - - def set_vocab(self): - self._set_vocab_rwkv_world() - - def set_gguf_parameters(self): - head_size = self.hparams["head_size"] - hidden_size = self.hparams["hidden_size"] - layer_norm_eps = self.hparams["layer_norm_epsilon"] - rescale_every_n_layers = self.hparams["rescale_every"] - intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32) - time_mix_extra_dim = 64 if hidden_size == 4096 else 32 - time_decay_extra_dim = 128 if hidden_size == 4096 else 64 - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_eps(layer_norm_eps) - self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - lerp_weights: dict[int, dict[str, Tensor]] = {} - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if not (new_name.endswith(".weight") or new_name.endswith(".bias")): - new_name += ".weight" - - if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"): - data_torch = data_torch.transpose(0, 1) - - if new_name.endswith("time_mix_w2.weight"): - data_torch = data_torch.permute(0, 2, 1) - - if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: - data_torch = data_torch.squeeze() - - try: - rescale_every_n_layers = self.hparams["rescale_every"] - if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): - data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) - except KeyError: - pass - - # concat time_mix_lerp weights to reduce some cpu overhead - # also reduces the number of tensors in the model - if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: - try: - self.lerp_weights[bid][new_name] = data_torch - except KeyError: - self.lerp_weights[bid] = {new_name: data_torch} - if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) - yield (new_name, data) - return - - yield (new_name, data_torch) - - -@ModelBase.register("RWKV6Qwen2ForCausalLM") -class RWKV6Qwen2Model(Rwkv6Model): - model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - num_attention_heads = self.hparams["num_attention_heads"] - num_key_value_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] - head_size = hidden_size // num_attention_heads - rms_norm_eps = self.hparams["rms_norm_eps"] - intermediate_size = self.hparams["intermediate_size"] - time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32) - time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64) - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) - self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # special parameters for time_mixing in RWKV6QWEN2 - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_token_shift_count(1) - # RWKV6QWEN2 use grouped key/value like GQA - self.gguf_writer.add_head_count_kv(num_key_value_heads) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - for new_name, data in super().modify_tensors(data_torch, name, bid): - if "time_mix_w1" in new_name or "time_mix_w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg - # permute them here to avoid code changes - data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) - if "w2" in new_name: - data = data.view(5, -1, data.shape[-1]) - yield (new_name, data) - continue - yield (new_name, data) - - -@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") -class Rwkv7Model(TextModel): - model_arch = gguf.MODEL_ARCH.RWKV7 - - def set_vocab(self): - self._set_vocab_rwkv_world() - - def calc_lora_rank(self, hidden_size, exponent, multiplier): - return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 - - def set_gguf_parameters(self): - try: - head_size = self.hparams["head_size"] - layer_norm_eps = self.hparams["layer_norm_epsilon"] - except KeyError: - head_size = self.hparams["head_dim"] - layer_norm_eps = self.hparams["norm_eps"] - hidden_size = self.hparams["hidden_size"] - intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) - - # ICLR: In-Context-Learning-Rate - try: - lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) - lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) - except KeyError: - lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) - lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) - lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_eps(layer_norm_eps) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_decay_lora_rank(lora_rank_decay) - self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) - self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) - self.gguf_writer.add_gate_lora_rank(lora_rank_gate) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - lerp_weights: dict[int, dict[str, Tensor]] = {} - lora_needs_transpose: bool = True - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # unify tensor names here to make life easier - name = name.replace("blocks", "layers").replace("ffn", "feed_forward") - name = name.replace("self_attn", "attention").replace("attn", "attention") - name = name.replace("time_mixer.", "") - - name = name.replace("feed_forward_norm", "ln2") - name = name.replace("g_norm", "ln_x") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # lora layer names in fla-hub's impl - if "_lora.lora" in name: - self.lora_needs_transpose = False - name = name.replace("_lora.lora.0.weight", "1.weight") - name = name.replace("_lora.lora.2.weight", "2.weight") - name = name.replace("_lora.lora.2.bias", "0.weight") - - if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: - # some models have dummy v0/v1/v2 on first layer while others don't - # ignore them all since they are not used - return - - wkv_has_gate = self.hparams.get("wkv_has_gate", True) - lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] - - if bid is not None and "attention.x_" in name: - if "attention.x_x" in name: - # already concatenated - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = data_torch.reshape(len(lerp_list), 1, 1, -1) - yield (new_name, data) - else: - try: - self.lerp_weights[bid][name] = data_torch - except KeyError: - self.lerp_weights[bid] = {name: data_torch} - if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): - new_name = f"blk.{bid}.time_mix_lerp_fused.weight" - data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) - yield (new_name, data) - return - else: - data_torch = data_torch.squeeze() - new_name = self.map_tensor_name(name) - - if not (new_name.endswith(".weight") or new_name.endswith(".bias")): - new_name += ".weight" - - if self.lora_needs_transpose and any( - new_name.endswith(t) for t in [ - "time_mix_w1.weight", "time_mix_w2.weight", - "time_mix_a1.weight", "time_mix_a2.weight", - "time_mix_v1.weight", "time_mix_v2.weight", - "time_mix_g1.weight", "time_mix_g2.weight", - ] - ): - data_torch = data_torch.transpose(0, 1) - - if 'r_k' in new_name: - data_torch = data_torch.flatten() - - if bid == 0 and "time_mix_a" in new_name: - # dummy v0/v1/v2 on first layer - # easiest way to make llama happy - yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) - - yield (new_name, data_torch) - - -@ModelBase.register("RwkvHybridForCausalLM") -class ARwkv7Model(Rwkv7Model): - model_arch = gguf.MODEL_ARCH.ARWKV7 - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - hidden_size = self.hparams["hidden_size"] - head_size = self.hparams["head_size"] - rms_norm_eps = self.hparams["rms_norm_eps"] - intermediate_size = self.hparams["intermediate_size"] - wkv_has_gate = self.hparams["wkv_has_gate"] - assert self.hparams["wkv_version"] == 7 - - # ICLR: In-Context-Learning-Rate - lora_rank_decay = 64 - lora_rank_iclr = 64 - lora_rank_value_residual_mix = 32 - lora_rank_gate = 128 if wkv_has_gate else 0 - - # RWKV isn't context limited - self.gguf_writer.add_context_length(1048576) - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_wkv_head_size(head_size) - self.gguf_writer.add_decay_lora_rank(lora_rank_decay) - self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) - self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) - self.gguf_writer.add_gate_lora_rank(lora_rank_gate) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_token_shift_count(1) - - # required by llama.cpp, unused - self.gguf_writer.add_head_count(0) - - -@ModelBase.register("MaincoderForCausalLM") -class MaincoderModel(TextModel): - model_arch = gguf.MODEL_ARCH.MAINCODER - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_rope_dimension_count(head_dim) - - -@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(TextModel): - model_arch = gguf.MODEL_ARCH.MAMBA - - def __init__(self, dir_model: Path, *args, **kwargs): - # Avoid using AutoConfig for hparams - hparams = kwargs.pop("hparams", None) - if hparams is None: - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - super().__init__(dir_model, *args, hparams=hparams, **kwargs) - - def set_vocab(self): - vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 8 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - elif (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present - self._set_vocab_builtin("gpt-neox", vocab_size) - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - use_dt_b_c_norm = False - # For falconmamba we do apply RMS norm on B / DT and C layers - if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): - use_dt_b_c_norm = True - # Fail early for models which don't have a block expansion factor of 2 - assert d_inner == 2 * d_model - - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers - self.gguf_writer.add_file_type(self.ftype) - - _tok_embd = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - - new_name = self.map_tensor_name(name) - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - # [4 1 8192 1] -> [4 8192 1 1] - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): - data_torch = data_torch.squeeze() - - # assuming token_embd.weight is seen before output.weight - if self._tok_embd is not None and new_name == output_name: - if torch.equal(self._tok_embd, data_torch): - logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") - return - elif new_name == tok_embd_name: - self._tok_embd = data_torch - - yield from super().modify_tensors(data_torch, new_name, bid) - - -@ModelBase.register("Mamba2ForCausalLM") -class Mamba2Model(TextModel): - model_arch = gguf.MODEL_ARCH.MAMBA2 - - def __init__(self, dir_model: Path, *args, **kwargs): - # Avoid using AutoConfig for hparams - # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1 - hparams = kwargs.pop("hparams", None) - if hparams is None: - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - if "llm_config" in hparams: - hparams["text_config"] = hparams["llm_config"] - super().__init__(dir_model, *args, hparams=hparams, **kwargs) - self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) - self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model - self.n_group = self.find_hparam(["n_groups"], optional=True) or 1 - - def set_vocab(self): - vocab_size = self.hparams["vocab_size"] - # Round vocab size to next multiple of 16 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) - # pad using ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - if (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - elif (self.dir_model / "tokenizer.model.v3").is_file(): - # mamba-codestral - raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}") - elif (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() - else: - # Use the GPT-NeoX tokenizer when no tokenizer files are present - self._set_vocab_builtin("gpt-neox", vocab_size) - - def set_gguf_parameters(self): - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128 - head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64 - - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - - # Fail early for models which don't have a block expansion factor of 2 - # TODO: does this really matter? - # skip the assertion for FalconH1 Model - if self.model_arch != gguf.MODEL_ARCH.FALCON_H1: - assert self.d_inner == 2 * self.d_model - assert self.d_inner % head_dim == 0 - - self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default - self.gguf_writer.add_embedding_length(self.d_model) - self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(self.d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim) - self.gguf_writer.add_ssm_group_count(self.n_group) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_file_type(self.ftype) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("model.backbone", "model.lm_head")): - # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2 - name = name.removeprefix("model.") - - if name.endswith(".dt_bias"): - name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - new_name = self.map_tensor_name(name) - - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): - data_torch = data_torch.squeeze() - elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [ - gguf.MODEL_TENSOR.SSM_A, - gguf.MODEL_TENSOR.SSM_D, - ]): - # unsqueeze A to use similar shape semantics as Mamba-1 - # (D is also unsqueezed, but for more straightforward broadcast internally) - data_torch = data_torch.reshape((*data_torch.shape, 1)) - elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid): - data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group)) - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - yield (new_name, data_torch) - - -@ModelBase.register("JambaForCausalLM") -class JambaModel(TextModel): - model_arch = gguf.MODEL_ARCH.JAMBA - - def set_vocab(self): - if (self.dir_model / "tokenizer.model").is_file(): - self._set_vocab_sentencepiece() - else: - self._set_vocab_llama_hf() - self.gguf_writer.add_add_space_prefix(False) - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) - d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 - d_inner = self.hparams["mamba_expand"] * d_model - d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16) - rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6 - n_kv_head = self.hparams["num_key_value_heads"] - attn_offset = self.hparams["attn_layer_offset"] - attn_period = self.hparams["attn_layer_period"] - n_kv_vec = [0 for _ in range(attn_offset)] + [ - n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count) - ] - - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"])) - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(n_kv_vec) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"])) - self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"])) - self.gguf_writer.add_file_type(self.ftype) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - # Mini-Jamba - name = name.replace(".moe.", ".feed_forward.") - if bid is not None: - moe_offset = self.hparams["expert_layer_offset"] - moe_period = self.hparams["expert_layer_period"] - - if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0): - name = name.replace(".experts.0.", ".") - - # process the experts separately - if ".feed_forward.experts." in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - - # merge the experts into a single 3d tensor - for wid in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - # using the same merged name as qwen2moe - merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight" - - new_name = self.map_tensor_name(merged_name) - - yield new_name, data_torch - return - - new_name = self.map_tensor_name(name) - - if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid): - data_torch = data_torch.squeeze() - - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) - - yield (new_name, data_torch) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("CohereForCausalLM") -class CommandR2Model(TextModel): - model_arch = gguf.MODEL_ARCH.COMMAND_R - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # max_position_embeddings = 8192 in config.json but model was actually - # trained on 128k context length - # aya-23 models don't have model_max_length specified - self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - -@ModelBase.register("Cohere2ForCausalLM") -class Cohere2Model(TextModel): - model_arch = gguf.MODEL_ARCH.COHERE2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - rotary_pct = self.hparams["rotary_pct"] - hidden_size = self.hparams["hidden_size"] - num_attention_heads = self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Cohere2 runtime in llama.cpp expects no bias tensors; - # the actual weight only contains 0-value tensors as bias, we can skip them - if name.endswith(".bias"): - if torch.any(data_torch != 0): - raise ValueError(f"Bias tensor {name!r} is not zero.") - logger.debug(f"Skipping bias tensor {name!r} for Cohere2 conversion.") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("OlmoForCausalLM") -@ModelBase.register("OLMoForCausalLM") -class OlmoModel(TextModel): - model_arch = gguf.MODEL_ARCH.OLMO - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_eps(1e-5) - clip_qkv = self.hparams.get("clip_qkv") - if clip_qkv is not None: - self.gguf_writer.add_clamp_kqv(clip_qkv) - - # Same as super class, but permuting q_proj, k_proj - # Copied from: LlamaModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("SeedOssForCausalLM") -class SeedOssModel(TextModel): - model_arch = gguf.MODEL_ARCH.SEED_OSS - - -@ModelBase.register("Olmo2ForCausalLM") -@ModelBase.register("Olmo3ForCausalLM") -class Olmo2Model(TextModel): - model_arch = gguf.MODEL_ARCH.OLMO2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - if "sliding_window" in self.hparams: - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - - sliding_window_pattern = [] - if "layer_types" in self.hparams: - sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]] - else: - # Olmo2 does not use sliding window attention. - # Olmo3 defaults to using sliding window for all layers except every 4th. - for i in range(self.hparams["num_hidden_layers"]): - sliding_window_pattern.append((i + 1) % 4 != 0) - - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - - -@ModelBase.register("OlmoeForCausalLM") -class OlmoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.OLMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_layer_norm_rms_eps(1e-5) - - _experts: list[dict[str, Tensor]] | None = None - - # Copied from: Qwen2MoeModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - # Copied from: Qwen2MoeModel - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") -class JinaBertV2Model(BertModel): - model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - - def set_vocab(self): - tokenizer_class = 'BertTokenizer' - with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_class = json.load(f)['tokenizer_class'] - - if tokenizer_class == 'BertTokenizer': - super().set_vocab() - elif tokenizer_class == 'RobertaTokenizer': - self._set_vocab_gpt2() - self.gguf_writer.add_token_type_count(2) - else: - raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') - - -@ModelBase.register("OpenELMForCausalLM") -class OpenELMModel(TextModel): - model_arch = gguf.MODEL_ARCH.OPENELM - - @staticmethod - def _make_divisible(v: float | int, divisor: int) -> int: - # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 - new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - ffn_multipliers: list[float] = self.hparams["ffn_multipliers"] - ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"] - self._n_embd: int = self.hparams["model_dim"] - self._num_kv_heads: list[int] = self.hparams["num_kv_heads"] - self._num_query_heads: list[int] = self.hparams["num_query_heads"] - self._ffn_dims: list[int] = [ - OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor) - for multiplier in ffn_multipliers - ] - assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) - assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int) - - # Uses the tokenizer from meta-llama/Llama-2-7b-hf - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"]) - - def set_gguf_parameters(self): - n_embd = self._n_embd - head_dim = self.hparams["head_dim"] - rot_pct = 1.0 - assert self.block_count == len(self._num_kv_heads) - assert self.block_count == len(self._num_query_heads) - assert self.block_count == len(self._ffn_dims) - - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["max_context_length"]) - self.gguf_writer.add_embedding_length(n_embd) - self.gguf_writer.add_feed_forward_length(self._ffn_dims) - self.gguf_writer.add_head_count(self._num_query_heads) - self.gguf_writer.add_head_count_kv(self._num_kv_heads) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 - self.gguf_writer.add_layer_norm_rms_eps(1e-6) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - self.gguf_writer.add_file_type(self.ftype) - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - if "n_layers" in keys: - return self.hparams["num_transformer_layers"] - - return super().find_hparam(keys, optional) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - - # split ff - if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": - ff_dim = self._ffn_dims[bid] - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim]) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:]) - return - - yield (self.map_tensor_name(name), data_torch) - - -@ModelBase.register("ArcticForCausalLM") -class ArcticModel(TextModel): - model_arch = gguf.MODEL_ARCH.ARCTIC - - def set_vocab(self): - # The reason for using a custom implementation here is that the - # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from - # tokenizer.model and used them as BOS and EOS instead of adding new tokens. - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - if not tokenizer_path.is_file(): - logger.error(f'Error: Missing {tokenizer_path}') - sys.exit(1) - - # Read the whole vocabulary from the tokenizer.model file - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - # Use the added_tokens_decoder field from tokeniser_config.json as the source - # of information about added/redefined tokens and modify them accordingly. - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - - if "added_tokens_decoder" in tokenizer_config_json: - added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] - for token_id, token_json in added_tokens_decoder.items(): - token_id = int(token_id) - if token_id >= vocab_size: - logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - token_content = token_json["content"] - token_type = SentencePieceTokenTypes.USER_DEFINED - token_score = -10000.0 - - # Map unk_token to UNKNOWN, other special tokens to CONTROL - # Set the score to 0.0 as in the original tokenizer.model - if ("special" in token_json) and token_json["special"]: - if token_content == tokenizer_config_json["unk_token"]: - token_type = SentencePieceTokenTypes.UNKNOWN - else: - token_type = SentencePieceTokenTypes.CONTROL - token_score = 0.0 - - logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") - tokens[token_id] = token_content.encode("utf-8") - toktypes[token_id] = token_type - scores[token_id] = token_score - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith("q_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith("k_proj.weight"): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("block_sparse_moe.experts") != -1: - n_experts = self.hparams["num_local_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for wid in ["w1", "w2", "w3"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("DeepseekForCausalLM") -class DeepseekModel(TextModel): - model_arch = gguf.MODEL_ARCH.DEEPSEEK - - def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_weights_scale(1.0) - self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) - - _experts: list[dict[str, Tensor]] | None = None - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) - - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register( - "DeepseekV2ForCausalLM", - "DeepseekV3ForCausalLM", - "KimiVLForConditionalGeneration", - "KimiK25ForConditionalGeneration", - "YoutuForCausalLM", - "YoutuVLForConditionalGeneration", -) -class DeepseekV2Model(TextModel): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - - # TODO @ngxson : remove this when we support MTP for deepseek models - skip_mtp = True - - merge_expert = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) - self.origin_hf_arch = hparams.get('architectures', [None])[0] - - # special handling for Deepseek OCR - if self.origin_hf_arch == "DeepseekOCRForCausalLM": - self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR - self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] - self.gguf_writer.add_architecture() - # default jinja template - self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}") - - def set_vocab(self): - try: - self._set_vocab_gpt2() - return - except Exception: - pass - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - tokpre = self.get_vocab_base_pre(tokenizer) - - if tokpre == "kimi-k2": - # Build merges list using the approach similar to HunYuanMoE - merges = [] - vocab = {} - mergeable_ranks = tokenizer.model._mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # Build token list - vocab_size = self.hparams["vocab_size"] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - else: - raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") - - def set_gguf_parameters(self): - is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR) - - if is_ocr: - self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0) - else: - # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) - self.hparams["num_key_value_heads"] = 1 - - self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6) - - super().set_gguf_parameters() - hparams = self.hparams - - # first_k_dense_replace: number of leading layers using dense FFN instead of MoE - # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers - # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers - has_moe = hparams.get("n_routed_experts") is not None - first_k_dense_replace = hparams.get("first_k_dense_replace") - if first_k_dense_replace is None: - # Default: if no MoE, all layers are dense; if MoE, none are dense - first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0 - self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - kv_lora_rank = hparams.get("kv_lora_rank", 512) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: - self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - - # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - if not is_ocr: - self.gguf_writer.add_kv_lora_rank(kv_lora_rank) - self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(kv_lora_rank) - self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) - - # MoE parameters (required by C++ code for DEEPSEEK2 arch) - # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length - moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False) - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - - if (n_routed_experts := hparams.get("n_routed_experts")) is not None: - self.gguf_writer.add_expert_count(n_routed_experts) - - # expert_shared_count is required by C++ code, default to 0 for non-MoE models - n_shared_experts = hparams.get("n_shared_experts", 0) - self.gguf_writer.add_expert_shared_count(n_shared_experts) - - # When not set, C++ code will use scale_w = false to skip the no-op scaling - if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None: - self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) - - if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob: - self.gguf_writer.add_expert_weights_norm(norm_topk_prob) - - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: - # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] - # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul - # ref https://github.com/ggml-org/llama.cpp/pull/17945 - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip lm_head.weight if tie_word_embeddings is True - if self.hparams.get("tie_word_embeddings", False): - if name == "lm_head.weight" or name == "model.lm_head.weight": - logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)") - return - - # skip Multi-Token Prediction (MTP) layers - if self.skip_mtp: - block_count = self.hparams["num_hidden_layers"] - match = re.match(r"model.layers.(\d+)", name) - if match and int(match.group(1)) >= block_count: - return - - # process the experts separately - if self.merge_expert and name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed - if name.endswith("kv_b_proj.weight"): - name_kb = name.replace("kv_b_proj", "k_b_proj") - name_vb = name.replace("kv_b_proj", "v_b_proj") - - n_head_kv = self.hparams["num_key_value_heads"] - v_head_dim = self.hparams["v_head_dim"] - qk_nope_head_dim = self.hparams["qk_nope_head_dim"] - - assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) - - kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) - k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) - k_b = k_b.transpose(1, 2) - - yield from super().modify_tensors(k_b, name_kb, bid) - yield from super().modify_tensors(v_b, name_vb, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register( - "Mistral3ForConditionalGeneration", - "Ministral3ForCausalLM", -) -class Mistral3Model(TextModel): - class Ministral3Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.MISTRAL3 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - rope_params = self.rope_parameters - if self.hparams.get("model_type") == "ministral3": - assert rope_params, "ministral3 must have 'rope_parameters' config" - assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" - self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) - self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) - - class Mistral4Model(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.MISTRAL4 - skip_mtp = False # model contains no MTP layers, so no need to skip - merge_expert = False # experts are already stacked as 3D - - def modify_tensors(self, data_torch, name, bid): - if name.endswith(".down_proj") or name.endswith(".gate_up_proj"): - name = name + ".weight" - yield from super().modify_tensors(data_torch, name, bid) - - model_arch = gguf.MODEL_ARCH.MISTRAL3 # unused - impl: TextModel - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if self.hparams.get("model_type") == "mistral4": - self.impl = Mistral3Model.Mistral4Model(*args, **kwargs) - else: - self.impl = Mistral3Model.Ministral3Model(*args, **kwargs) - - def set_vocab(self): - self.impl.set_vocab() - - def set_gguf_parameters(self): - self.impl.set_gguf_parameters() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - yield from self.impl.modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - self.impl.prepare_tensors() - - def write_vocab(self): - self.impl.write_vocab() - - def write(self): - self.impl.write() - - -@ModelBase.register("MiniMaxM2ForCausalLM") -class MiniMaxM2Model(TextModel): - model_arch = gguf.MODEL_ARCH.MINIMAXM2 - _experts_cache: dict[int, dict[str, Tensor]] = {} - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"])) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - # merge expert weights - if 'experts' in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - expert_cache = self._experts_cache.setdefault(bid, {}) - expert_cache[name] = data_torch - expert_weights = ["w1", "w2", "w3"] - - # not enough expert weights to merge - if len(expert_cache) < n_experts * len(expert_weights): - return - - for w_name in expert_weights: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(expert_cache[ename]) - del expert_cache[ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - yield from super().modify_tensors(data_torch, new_name, bid) - - del self._experts_cache[bid] - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("MiMoV2FlashForCausalLM", "MiMoV2ForCausalLM") -class MimoV2Model(TextModel): - model_arch = gguf.MODEL_ARCH.MIMO2 - - # MiMo V2-Flash, V2.5 and V2.5-Pro all ship 3 trained MTP layers under model.mtp.layers.{0,1,2}. - # The HF config does not expose the count, so it's hardcoded to match the count found in the safetensors. - _n_nextn = 3 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.block_count = self.hparams["num_hidden_layers"] + self._n_nextn - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - @staticmethod - def _tp_aware_qkv_dequant(weight: Tensor, scale_inv: Tensor, - n_q: int, n_kv: int, hd: int, vhd: int, - bs: int = 128) -> Tensor: - # MiMo-V2.5 (TP=4) and V2.5-Pro (TP=8) ship qkv_proj sharded across TP - # ranks; per rank, rows are stacked as [Q_per | K_per | V_per]. - # weight_scale_inv has ceil(rows_per_rank/bs) block-rows per rank (last - # may extend past rows_per_rank with phantom rows not in the weight). - # Naive repeat_interleave aligns rank 0 only and mis-applies scales to - # later ranks once rows_per_rank isn't a multiple of bs. - # Re-group the per-rank [Q_per|K_per|V_per] rows into a single fused - # [Q | K | V] tensor matching the un-sharded original layout. - q_size = n_q * hd - k_size = n_kv * hd - v_size = n_kv * vhd - total_rows = q_size + k_size + v_size - if weight.shape[0] != total_rows: - raise ValueError(f"qkv_proj weight rows {weight.shape[0]} != q+k+v {total_rows}") - - # detect TP from scale_inv block count, descending order so larger matches first - tp = None - for cand in (8, 4): - if total_rows % cand != 0: - continue - rpr = total_rows // cand - bpr = (rpr + bs - 1) // bs - if scale_inv.shape[0] == cand * bpr: - tp = cand - break - if tp is None: - raise ValueError( - f"qkv_proj: cannot detect TP - scale_inv rows {scale_inv.shape[0]}, " - f"q+k+v {total_rows}") - - q_per = q_size // tp - k_per = k_size // tp - v_per = v_size // tp - rows_per_rank = q_per + k_per + v_per - blocks_per_rank = (rows_per_rank + bs - 1) // bs - - scale_inv = scale_inv.float() - # per-row scale-row index: rank * blocks_per_rank + (rr_in_rank // bs) - row_idx = torch.arange(total_rows) - rr = row_idx % rows_per_rank - rank = row_idx // rows_per_rank - scale_row_idx = rank * blocks_per_rank + (rr // bs) - # gather: (total_rows, n_col_blocks) - scale_per_row_block = scale_inv[scale_row_idx] - # expand col-blocks -> cols: each block-col covers `bs` weight cols - scale_full = scale_per_row_block.repeat_interleave(bs, dim=1) - # crop to weight col count (in case last col-block isn't full) - scale_full = scale_full[:, : weight.shape[1]] - dequant = weight.float() * scale_full - - if tp == 1: - return dequant - - # Re-group per-rank [Q_per|K_per|V_per] rows into unified [Q | K | V] - qs, ks, vs = [], [], [] - for r in range(tp): - base = r * rows_per_rank - qs.append(dequant[base : base + q_per]) - ks.append(dequant[base + q_per : base + q_per + k_per]) - vs.append(dequant[base + q_per + k_per : base + rows_per_rank]) - return torch.cat(qs + ks + vs, dim=0) - - def dequant_model(self): - # Capture raw FP8 (weight, scale_inv) lambdas for qkv_proj BEFORE super - # rewrites them with the existing dequant. Replace super's lambda after - # it runs so scale_inv removal still happens via the standard path. - qkv_overrides: dict[str, tuple[Callable, Callable, int]] = {} - qc = self.hparams.get("quantization_config") - if isinstance(qc, dict) and qc.get("quant_method") == "fp8": - pat = re.compile(r"^model\.layers\.(\d+)\.self_attn\.qkv_proj\.weight_scale_inv$") - for name in list(self.model_tensors.keys()): - m = pat.match(name) - if not m: - continue - weight_name = name.removesuffix("_scale_inv") - if weight_name not in self.model_tensors: - continue - qkv_overrides[weight_name] = ( - self.model_tensors[weight_name], - self.model_tensors[name], - int(m.group(1)), - ) - - super().dequant_model() - - if not qkv_overrides: - return - - n_q = self.hparams["num_attention_heads"] - hd = self.hparams["head_dim"] - vhd = self.hparams["v_head_dim"] - hybrid = self.hparams["hybrid_layer_pattern"] - n_layer_text = self.hparams["num_hidden_layers"] - for weight_name, (w_fn, s_fn, bid) in qkv_overrides.items(): - # MTP layers (bid >= n_layer_text) use SWA-style attention dims - is_swa = True if bid >= n_layer_text else hybrid[bid] == 1 - n_kv = self.hparams["swa_num_key_value_heads" if is_swa else "num_key_value_heads"] - self.model_tensors[weight_name] = ( - lambda w_fn=w_fn, s_fn=s_fn, n_q=n_q, n_kv=n_kv, hd=hd, vhd=vhd: - MimoV2Model._tp_aware_qkv_dequant(w_fn(), s_fn(), n_q, n_kv, hd, vhd) - ) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - assert self.hparams["swa_head_dim"] == self.hparams["head_dim"] - assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"] - assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"] - assert self.hparams["topk_method"] == "noaux_tc" - - n_head_kv = self.hparams["num_key_value_heads"] - n_head_kv_swa = self.hparams["swa_num_key_value_heads"] - # Extend the per-layer pattern with SWA entries for the MTP blocks so the - # runtime arrays (sized to extended block_count) are fully populated. - hybrid = list(self.hparams["hybrid_layer_pattern"]) + [1] * self._n_nextn - n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in hybrid] - self.gguf_writer.add_head_count_kv(n_head_kv_arr) - - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_sliding_window_pattern(hybrid) - self.gguf_writer.add_value_length(self.hparams["v_head_dim"]) - self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - - rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"]) - self.gguf_writer.add_rope_dimension_count(rope_dim) - - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5)) - - v_scale = self.hparams.get("attention_value_scale") - if v_scale is not None: - self.gguf_writer.add_attn_value_scale(float(v_scale)) - - self.gguf_writer.add_nextn_predict_layers(self._n_nextn) - - _experts: list[dict[str, Tensor]] | None = None - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "attention_sink" in name and not name.endswith(".weight"): - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch, name, bid): - # Remap MTP/NextN tensors to additional layer slots so the standard tensor map handles them. - # HF: model.mtp.layers.{i}.foo -> model.layers.{n_layer_text + i}.foo - m = re.match(r"^model\.mtp\.layers\.(\d+)\.(.*)$", name) - if m is not None: - mtp_idx = int(m.group(1)) - assert mtp_idx < self._n_nextn, f"MTP layer index {mtp_idx} >= _n_nextn ({self._n_nextn})" - rest = m.group(2) - n_layer_text = self.hparams["num_hidden_layers"] - new_bid = n_layer_text + mtp_idx - name = f"model.layers.{new_bid}.{rest}" - bid = new_bid - - # process the experts separately - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["gate_proj", "up_proj", "down_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename_to_retrieve]) - del self._experts[bid][ename_to_retrieve] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("MiMoV2ForCausalLM") -class MiMoV2VisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - hp = self.hparams_vision - - hp["image_size"] = hp.get("image_size", 560) - hp["num_attention_heads"] = hp.get("num_heads", 32) - hp["num_hidden_layers"] = hp.get("depth", 28) - - self.n_q_heads = int(hp["num_heads"]) - self.num_kv_heads = int(hp.get("num_key_value_heads", 8)) - self.head_dim = int(hp.get("qk_channels", 64)) - self.spatial_merge_size = int(hp["spatial_merge_size"]) - # MiMoV2 vision RMSNorm: HF uses getattr(config, "rms_norm_eps", 1e-6) and the - # field is absent from MiMo-V2.5's vision_config - self.rms_norm_eps = float(hp.get("rms_norm_eps", 1e-6)) - - # fullatt_block_indexes are also reflected in vit_window_attn_types as -1 - self.fullatt_block_indexes = list(hp.get("fullatt_block_indexes") or []) - self.vit_window_attn_types = list(hp.get("vit_window_attn_types") or []) - self.visual_token_window_size = int(hp.get("visual_token_window_size", -1)) - self.use_sink = bool(hp.get("use_sink", False)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MIMOVL) - self.gguf_writer.add_vision_use_silu(True) - self.gguf_writer.add_vision_head_count_kv(self.num_kv_heads) - self.gguf_writer.add_vision_spatial_merge_size(self.spatial_merge_size) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.WINDOW_SIZE, self.visual_token_window_size) - self.gguf_writer.add_vision_wa_pattern_mode(self.vit_window_attn_types) - self.gguf_writer.add_vision_attention_layernorm_eps(self.rms_norm_eps) - self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) - self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # Sinks must be F32: any sink-style softmax/mask add in ggml requires - # F32, and we fold sinks into a host-built F32 mask at encode time. - if new_name.endswith(".attn_sinks"): - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, _ = item - if not name.startswith("visual."): - return None - return super().filter_tensors(item) - - def modify_tensors(self, data_torch, name, bid): - # Conv3D patch embed: split along the temporal axis (kt=2) into two Conv2D - # weights that the existing qwen2vl-style two-Conv2D path consumes. - if name == "visual.patch_embed.proj.weight": - _, _, kt, _, _ = data_torch.shape - if kt != 2: - raise ValueError(f"unexpected temporal_patch_size: {kt}") - embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] - yield (embd_name + ".weight", data_torch[:, :, 0, ...]) - yield (embd_name + ".weight.1", data_torch[:, :, 1, ...]) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Step3p5ForCausalLM") -class Step35Model(TextModel): - model_arch = gguf.MODEL_ARCH.STEP35 - - def set_gguf_parameters(self): - rope_theta = self.hparams.get("rope_theta") - if isinstance(rope_theta, list): - self.hparams["rope_theta"] = float(rope_theta[0]) - self.hparams["local_rope_theta"] = float(rope_theta[1]) - self.rope_parameters["rope_theta"] = self.hparams["rope_theta"] - self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]} - - super().set_gguf_parameters() - - layer_types = self.hparams.get("layer_types") or [] - partial_rotary_factors = self.hparams.get("partial_rotary_factors") or [] - attn_other = self.hparams.get("attention_other_setting") or {} - - n_head_base = self.hparams["num_attention_heads"] - n_kv_base = self.hparams["num_attention_groups"] - - n_head_swa = attn_other.get("num_attention_heads", n_head_base) - n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) - - layer_types = layer_types[: self.block_count] - partial_rotary_factors = partial_rotary_factors[: self.block_count] - assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors - head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] - kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] - swa_pat = [lt == "sliding_attention" for lt in layer_types] - - self.gguf_writer.add_head_count(head_arr) - self.gguf_writer.add_head_count_kv(kv_arr) - - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_sliding_window_pattern(swa_pat) - - self.gguf_writer.add_value_length(self.hparams["head_dim"]) - - # MoE params - self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) - self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"]) - - if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None: - self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor) - if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None: - self.gguf_writer.add_expert_weights_norm(norm_expert_weight) - - # leading dense blocks - leading_dense = 0 - moe_layers_enum = self.hparams.get("moe_layers_enum") - if isinstance(moe_layers_enum, str) and moe_layers_enum.strip(): - moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(",")) - if moe_layers: - leading_dense = max(0, moe_layers[0]) - self.gguf_writer.add_leading_dense_block_count(leading_dense) - self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1))) - - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) - - # Optional per-layer SwiGLU clamps. - if (limits := self.hparams.get("swiglu_limits")) is not None: - limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]] - self.gguf_writer.add_swiglu_clamp_exp(limits_f) - if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None: - limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]] - self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Map router bias (expert selection bias) to a GGUF bias tensor - if name.endswith(".moe.router_bias"): - name += ".bias" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - # remove mtp layers - if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None: - il = int(m.group(1)) - n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) - if il >= n_main: - return - if name.endswith("norm.weight"): - data_torch += 1.0 - - if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): - data_torch = data_torch.squeeze().contiguous() - - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3"). - # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS). - rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) - rope_type = rope_params.get("rope_type") or "" - if rope_type.lower() != "llama3": - return - - # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value. - rope_theta = self.hparams.get("rope_theta", 10000.0) - if isinstance(rope_theta, list): - rope_theta = rope_theta[0] - base = float(rope_theta) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - dim = int(dim) - - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = float(rope_params.get("factor", 8.0)) - low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) - high_freq_factor = float(rope_params.get("high_freq_factor", 4.0)) - old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192))) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - - rope_factors: list[float] = [] - for freq in freqs: - wavelen = 2 * math.pi / float(freq) - if wavelen < high_freq_wavelen: - rope_factors.append(1.0) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@ModelBase.register("PanguEmbeddedForCausalLM") -class PanguEmbeddedModel(TextModel): - model_arch = gguf.MODEL_ARCH.PANGU_EMBED - - def set_vocab(self): - self._set_vocab_sentencepiece() - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - # PanguEmbedded's hparam loaded from config.json without head_dim - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - if hparams.get("head_dim") is None: - self.gguf_writer.add_key_length(rope_dim) - self.gguf_writer.add_value_length(rope_dim) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name == "lm_head.weight": - if self.hparams.get("tie_word_embeddings", False): - logger.info("Skipping tied output layer 'lm_head.weight'") - return - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Dots1ForCausalLM") -class Dots1Model(Qwen2MoeModel): - model_arch = gguf.MODEL_ARCH.DOTS1 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.hparams["num_experts"] = self.hparams["n_routed_experts"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) - self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - if "shared_experts" in name: - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("PLMForCausalLM") -class PLMModel(TextModel): - model_arch = gguf.MODEL_ARCH.PLM - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) - self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - - def prepare_tensors(self): - super().prepare_tensors() - - -@ModelBase.register("T5WithLMHeadModel") -@ModelBase.register("T5ForConditionalGeneration") -@ModelBase.register("MT5ForConditionalGeneration") -@ModelBase.register("UMT5ForConditionalGeneration") -@ModelBase.register("UMT5Model") -class T5Model(TextModel): - model_arch = gguf.MODEL_ARCH.T5 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) - self.gguf_writer.add_block_count(self.block_count) - if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None: - self.gguf_writer.add_decoder_block_count(dec_n_layer) - self.gguf_writer.add_head_count(self.hparams["num_heads"]) - self.gguf_writer.add_key_length(self.hparams["d_kv"]) - self.gguf_writer.add_value_length(self.hparams["d_kv"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("T5EncoderModel") -class T5EncoderModel(TextModel): - model_arch = gguf.MODEL_ARCH.T5ENCODER - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.shared_token_embeddings_found = False - - def set_vocab(self): - # to avoid TypeError: Descriptors cannot be created directly - # exception when importing sentencepiece_model_pb2 - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" - from sentencepiece import SentencePieceProcessor - from sentencepiece import sentencepiece_model_pb2 as model - - tokenizer_path = self.dir_model / 'tokenizer.model' - - # many older models use spiece.model tokenizer model filename - if not tokenizer_path.is_file(): - tokenizer_path = self.dir_model / 'spiece.model' - - if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") - - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - - # some models like Pile-T5 family use BPE tokenizer instead of Unigram - if sentencepiece_model.trainer_spec.model_type == 2: # BPE - # assure the tokenizer model file name is correct - assert tokenizer_path.name == 'tokenizer.model' - return self._set_vocab_sentencepiece() - else: - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM - - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap - - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) - - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] - scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - for key in added_tokens_json: - token_id = added_tokens_json[key] - if token_id >= vocab_size: - logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') - continue - - tokens[token_id] = key.encode("utf-8") - scores[token_id] = -1000.0 - toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) - - self.gguf_writer.add_tokenizer_model("t5") - self.gguf_writer.add_tokenizer_pre("default") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) - if precompiled_charsmap: - self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: - logger.warning("Couldn't find context length in config.json, assuming default value of 512") - n_ctx = 512 - self.gguf_writer.add_context_length(n_ctx) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(self.hparams["num_heads"]) - self.gguf_writer.add_key_length(self.hparams["d_kv"]) - self.gguf_writer.add_value_length(self.hparams["d_kv"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. - if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: - if not self.shared_token_embeddings_found: - name = "shared.weight" - self.shared_token_embeddings_found = True - else: - logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Jais2ForCausalLM") -class Jais2Model(TextModel): - model_arch = gguf.MODEL_ARCH.JAIS2 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - head_dim = hparams.get("head_dim", hparams["hidden_size"] // hparams["num_attention_heads"]) - self.gguf_writer.add_rope_dimension_count(head_dim) - - -@ModelBase.register("JAISLMHeadModel") -class JaisModel(TextModel): - model_arch = gguf.MODEL_ARCH.JAIS - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # ALiBi position embedding - assert self.hparams["position_embedding_type"] == "alibi" - - # Embeddings scale - self.embeddings_scale = 1.0 - if 'mup_embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['mup_embeddings_scale'] - elif 'embeddings_scale' in self.hparams: - self.embeddings_scale = self.hparams['embeddings_scale'] - else: - assert False - - self.width_scale = 1.0 - if 'mup_output_alpha' in self.hparams: - assert 'mup_width_scale' in self.hparams - self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] - elif 'width_scale' in self.hparams: - self.width_scale = self.hparams['width_scale'] - else: - assert False - - self.max_alibi_bias = 8.0 - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # we don't need these - if name.endswith((".attn.bias")): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(("relative_pe.slopes")): - # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) - # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, - # but Jais's PyTorch model simply precalculates the slope values and places them - # in relative_pes.slopes - n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) - first_val = float(data_torch[0].item()) - self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) - - return - - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): - data_torch = data_torch.transpose(1, 0) - - new_name = self.map_tensor_name(name) - - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - yield from super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid) - elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid) - else: - yield from super().modify_tensors(data_torch, new_name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) - - -@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration") -class Glm4Model(TextModel): - model_arch = gguf.MODEL_ARCH.GLM4 - use_mrope = False - partial_rotary_factor = 0.5 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5) - if "mrope_section" in self.rope_parameters: - self.use_mrope = True - logger.info("Q/K weight will need to be permuted for M-RoPE") - - def set_vocab(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor)) - - @staticmethod - def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: - orig_shape = weights.shape - if len(orig_shape) == 1: - weights = weights.unsqueeze(1) # [out_dim, 1] - if len(weights.shape) != 2: - raise ValueError("Only 1D and 2D tensors are supported.") - n_effective_heads = weights.shape[0] // head_dim - if n_head_kv is not None and n_effective_heads != n_head: - if n_effective_heads != n_head_kv: - raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}") - rotary_dim = int(head_dim * partial_rotary_factor) - if rotary_dim % 2 != 0: - raise ValueError("rotary_dim must be even.") - reshaped = weights.reshape(n_effective_heads, head_dim, -1) - rot_part = reshaped[:, :rotary_dim, :] - non_rot_part = reshaped[:, rotary_dim:, :] - permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1) - combined = torch.cat((permuted_rot, non_rot_part), dim=1) - result = combined.reshape(weights.shape) - return result if len(orig_shape) != 1 else result.squeeze(1) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.use_mrope: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams["num_key_value_heads"] - n_embd = self.hparams["hidden_size"] - head_dim = self.hparams.get("head_dim", n_embd // n_head) - # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GlmOcrForConditionalGeneration") -class GlmOCRModel(Glm4Model): - model_arch = gguf.MODEL_ARCH.GLM4 - use_mrope = False - partial_rotary_factor = 0.5 - - # Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP prediction layer - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # GLM-OCR has num_hidden_layers + 1 actual layers (including NextN layer) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - # NextN/MTP prediction layers - if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) - - -@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") -class Glm4MoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.GLM4_MOE - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_vocab(self): - return self._set_vocab_glm() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (rope_dim := self.hparams.get("head_dim")) is None: - rope_dim = ( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - ) - self.gguf_writer.add_rope_dimension_count( - int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) - ) - - # MoE parameters - Use only routed expert count (shared experts handled separately) - if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None: - self.gguf_writer.add_expert_count(n_routed_experts) - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None: - self.gguf_writer.add_expert_shared_count(n_shared_experts) - if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None: - self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) - - # Expert gating function (sigmoid for GLM4_MOE) - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - - # Routed scaling factor - if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None: - self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) - - # Normalise topk probabilities - if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None: - self.gguf_writer.add_expert_weights_norm(norm_topk_prob) - - # NextN/MTP prediction layers - if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) - - _experts: list[dict[str, Tensor]] | None = None - - # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle main token embedding (but not layer-specific NextN embeddings) - if name == "model.embed_tokens.weight" and ".layers." not in name: - yield from super().modify_tensors(data_torch, "token_embd.weight", bid) - return - - # Handle routed experts - if name.find("mlp.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("Glm4MoeLiteForCausalLM") -class Glm4MoeLiteModel(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - - def set_vocab(self): - return self._set_vocab_glm() - - -@ModelBase.register("GlmMoeDsaForCausalLM") -class GlmMoeDsaModel(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.GLM_DSA - skip_mtp = False - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_vocab(self): - return self._set_vocab_glm() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - rope_dim = self.hparams["qk_rope_head_dim"] - partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0) - self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor)) - - # NextN/MTP prediction layers - if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) - - # DSA indexer parameters - self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"]) - self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"]) - self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"]) - - -@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(TextModel): - model_arch = gguf.MODEL_ARCH.CHATGLM - - def set_vocab_chatglm3(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytes] = [] - toktypes: list[int] = [] - scores: list[float] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab())) # ty: ignore[unresolved-attribute] - assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] - role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] - special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens - for token_id in range(vocab_size): - piece = tokenizer._convert_id_to_token(token_id) # ty: ignore[unresolved-attribute] - if token_id == 0: - piece = "<unk>" - elif token_id == 1: - piece = "<bos>" - elif token_id == 2: - piece = "<eos>" - - text = piece.encode("utf-8") # ty: ignore[unresolved-attribute] - score = 0.0 - # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), - # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() - if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute, invalid-argument-type] - score = tokenizer.tokenizer.sp_model.get_score(token_id) # ty: ignore[unresolved-attribute] - - if token_id >= tokenizer.tokenizer.sp_model.vocab_size(): # ty: ignore[unresolved-attribute] - if piece in special_tokens: - toktype = SentencePieceTokenTypes.CONTROL - elif len(piece) == 0: # ty: ignore[invalid-argument-type] - text = f"[PAD{token_id}]".encode("utf-8") - toktype = SentencePieceTokenTypes.UNUSED - else: - toktype = SentencePieceTokenTypes.USER_DEFINED - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - continue - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.tokenizer.sp_model.is_unknown(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.tokenizer.sp_model.is_control(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.tokenizer.sp_model.is_unused(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.tokenizer.sp_model.is_byte(token_id): # ty: ignore[unresolved-attribute] - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - self.gguf_writer.add_tokenizer_model("llama") - # glm3 needs prefix and suffix formatted as: - # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" - self.gguf_writer.add_tokenizer_pre("chatglm-spm") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - @staticmethod - def token_bytes_to_string(b): - from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode # ty: ignore[unresolved-import] - byte_encoder = bytes_to_unicode() - return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) - - @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] - return parts - - def set_vocab(self): - if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""): - self.set_vocab_chatglm3() - return - - dir_model = self.dir_model - hparams = self.hparams - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"]) - assert max(tokenizer.get_vocab().values()) < vocab_size # ty: ignore[unresolved-attribute] - - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - # only add special tokens when they were not already loaded from config.json - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # ty: ignore[unresolved-attribute] - # this one is usually not in config.json anyway - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - assert n_embed is not None - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - assert n_head is not None - n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head)) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed))) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5)) - self.gguf_writer.add_file_type(self.ftype) - if "attention_dim" in self.hparams: - rope_dim = self.hparams["attention_dim"] - else: - rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - self.gguf_writer.add_add_bos_token(False) - rope_freq = 10000 - if "rope_ratio" in self.hparams: - rope_freq = rope_freq * self.hparams["rope_ratio"] - self.gguf_writer.add_rope_freq_base(rope_freq) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".rotary_pos_emb.inv_freq"): - return None - - name = name.removeprefix("transformer.") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("NemotronForCausalLM") -class NemotronModel(TextModel): - model_arch = gguf.MODEL_ARCH.NEMOTRON - - def set_vocab(self): - self._set_vocab_sentencepiece() - self.gguf_writer.add_pad_token_id(0) - self.gguf_writer.add_unk_token_id(1) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"]) - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - - # * Partial RoPE - rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) - - # * RopeScaling for Nemotron - if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - else: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side - # model.layers.{l}.input_layernorm.weight - # model.layers.{l}.post_attention_layernorm.weight - # model.norm.weight - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("ExaoneForCausalLM") -class ExaoneModel(TextModel): - model_arch = gguf.MODEL_ARCH.EXAONE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - assert (hparams["activation_function"] == "silu") - - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) - rotary_factor = rotary_factor if rotary_factor is not None else 1.0 - self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = self.rope_parameters.get("rope_theta", 10000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 8.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - assert low_freq_wavelen != high_freq_wavelen - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@ModelBase.register("Exaone4ForCausalLM") -class Exaone4Model(TextModel): - model_arch = gguf.MODEL_ARCH.EXAONE4 - - def set_vocab(self): - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - - if hparams.get("sliding_window") is not None: - self.gguf_writer.add_sliding_window(hparams["sliding_window"]) - if "layer_types" in hparams: - self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]]) - elif "sliding_window_pattern" in hparams: - sliding_window_pattern = [] - if isinstance(hparams["sliding_window_pattern"], str): # e.g. LLLG - for i in range(hparams["num_hidden_layers"]): - sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L") - if isinstance(hparams["sliding_window_pattern"], int): # e.g. 4 - for i in range(hparams["num_hidden_layers"]): - sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0) - if len(sliding_window_pattern) == hparams["num_hidden_layers"]: - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): - if rope_params.get("rope_type", '').lower() == "llama3": - base = rope_params.get("rope_theta", 10_000.0) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - - factor = rope_params.get("factor", 16.0) - low_freq_factor = rope_params.get("low_freq_factor", 1.0) - high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) - - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor - - rope_factors = [] - for freq in freqs: - wavelen = 2 * math.pi / freq - if wavelen < high_freq_wavelen: - rope_factors.append(1) - elif wavelen > low_freq_wavelen: - rope_factors.append(factor) - else: - smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - - yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@ModelBase.register("ExaoneMoEForCausalLM") -class ExaoneMoEModel(Exaone4Model): - model_arch = gguf.MODEL_ARCH.EXAONE_MOE - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - moe_intermediate_size = self.hparams["moe_intermediate_size"] - num_shared_experts = self.hparams["num_shared_experts"] - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - self.gguf_writer.add_expert_shared_count(num_shared_experts) - self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) - n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0)) - self.gguf_writer.add_leading_dense_block_count(n_dense_layer) - self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0)) - - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("mtp."): - if name.find("layers.") != -1: - # `mtp.layers.0.[module_name]` format - name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}") - else: - # mtp fc/norm weights - remapper = { - "mtp.fc": "model.layers.{bid}.eh_proj", - "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm", - "mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm", - "mtp.norm": "model.layers.{bid}.shared_head.norm", - } - _n = Path(name) - new_name = remapper[_n.stem] + _n.suffix - - # set shared weights for all NextN/MTP layers - for bid in range(self.hparams['num_hidden_layers'], self.block_count): - yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid) - return - - if name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - yield from super().modify_tensors(data_torch, new_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration") -class GraniteModel(LlamaModel): - """Conversion for IBM's GraniteForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE - - def set_gguf_parameters(self): - """Granite uses standard llama parameters with the following differences: - - - No head_dim support - - New multiplier params: - - attention_scale - - embedding_scale - - residual_scale - - logits_scaling - """ - if head_dim := self.hparams.pop("head_dim", None): - logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) - super().set_gguf_parameters() - # NOTE: Convert _multiplier params to _scale params for naming - # consistency - if attention_scale := self.hparams.get("attention_multiplier"): - self.gguf_writer.add_attention_scale(attention_scale) - logger.info("gguf: (granite) attention_scale = %s", attention_scale) - if embedding_scale := self.hparams.get("embedding_multiplier"): - self.gguf_writer.add_embedding_scale(embedding_scale) - logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) - if residual_scale := self.hparams.get("residual_multiplier"): - self.gguf_writer.add_residual_scale(residual_scale) - logger.info("gguf: (granite) residual_scale = %s", residual_scale) - if logits_scale := self.hparams.get("logits_scaling"): - self.gguf_writer.add_logit_scale(logits_scale) - logger.info("gguf: (granite) logits_scale = %s", logits_scale) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if name.startswith("encoder."): - return None - return super().filter_tensors(item) - - -@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM") -class GraniteMoeModel(GraniteModel): - """Conversion for IBM's GraniteMoeForCausalLM""" - model_arch = gguf.MODEL_ARCH.GRANITE_MOE - - def set_gguf_parameters(self): - """GraniteMoeShared uses GraniteMoe parameters plus the following: - - shared_intermediate_size - """ - super().set_gguf_parameters() - if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"): - self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length) - logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - """In modeling_granitemoe, the JetMoe implementation of parallel experts - is used. This essentially merges w1 and w3 into a single tensor with 2x - the hidden size that is then split during forward. To keep compatibility - with existing mixtral support, we pull them apart here. - """ - - if name.endswith("block_sparse_moe.input_linear.weight"): - ffn_dim = self.hparams["intermediate_size"] - assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" - gate, up = data_torch.split(ffn_dim, dim=-2) - yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) - yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) - return - - has_experts = bool(self.hparams.get('num_local_experts')) - - if name.endswith("shared_mlp.input_linear.weight"): - ffn_dim = self.hparams["shared_intermediate_size"] - assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size" - gate, up = data_torch.split(ffn_dim, dim=-2) - if has_experts: - yield from ModelBase.modify_tensors(self, gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) - yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) - return - yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) - yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) - return - - if not has_experts and name.endswith("shared_mlp.output_linear.weight"): - yield from ModelBase.modify_tensors(self, data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM") -class GraniteHybridModel(Mamba2Model, GraniteMoeModel): - """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM - layers and optionally uses MoE w/ a shared expert""" - model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID - undo_permute = True - - def __init__(self, *args, **kwargs): - - # Hybrid mamba models use a prefix for the mamba-specific params. - # TODO: Extend this if the prefix(es) need to be configurable - self.hparam_prefixes = ["mamba"] - - super().__init__(*args, **kwargs) - - # Lists of which layers use ssm vs attention - self._attn_layers = self.get_attn_layers() - self._ssm_layers = [ - i for i in range(self.block_count) - if i not in self._attn_layers - ] - - # There are some models in this family that are non-hybrid, but keep the - # same parent class by setting all layers to "attention." If this is the - # case, the model architecture needs to be updated to a standard - # "granite" or "granitemoe" model - if not self._ssm_layers: - has_experts = self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True) - new_arch = ( - gguf.MODEL_ARCH.GRANITE_MOE - if has_experts else - gguf.MODEL_ARCH.GRANITE - ) - self.model_arch = new_arch - self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch] - self.gguf_writer.add_architecture() - - # n_group and d_inner are used during reshape_tensors for mamba2 - # NOTE: Explicitly include hparam prefix prefix for d_model to - # disambiguate with top-level head_dim - # NOTE 2: If needed for future models, this can be isolated in a method - # to separate the prefix setting and the keys used - self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"]) - self.n_group = self.find_hparam(["n_groups", "num_groups"]) - self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model - - def get_attn_layers(self): - # Explicit list of layer type names - if layer_types := self.hparams.get("layer_types"): - return [ - i for i, typ in enumerate(layer_types) - if typ == "attention" - ] - - # Layer types indicated by index or period - attn_layers = self.hparams.get("attn_layer_indices", []) - if not attn_layers: - attn_period = self.hparams.get("attn_layer_period") - assert attn_period, "Didn't find attn_layer_indices or attn_layer_period" - attn_offset = self.hparams.get("attn_layer_offset") - assert attn_offset is not None, "No attention layer offset set with attn_layer_period" - attn_layers = [ - i for i in range(self.block_count) - if i % attn_period == attn_offset - ] - return attn_layers - - def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: - prefixed = [] - for pfx in self.hparam_prefixes: - prefixed.extend( - "_".join([pfx, k]) - for k in keys - ) - keys = list(keys) + prefixed - return Mamba2Model.find_hparam(self, keys, *args, **kwargs) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if ( - name.endswith("block_sparse_moe.input_linear.weight") - or "shared_mlp" in name - ): - yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - return - - # Determine whether this is a mamba layer or an attention layer - if bid in self._ssm_layers: - yield from Mamba2Model.modify_tensors(self, data_torch, name, bid) - return - elif bid in self._attn_layers: - yield from GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - return - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - - def set_gguf_parameters(self): - """This method merges params from both parents and some that are - specific to this model. The result is some duplication of how the params - get set. The following warnings are expected during conversion: - - WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv' - WARNING:Duplicated key name 'granitehybrid.context_length' - """ - GraniteMoeModel.set_gguf_parameters(self) - - ## Mamba mixer params ## - self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) - self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"])) - self.gguf_writer.add_ssm_group_count(self.n_group) - self.gguf_writer.add_ssm_inner_size(self.d_inner) - # NOTE: The mamba_dt_rank is _not_ the right field for how this is used - # in llama.cpp - self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"])) - - ## Attention params ## - head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) - head_count_kv_vec = [ - head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count) - ] - if rope_dim := self.hparams.get("attn_rotary_emb"): - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_head_count_kv(head_count_kv_vec) - - ## If Bamba or non-hybrid, use rope, otherwise don't - use_rope = ( - "BambaForCausalLM" in self.hparams["architectures"] - or not self._ssm_layers - ) - self.gguf_writer.add_rope_scaling_finetuned(use_rope) - if not use_rope: - self.gguf_writer.add_context_length(2**20) - - ## Validation ## - d_head = self.find_hparam(["d_head"], optional=True) or 64 - assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" - assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" - - def set_vocab(self): - self.hparams["pad_vocab_size_multiple"] = 8 - Mamba2Model.set_vocab(self) - - -@ModelBase.register("NemotronHForCausalLM") -class NemotronHModel(GraniteHybridModel): - """Hybrid mamba2/attention model from NVIDIA""" - model_arch = gguf.MODEL_ARCH.NEMOTRON_H - is_moe: bool = False - - def __init__(self, *args, **kwargs): - # We have to determine the correct model architecture (MoE vs non-MoE) before - # calling the parent __init__. This is because the parent constructor - # uses self.model_arch to build the tensor name map, and all MoE-specific - # mappings would be missed if it were called with the default non-MoE arch. - hparams = ModelBase.load_hparams(args[0], self.is_mistral_format) - has_moe_params = ( - "num_experts_per_tok" in hparams - or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"]) - ) - if has_moe_params: - self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE - self.is_moe = True - - super().__init__(*args, **kwargs) - - # Save the top-level head_dim for later - self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim")) - assert self.head_dim is not None, "Could not find the attention head dim in config" - - # Don't use expand to calculate d_inner - self.d_inner = self.find_hparam(["num_heads"]) * self.d_model - - # Update the ssm / attn / mlp layers - # M: Mamba2, *: Attention, -: MLP - # MoE: - # M: Mamba2, *: Attention, E: Expert - pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") - if pattern is None: - self._ssm_layers = [] - self._mlp_layers = [] - elif isinstance(pattern, str): - self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"] - self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")] - else: - self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"] - self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"] - - def get_attn_layers(self): - pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type") - if pattern is None: - return [] - assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!" - if isinstance(pattern, str): - return [i for i, val in enumerate(pattern) if val == "*"] - - return [i for i, val in enumerate(pattern) if val == "attention"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - head_dim = self.head_dim - if head_dim is None: - raise ValueError("Could not find the attention head dim in config") - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - # Set feed_forward_length - # NOTE: This will trigger an override warning. This is preferable to - # duplicating all the parent logic - if not self.is_moe: - n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) - self.gguf_writer.add_feed_forward_length([ - n_ff if i in self._mlp_layers else 0 for i in range(self.block_count) - ]) - else: - moe_intermediate_size = self.hparams["moe_intermediate_size"] - self.gguf_writer.add_feed_forward_length([ - moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count) - ]) - self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"]) - self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) - self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) - self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) - self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_group_count(self.hparams["n_group"]) - - # number of experts used per token (top-k) - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - - if (latent_size := self.hparams.get("moe_latent_size")) is not None: - self.gguf_writer.add_moe_latent_size(latent_size) - - def set_vocab(self): - # The NemotronH config uses pattern characters (e.g. '-') that may not - # be supported by the installed transformers version. AutoTokenizer - # internally calls AutoConfig which triggers this parsing failure. - # Using trust_remote_code=True to load the model's own config class. - tokens: list[str] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - # Pad vocab size (from Mamba2Model/GraniteHybridModel) - self.hparams["pad_vocab_size_multiple"] = 8 # Setting this here since GraniteHybridModel.set_vocab() isn't being invoked now. - # From Mamba2Model.set_vocab(): - vocab_size = self.hparams["vocab_size"] - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) - # ref: https://stackoverflow.com/a/17511341/22827863 - vocab_size = -(vocab_size // -pad_vocab) * pad_vocab - self.hparams["vocab_size"] = vocab_size - - assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] - - tokpre = self.get_vocab_base_pre(tokenizer) - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] - added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] - - added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token: str = reverse_vocab[i] - if token in added_vocab: - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment] - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) - - # From TextModel.set_vocab_gpt2(): - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - # The tokenizer _does_ add a BOS token (via post_processor type - # TemplateProcessing) but does not set add_bos_token to true in the - # config, so we need to explicitly override it here. - if not self.is_moe: - self.gguf_writer.add_add_bos_token(True) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.is_moe and bid is not None: - # Skip Multi-Token Prediction (MTP) tensors. These are used for - # for speculative decoding but we don't include them in this model - # conversion. See https://github.com/ggml-org/llama.cpp/pull/18886 - if name.startswith("mtp."): - logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}") - return - - if name.endswith("mixer.gate.e_score_correction.bias"): - yield from ModelBase.modify_tensors(self, data_torch, name, bid) - return - - if name.endswith("mixer.dt_bias"): - new_name = name.replace("dt_bias", "dt.bias") - yield from ModelBase.modify_tensors(self, data_torch, new_name, bid) - return - - if name.endswith("mixer.conv1d.weight"): - squeezed_data = data_torch.squeeze() - yield from ModelBase.modify_tensors(self, squeezed_data, name, bid) - return - - if name.endswith("mixer.A_log"): - transformed_data = -torch.exp(data_torch) - reshaped_data = transformed_data.squeeze().reshape(-1, 1) - yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) - return - - if name.endswith("mixer.D"): - reshaped_data = data_torch.squeeze().reshape(-1, 1) - yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) - return - - if name.endswith("mixer.norm.weight"): - reshaped_data = data_torch.reshape(self.n_group, -1) - yield from ModelBase.modify_tensors(self, reshaped_data, name, bid) - return - - if name.find("mixer.experts") != -1: - n_experts = self.hparams["n_routed_experts"] - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 2: - # merge the experts into a single tensor - for w_name in ["down_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("LlamaBidirectionalModel") -class LlamaEmbedNemotronModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA_EMBED - - -@ModelBase.register("BailingMoeForCausalLM") -class BailingMoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.BAILINGMOE - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(rope_dim) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_weights_scale(1.0) - self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) - - _experts: list[dict[str, Tensor]] | None = None - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - n_embd = self.hparams["hidden_size"] - if (head_dim := self.hparams.get("head_dim")) is None: - head_dim = n_embd // n_head - - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - - if name.endswith("attention.dense.weight"): - yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid) - return - elif name.endswith("query_key_value.weight"): - q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) - - yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) - yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) - yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) - return - elif name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - yield from super().modify_tensors(data_torch, new_name, bid) - - return - - new_name = self.map_tensor_name(name) - - if new_name == output_name and self.hparams.get("norm_head"): - data_torch = data_torch.float() - data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 - - yield from super().modify_tensors(data_torch, new_name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("BailingMoeV2ForCausalLM") -class BailingMoeV2Model(TextModel): - model_arch = gguf.MODEL_ARCH.BAILINGMOE2 - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0): - self.block_count = self.hparams["num_hidden_layers"] + nextn_layers - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) - self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"])) - self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) - self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) - self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) - - if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None: - self.gguf_writer.add_nextn_predict_layers(nextn_layers) - - _experts: list[dict[str, Tensor]] | None = None - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".expert_bias"): - name = name.replace(".expert_bias", ".expert_bias.bias") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "mlp.experts" in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM") -class SarvamMoEModel(BailingMoeV2Model): - model_arch = gguf.MODEL_ARCH.BAILINGMOE2 - # Sarvam-MoE shares the BailingMoeV2 architecture; only differences: - # - full rotary (no partial_rotary_factor) - # - expert bias is zero-mean normalized at load time - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - if (rope_dim := hparams.get("head_dim")) is None: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim - self.gguf_writer.add_rope_dimension_count(rope_dim) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if name.endswith(".expert_bias"): - # Sarvam normalizes expert bias to zero mean - inner = gen - - def gen(): - t = inner() - return t - t.mean() - return super().filter_tensors((name, gen)) - - -@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") -class GroveMoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.GROVEMOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299 - self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128) - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298 - self.gguf_writer.add_experts_per_group(2) - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 - self.gguf_writer.add_expert_group_scale(0.05) - - _experts: list[dict[str, Tensor]] | None = None - _chunk_experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.endswith(".expert_bias"): - # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 - return - - # process the experts separately - if name.find("chunk_experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) // 2 # see add_experts_per_group - assert bid is not None - - if self._chunk_experts is None: - self._chunk_experts = [{} for _ in range(self.block_count)] - - self._chunk_experts[bid][name] = data_torch - - if len(self._chunk_experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight" - datas.append(self._chunk_experts[bid][ename]) - del self._chunk_experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - elif name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._chunk_experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - chunk_experts = [k for d in self._chunk_experts for k in d.keys()] - if len(chunk_experts) > 0: - raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}") - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("ChameleonForConditionalGeneration") -@ModelBase.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(TextModel): - model_arch = gguf.MODEL_ARCH.CHAMELEON - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) - - def set_vocab(self): - self._set_vocab_gpt2() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # ignore image tokenizer for now - # TODO: image support for Chameleon - if name.startswith("model.vqmodel"): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") - hidden_dim = self.hparams.get("hidden_size") - - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - if name.endswith(("q_norm.weight", "q_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim) - if name.endswith(("k_norm.weight", "k_norm.bias")): - data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) - - yield from super().modify_tensors(data_torch, name, bid) - - # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 - @staticmethod - def _reverse_hf_permute(data_torch, n_heads, hidden_dim): - head_dim = hidden_dim // n_heads - data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) - data_torch = data_torch.repeat_interleave(n_heads, 0) - return data_torch - - -@ModelBase.register("UltravoxModel") -class UltravoxModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLAMA # dummy - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument") - - -@ModelBase.register("GlmasrModel") -class GlmASRWhisperEncoderModel(MmprojModel): - has_vision_encoder = False - has_audio_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: - self.hparams["hidden_size"] = self.hparams["d_model"] - self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] - self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA) - self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) - self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"]) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F16 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith(("model.", "lm_head.")): - # skip language model tensors - return None - - if name.startswith("audio_encoder.whisper."): - name = name.replace("audio_encoder.whisper.","audio_tower.") - if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name: - name = name.replace("audio_encoder.", "audio_encoder.adapting.") - if name.startswith("audio_encoder.adapting."): - name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.") - if ".layer_norm." in name: - name = name.replace(".layer_norm.", ".ln_pre.") - if ".0." in name: - name = name.replace(".0.", ".linear_1.") - if ".2." in name: - name = name.replace(".2.", ".linear_2.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("audio_encoder.audio_bos_eos_token."): - yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid) - yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid) - return - - if name.startswith("audio_encoder.adapting."): - if ".proj." in name: - return - - if "conv1.bias" in name or "conv2.bias" in name: - # transpose conv1 and conv2 bias - data_torch = data_torch.unsqueeze(-1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen2AudioForConditionalGeneration") -class WhisperEncoderModel(MmprojModel): - has_vision_encoder = False # no vision encoder - has_audio_encoder = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: - self.hparams["hidden_size"] = self.hparams["d_model"] - self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] - self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) - self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F16 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # prevent clash naming with vision tensors - if name.startswith("multi_modal_projector"): - name = "audio." + name - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "conv1.bias" in name or "conv2.bias" in name: - # transpose conv1 and conv2 bias - data_torch = data_torch.unsqueeze(-1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("UltravoxModel") -class UltravoxWhisperEncoderModel(WhisperEncoderModel): - has_vision_encoder = False # no vision encoder - has_audio_encoder = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX) - self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) - - -@ModelBase.register("MERaLiON2ForConditionalGeneration") -class MERaLiONWhisperEncoderModel(WhisperEncoderModel): - has_vision_encoder = False - has_audio_encoder = True - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config.get("speech_config") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION) - self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("text_decoder."): - return None - - if name.startswith("speech_encoder."): - name = name.replace("speech_encoder.", "audio_tower.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - suffix = "." + name.rsplit(".", 1)[-1] - - if name.startswith("ln_speech."): - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch) - return - - if name.startswith("speech_audio_adapter."): - if ".mlp_adapter.0." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch) - elif ".gate_proj." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch) - elif ".pool_proj." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch) - elif ".out_proj." in name: - yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("VoxtralForConditionalGeneration") -class VoxtralWhisperEncoderModel(WhisperEncoderModel): - has_vision_encoder = False # no vision encoder - has_audio_encoder = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL) - self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size - - -@ModelBase.register("AudioFlamingo3ForConditionalGeneration") -class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel): - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".conv" in name and ".weight" in name: - # Was trained in BF16, being safe, avoiding quantizing to FP16 - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - -@ModelBase.register("FalconH1ForCausalLM") -class FalconH1Model(Mamba2Model): - model_arch = gguf.MODEL_ARCH.FALCON_H1 - - def __init__(self, *args, **kwargs): - # Set the hparam prefixes for Falcon Mamba2 - self.hparam_prefixes = ["mamba"] - - # Initialize the base Mamba2Model - super().__init__(*args, **kwargs) - - # Use Llama conversion for attention - self._transformer_model_class = LlamaModel - - # n_group and d_inner are used during reshape_tensors for mamba2 - self.n_group = self.find_hparam(["n_groups"]) - self.d_inner = self.find_hparam(["mamba_d_ssm"]) - self.d_head = self.find_hparam(["d_head"]) - - # Initialize any Falcon Mamba2 specific attributes - self.has_attention = True # Falcon Mamba2 has attention components - - # Load Falcon-H1 multipliers from hyperparameters - self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True) - self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True) - self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True) - self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True) - self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True) - self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True) - self.intermediate_size = self.find_hparam(["intermediate_size"]) - self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True) - - def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: - prefixed = [] - for pfx in self.hparam_prefixes: - prefixed.extend( - "_".join([pfx, k]) - for k in keys - ) - keys = list(keys) + prefixed - return super().find_hparam(keys, *args, **kwargs) - - def set_vocab(self): - self._set_vocab_gpt2() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - tensors = list(super().modify_tensors(data_torch, name, bid)) - tensor = tensors[0][1] - - if "down_proj" in name: - tensor = tensor * self.mlp_multipliers[1] - elif "gate_proj" in name: - tensor = tensor * self.mlp_multipliers[0] - elif "k_proj" in name: - tensor = tensor * self.key_multiplier * self.attention_in_multiplier - elif "q_proj" in name: - tensor = tensor * self.attention_in_multiplier - elif "v_proj" in name: - tensor = tensor * self.attention_in_multiplier - elif "o_proj" in name: - tensor = tensor * self.attention_out_multiplier - elif "out_proj" in name: - tensor = tensor * self.ssm_out_multiplier - elif "in_proj" in name: - tensor = tensor * self.ssm_in_multiplier - zxbcdt_multipliers = self.hparams["ssm_multipliers"] - intermediate_size = self.hparams["mamba_d_ssm"] - groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] - tensor[:intermediate_size, :] *= zxbcdt_multipliers[0] - tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1] - tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2] - tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3] - tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4] - elif "lm_head" in name: - tensor = tensor * self.hparams["lm_head_multiplier"] - elif "embed_tokens" in name: - tensor = tensor * self.hparams["embedding_multiplier"] - elif "mamba.norm" in name: - tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group) - - tensors = [(tensors[0][0], tensor)] - return tensors - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - ## General Params ## - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - # Override some Mamba2 defaults - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - - ## Attention params ## - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2 - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_key_length(self.hparams["head_dim"]) - self.gguf_writer.add_value_length(self.hparams["head_dim"]) - - ## Validation ## - assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" - assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}" - - # Add any other Falcon Mamba2 specific configuration - self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"]) - - -@ModelBase.register("HunYuanMoEV1ForCausalLM") -class HunYuanMoEModel(TextModel): - model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE - - def set_vocab(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - # 1. Get the pre-tokenizer identifier hash - tokpre = self.get_vocab_base_pre(tokenizer) - - # 2. Reverse-engineer the merges list from mergeable_ranks - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: # todo this is an assert in Qwen, why? - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # 3. Generate the tokens and toktypes lists - vocab_size = self.hparams["vocab_size"] - assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - # 4. Write all vocab-related fields to the GGUF writer - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - # 5. Add special tokens and chat templates - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - # FIX for BOS token: Overwrite incorrect id read from config.json - self.gguf_writer.add_bos_token_id(127959) # <|bos|> - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"]) - - moe_intermediate_size = hparams["moe_intermediate_size"] - assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size) - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0]) - - moe_topk = hparams["moe_topk"] - assert all(topk == moe_topk[0] for topk in moe_topk) - self.gguf_writer.add_expert_used_count(moe_topk[0]) - - moe_shared_expert = hparams["num_shared_expert"] - assert all(n == moe_shared_expert[0] for n in moe_shared_expert) - self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) - - # Rope - if self.rope_parameters.get("rope_type") == "dynamic": - # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ - # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = self.rope_parameters.get("alpha", 1000) - base = self.rope_parameters.get("rope_theta", 10000.0) - dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 - scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 - self.gguf_writer.add_rope_freq_base(scaled_base) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_rope_scaling_factor(1) - # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k - self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length - self.gguf_writer.add_context_length(256 * 1024) # 256k context length - - # if any of our assumptions about the values are wrong, something has changed and this may need to be updated - assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ - "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name == "lm_head.weight": - if self.hparams.get("tie_word_embeddings", False): - logger.info("Skipping tied output layer 'lm_head.weight'") - return - - if name.find("mlp.experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") -class LLaDAMoEModel(TextModel): - model_arch = gguf.MODEL_ARCH.LLADA_MOE - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) - - self.gguf_writer.add_mask_token_id(156895) - self.gguf_writer.add_causal_attention(False) - self.gguf_writer.add_diffusion_shift_logits(False) - - _experts: list[dict[str, Tensor]] | None = None - - # Copied from: Qwen2MoeModel - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - # Copied from: Qwen2MoeModel - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("HunYuanDenseV1ForCausalLM") -class HunYuanModel(TextModel): - model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE - - def _get_eod_token_id(self) -> int | None: - """Get the actual end-of-generation token from config (eod_token_id).""" - return self.hparams.get("eod_token_id") - - def _get_eot_token_id(self) -> int | None: - """Get the end-of-turn token from generation_config.json. - This is the first entry in eos_token_id when it's a list.""" - gen_cfg_path = self.dir_model / "generation_config.json" - if gen_cfg_path.is_file(): - with open(gen_cfg_path, encoding="utf-8") as f: - gen_cfg = json.load(f) - eos = gen_cfg.get("eos_token_id") - if isinstance(eos, list) and len(eos) >= 2: - return eos[0] - return None - - def _fix_special_tokens(self): - """Fix EOS/EOT tokens that are incorrect in upstream configs.""" - eod_id = self._get_eod_token_id() - if eod_id is not None: - self.gguf_writer.add_eos_token_id(eod_id) - eot_id = self._get_eot_token_id() - if eot_id is not None: - self.gguf_writer.add_eot_token_id(eot_id) - - def set_vocab(self): - if (self.dir_model / "tokenizer.json").is_file(): - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab - token_types = None - if (self.hparams.get("pad_token_id") or 0) < 0: - token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask') - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types) - special_vocab.add_to_gguf(self.gguf_writer) - self._fix_special_tokens() - else: - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - - # 1. Get the pre-tokenizer identifier hash - tokpre = self.get_vocab_base_pre(tokenizer) - - # 2. Reverse-engineer the merges list from mergeable_ranks - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks # ty: ignore[unresolved-attribute] - for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: - merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - - # 3. Generate the tokens and toktypes lists - vocab_size = self.hparams["vocab_size"] - assert tokenizer.vocab_size == vocab_size # ty: ignore[unresolved-attribute] - special_tokens = tokenizer.special_tokens # ty: ignore[unresolved-attribute] - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} - tokens: list[str] = [] - toktypes: list[int] = [] - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.UNUSED) - else: - token = reverse_vocab[i] - tokens.append(token) - if i in special_tokens.values(): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.NORMAL) - - # 4. Write all vocab-related fields to the GGUF writer - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_token_merges(merges) - - # 5. Add special tokens and chat templates - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.add_to_gguf(self.gguf_writer) - # FIX for BOS token: Overwrite incorrect id read from config.json - if self.hparams['hidden_size'] == 4096: - self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token - self._fix_special_tokens() - - def set_gguf_parameters(self): - # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it - saved_num_experts = self.hparams.pop("num_experts", None) - super().set_gguf_parameters() - if saved_num_experts is not None and saved_num_experts > 1: - self.hparams["num_experts"] = saved_num_experts - hparams = self.hparams - - # Rope - if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"): - # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ - # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = self.rope_parameters.get("alpha", 50) - base = self.rope_parameters.get("rope_theta", 10000.0) - dim = hparams["head_dim"] - scaled_base = base * (alpha ** (dim / (dim - 2))) - self.gguf_writer.add_rope_freq_base(scaled_base) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_rope_scaling_factor(1) - if self.rope_parameters.get("rope_type") == "dynamic": - # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k - self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length - self.gguf_writer.add_context_length(256 * 1024) # 256k context length - - # if any of our assumptions about the values are wrong, something has changed and this may need to be updated - assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ - "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name == "lm_head.weight": - if self.hparams.get("tie_word_embeddings", False): - logger.info("Skipping tied output layer 'lm_head.weight'") - return - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("HunYuanVLForConditionalGeneration") -class HunyuanVLVisionModel(MmprojModel): - # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name - # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout. - # Each variant maps to a different projector type in clip.cpp so image - # preprocessing follows the correct code path. - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size - if "image_size" not in self.hparams_vision: - self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) - - @staticmethod - def is_ocr_variant(hparams: dict) -> bool: - """Return True for HunyuanOCR, False for HunyuanVL. - - The projector's output dim must equal the text model's hidden_size by - construction (that's what "projector" means). HunyuanOCR pairs a 1B text - backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the - ViT -> LLM projection dim is a hard architectural signature, not a - magic number. - """ - vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) - return vision_out == 1024 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - vcfg = self.hparams_vision - - if self.is_ocr_variant(self.global_config): - # --- HunyuanOCR --- - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) - self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) - return - - # --- HunyuanVL --- - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) - self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu") - self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"])) - self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"])) - self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) - self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("vit."): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # strip CLS token (row 0) from position embeddings so resize_position_embeddings works - if "position_embedding" in name: - data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd] - yield from super().modify_tensors(data_torch, name, bid) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal - # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. - if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): - return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - -@ModelBase.register("HunYuanVLForConditionalGeneration") -class HunyuanVLTextModel(HunYuanModel): - # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR - # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE), - # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from - # the config and pick the matching GGUF architecture. - model_arch = gguf.MODEL_ARCH.HUNYUAN_VL - - @staticmethod - def _is_ocr_config(hparams: dict) -> bool: - # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that - # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with - # HunyuanVLVisionModel.is_ocr_variant. - return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024 - - def __init__(self, dir_model: Path, *args, **kwargs): - raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False) - if self._is_ocr_config(raw_hparams): - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE - else: - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL - super().__init__(dir_model, *args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses - # the HunYuan-Dense arch which already handles standard rope in super(). - if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL: - return - - if self.rope_parameters.get("rope_type") != "xdrope": - return - - # defaults for HunyuanVL. The C++ side later computes: - # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2)) - self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) - self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1))) - - ctx_len = int(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len) - self.gguf_writer.add_context_length(ctx_len) - - self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"])) - - -@ModelBase.register("SmolLM3ForCausalLM") -class SmolLM3Model(LlamaModel): - model_arch = gguf.MODEL_ARCH.SMOLLM3 - - -@ModelBase.register("GptOssForCausalLM") -class GptOssModel(TextModel): - model_arch = gguf.MODEL_ARCH.GPT_OSS - - # TODO: remove once MXFP4 is supported more generally - def dequant_model(self): - if self._is_mxfp4: - return - return super().dequant_model() - - def transform_nibble_layout(self, tensor): - assert tensor.dtype == torch.uint8 - assert tensor.shape[-1] == 16 - # swap nibbles - t_lo = tensor & 0x0F - t_hi = tensor & 0xF0 - t_swapped = (t_lo << 4) | (t_hi >> 4) - tensor = t_swapped - # transform aaaa...bbbb... to abababab... - blk_a, blk_b = tensor.chunk(2, dim=-1) - # get a_ - blk_a0 = (blk_a & 0xF0).view(-1, 1) - blk_a1 = (blk_a << 4).view(-1, 1) - blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape) - # get _b - blk_b0 = (blk_b >> 4).view(-1, 1) - blk_b1 = (blk_b & 0x0F).view(-1, 1) - blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape) - # swap once more - out = blk_a | blk_b - out_h = out & 0xF0 - out_l = out & 0x0F - out = (out_h >> 4) | (out_l << 4) - return out - - def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor): - assert blocks.dtype == torch.uint8 - assert scales.dtype == torch.uint8 - scales = scales.unsqueeze(-1) - assert len(blocks.shape) == 4 - assert len(scales.shape) == 4 - blocks = self.transform_nibble_layout(blocks) - new_data = torch.concat((scales, blocks), dim=-1) - new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32] - logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4") - # flatten last dim - new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3]) - new_data = new_data.numpy() - self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - blocks0: Tensor = torch.zeros(1) - blocks1: Tensor = torch.zeros(1) - # we assume that tensors are loaded in the correct order - for name, data_torch in self.get_tensors(): - if "mlp.experts.down_proj_blocks" in name: - blocks0 = data_torch - elif "mlp.experts.down_proj_scales" in name: - new_name = self.map_tensor_name(name.replace("_scales", ".weight")) - self.repack_mxfp4(new_name, blocks0, data_torch) - elif "mlp.experts.gate_up_proj_blocks" in name: - blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :] - elif "mlp.experts.gate_up_proj_scales" in name: - scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :] - new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight")) - new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight")) - self.repack_mxfp4(new_name_gate, blocks0, scales0) - self.repack_mxfp4(new_name_up, blocks1, scales1) - return [] - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if "sinks" in name: - name += ".weight" - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # correct naming for down_proj - if "down_proj" in name: - if name.endswith("_bias"): - name = name.replace("down_proj_bias", "down_proj.bias") - elif "_blocks" not in name and "_scales" not in name: - logger.warning(f"{name} is not in MXFP4, performance may be degraded") - name = name.replace("down_proj", "down_proj.weight") - data_torch = data_torch.transpose(-1, -2) - else: - # otherwise, it should already be repacked to ggml MXFP4 format - return - - # split the gate_up into gate and up - if "gate_up_proj" in name: - if name.endswith("_bias"): - name_up = name.replace("gate_up_proj_bias", "up_proj.bias") - name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") - gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2] - yield from super().modify_tensors(gate_proj_bias, name_gate, bid) - yield from super().modify_tensors(up_proj_bias, name_up, bid) - elif "_blocks" not in name and "_scales" not in name: - logger.warning(f"{name} is not in MXFP4, performance may be degraded") - name_up = name.replace("gate_up_proj", "up_proj.weight") - name_gate = name.replace("gate_up_proj", "gate_proj.weight") - data_torch = data_torch.transpose(-1, -2) - gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] - yield from super().modify_tensors(gate_proj_weight, name_gate, bid) - yield from super().modify_tensors(up_proj_weight, name_up, bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - def set_vocab(self): - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"]) - - -@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") -class LFM2Model(TextModel): - model_arch = gguf.MODEL_ARCH.LFM2 - - def _add_feed_forward_length(self): - ff_dim = self.find_hparam(["block_ff_dim", "intermediate_size"]) - auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"] - ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"] - multiple_of = self.hparams["block_multiple_of"] - - if auto_adjust_ff_dim: - ff_dim = int(2 * ff_dim / 3) - # custom dim factor multiplier - if ffn_dim_multiplier is not None: - ff_dim = int(ffn_dim_multiplier * ff_dim) - ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - - self.gguf_writer.add_feed_forward_length(ff_dim) - - def set_gguf_parameters(self): - # set num_key_value_heads only for attention layers - self.hparams["num_key_value_heads"] = [ - self.hparams["num_key_value_heads"] if layer_type != "conv" else 0 - for layer_type in self.hparams["layer_types"] - ] - - super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"]) - self._add_feed_forward_length() - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if ConformerAudioModel.is_audio_tensor(name): - # skip multimodal tensors - return None - - name = name.replace("lfm.", "model.") # audio - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # conv op requires 2d tensor - if 'conv.conv' in name: - data_torch = data_torch.squeeze(1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Lfm2Model") -class LFM2ColBertModel(LFM2Model): - model_arch = gguf.MODEL_ARCH.LFM2 - dense_tensor_name = "dense_2" - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if not name.startswith(self.dense_tensor_name): - name = "model." + name - - yield from super().modify_tensors(data_torch, name, bid) - - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # dense tensor is stored in a separate safetensors file - from safetensors.torch import load_file - tensors_file = self.dir_model / "1_Dense" / "model.safetensors" - assert tensors_file.is_file() - tensor = load_file(tensors_file)["linear.weight"] - self.gguf_writer.add_embedding_length_out(tensor.shape[0]) - yield f"{self.dense_tensor_name}.weight", tensor.clone() - - -@ModelBase.register("Lfm2MoeForCausalLM") -class LFM2MoeModel(TextModel): - model_arch = gguf.MODEL_ARCH.LFM2MOE - - def set_gguf_parameters(self): - # set num_key_value_heads only for attention layers - self.hparams["num_key_value_heads"] = [ - self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0 - for layer_type in self.hparams["layer_types"] - ] - - super().set_gguf_parameters() - - self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"]) - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) - - # cache for experts weights for merging - _experts_cache: dict[int, dict[str, Tensor]] = {} - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.endswith(".expert_bias"): - name = name.replace(".expert_bias", ".expert_bias.bias") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # conv op requires 2d tensor - if 'conv.conv' in name: - data_torch = data_torch.squeeze(1) - - # merge expert weights - if 'experts' in name: - n_experts = self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - expert_cache = self._experts_cache.setdefault(bid, {}) - expert_cache[name] = data_torch - expert_weights = ["w1", "w2", "w3"] - - # not enough expert weights to merge - if len(expert_cache) < n_experts * len(expert_weights): - return - - for w_name in expert_weights: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight" - datas.append(expert_cache[ename]) - del expert_cache[ename] - - data_torch = torch.stack(datas, dim=0) - merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - - del self._experts_cache[bid] - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - assert not self._experts_cache - - -@ModelBase.register("Lfm2VlForConditionalGeneration") -class LFM2VLModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - # TODO(tarek): for dynamic resolution image_size is not specified, setting here for compatibility - self.hparams_vision["image_size"] = 256 - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2) - self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"])) - self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2)) - self.gguf_writer.add_vision_use_gelu(True) - # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0 - vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1) - self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("model.vision_tower.", "vision_tower.") - name = name.replace("model.multi_modal_projector.", "multi_modal_projector.") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "patch_embedding.weight" in name: - data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Lfm2AudioForConditionalGeneration") -class LFM2AudioModel(ConformerAudioModel): - has_vision_encoder = False - has_audio_encoder = True - model_name = "Lfm2AudioEncoder" - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config.get("encoder") - - def set_gguf_parameters(self): - assert self.hparams_audio is not None - self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] - self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"] - self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"] - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A) - self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # skip language model tensors - if name.startswith("lfm."): - return None - - # for training only - if any(p in name for p in ["audio_loss_weight"]): - return None - - # for audio output - if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("GraniteSpeechForConditionalGeneration") -class GraniteSpeechMmprojModel(MmprojModel): - has_vision_encoder = False - has_audio_encoder = True - - _batch_norm_tensors: list[dict[str, Tensor]] | None = None - - def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config.get("encoder_config") - - def set_gguf_parameters(self): - assert self.hparams_audio is not None - a = self.hparams_audio - a["hidden_size"] = a["hidden_dim"] - a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"] - a["num_attention_heads"] = a["num_heads"] - a["num_hidden_layers"] = a["num_layers"] - - super().set_gguf_parameters() - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH) - self.gguf_writer.add_audio_num_mel_bins(a["input_dim"]) - self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) - self.gguf_writer.add_audio_chunk_size(a["context_size"]) - self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"]) - self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"]) - - p = self.global_config - self.gguf_writer.add_audio_projector_window_size(p["window_size"]) - self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"]) - self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"]) - - def tensor_force_quant(self, name, new_name, bid, n_dims): - if "encoder" in name or "projector" in name: - if ".conv" in name and ".weight" in name: - return gguf.GGMLQuantizationType.F32 - return super().tensor_force_quant(name, new_name, bid, n_dims) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if "attention_dists" in name or "num_batches_tracked" in name: - return None - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # fold running_mean, running_var and eps into weight and bias for batch_norm - if "batch_norm" in name and "encoder.layers." in name: - if self._batch_norm_tensors is None: - self._batch_norm_tensors = [{} for _ in range(self.block_count)] - assert bid is not None - self._batch_norm_tensors[bid][name] = data_torch - if len(self._batch_norm_tensors[bid]) < 4: - return - prefix = f"encoder.layers.{bid}.conv.batch_norm" - weight = self._batch_norm_tensors[bid][f"{prefix}.weight"] - bias = self._batch_norm_tensors[bid][f"{prefix}.bias"] - running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"] - running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"] - eps = 1e-5 - a = weight / torch.sqrt(running_var + eps) - b = bias - running_mean * a - yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid) - yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid) - return - - if ".attn.to_kv.weight" in name: - k_weight, v_weight = data_torch.chunk(2, dim=0) - yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid) - yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid) - return - - if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"): - if data_torch.ndim == 3 and data_torch.shape[2] == 1: - data_torch = data_torch.squeeze(2) - - if "depth_conv" in name and name.endswith(".weight"): - if data_torch.ndim == 3 and data_torch.shape[1] == 1: - data_torch = data_torch.squeeze(1) - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Lfm25AudioTokenizer") -class LFM25AudioTokenizer(LFM2Model): - model_arch = gguf.MODEL_ARCH.LFM2 - - def set_vocab(self): - self._set_vocab_none() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) - self.gguf_writer.add_embedding_length_out(self.hparams["output_size"]) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # skip language model tensors - if name == "istft.window" or name.startswith("emb.emb"): - return None - - if name.startswith("lin"): - name = name.replace("lin", "dense_2_out") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("SmallThinkerForCausalLM") -class SmallThinkerModel(TextModel): - model_arch = gguf.MODEL_ARCH.SMALLTHINKER - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (n_experts := self.hparams.get("moe_num_primary_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (n_experts_used := self.hparams.get("moe_num_active_primary_experts")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - self.gguf_writer.add_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if (self.hparams.get('moe_primary_router_apply_softmax')): - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - - sliding_window_layout = self.hparams.get("sliding_window_layout") - if sliding_window_layout: - for i in sliding_window_layout: - if i != 0: - sliding_window = self.hparams.get("sliding_window_size") - if sliding_window: - self.gguf_writer.add_sliding_window(sliding_window) - break - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams.get("moe_num_primary_experts") or self.find_hparam(["num_local_experts", "num_experts"]) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor - for w_name in ["down", "gate", "up"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - - yield from super().modify_tensors(data_torch, merged_name, bid) - return - else: - return - - yield from super().modify_tensors(data_torch, name, bid) - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - -@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification") -class ModernBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.MODERN_BERT - - def set_vocab(self): - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - self.gguf_writer.add_add_sep_token(True) - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_sliding_window(self.hparams["local_attention"]) - if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None: - self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if name.startswith("model."): - name = name[6:] - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if self.cls_out_labels: - # For BertForSequenceClassification (direct projection layer) - if name == "classifier.weight": - name = "classifier.out_proj.weight" - - if name == "classifier.bias": - name = "classifier.out_proj.bias" - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("ApertusForCausalLM") -class ApertusModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.APERTUS - undo_permute = False - - _alpha_n = {} - _alpha_p = {} - _beta = {} - _eps = {} - - def modify_tensors(self, data_torch, name, bid): - # Handle xIELU activation parameters - n_layers = self.hparams["num_hidden_layers"] - if name.endswith(".act_fn.alpha_n"): - self._alpha_n[bid] = data_torch.to("cpu").float().item() - if (len(self._alpha_n) == n_layers): - self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)]) - return - if name.endswith(".act_fn.alpha_p"): - self._alpha_p[bid] = data_torch.to("cpu").float().item() - if (len(self._alpha_p) == n_layers): - self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)]) - return - if name.endswith(".act_fn.beta"): - self._beta[bid] = data_torch.to("cpu").float().item() - if (len(self._beta) == n_layers): - self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)]) - return - if name.endswith(".act_fn.eps"): - self._eps[bid] = data_torch.to("cpu").float().item() - if (len(self._eps) == n_layers): - self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)]) - return - - yield from super().modify_tensors(data_torch, name, bid) - - -class MistralModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.MISTRAL3 - model_name = "Mistral" - hf_arch = "" - is_mistral_format = True - undo_permute = False - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # for compatibility, we use LLAMA arch for older models - # TODO: remove this once everyone migrates to newer version of llama.cpp - if "llama_4_scaling" not in self.hparams: - self.model_arch = gguf.MODEL_ARCH.LLAMA - self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] - self.gguf_writer.add_architecture() - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def dequant_model(self): - # transform quantization config into HF format - quant_config = self.hparams.get("quantization") - if quant_config is not None: - assert quant_config["qformat_weight"] == "fp8_e4m3" - self.hparams["quantization_config"] = { - "activation_scheme": "static", - "quant_method": "fp8", - "weight_block_size": None, - } - return super().dequant_model() - - @staticmethod - def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool): - assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg - assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), ( - f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" - ) - - if vocab.tokenizer.version == TokenizerVersion.v1: - return "mistral-v1" - elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm: - return "mistral-v3" - elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken: - return "mistral-v3-tekken" - elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm: - return "mistral-v7" - elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken: - return "mistral-v7-tekken" - elif vocab.tokenizer.version == TokenizerVersion.v11: - template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja" - elif vocab.tokenizer.version == TokenizerVersion.v13: - template_file = "unsloth-mistral-Devstral-Small-2507.jinja" - else: - err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}" - if is_mistral_format: - err_message += ( - " . Please pass --disable-mistral-community-chat-template argument to the CLI " - "if you want to skip this error and use the Mistral official `mistral-common` pre-processing library." - ) - raise ValueError(err_message) - - template_path = templates_dir / template_file - if not template_path.exists(): - raise FileNotFoundError(f"Template file not found: {template_path}") - - with open(template_path, "r", encoding="utf-8") as f: - template = f.read() - - return template - - def set_gguf_parameters(self): - super().set_gguf_parameters() - MistralModel.set_mistral_config(self.gguf_writer, self.hparams) - - @staticmethod - def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict): - if "yarn" in hparams: - yarn_params = hparams["yarn"] - mscale_all_dim = 1.0 if not yarn_params["apply_scale"] else 0.0 - gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - gguf_writer.add_rope_scaling_factor(yarn_params["factor"]) - gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"]) - gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"]) - gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim) - gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"]) - - if "llama_4_scaling" in hparams: - gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"]) - - -class MistralMoeModel(DeepseekV2Model): - model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - model_name = "Mistral" - hf_arch = "" - is_mistral_format = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - logger.info("Using MistralMoeModel") - # remap hparams from Mistral MoE format to DeepseekV2 format - # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic - # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py - config = self.hparams - # Mistral key -> HF key - config_mapping = { - "dim": "hidden_size", - "norm_eps": "rms_norm_eps", - "n_kv_heads": "num_key_value_heads", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "hidden_dim": "intermediate_size", - } - # HF key -> (Mistral key, default value) - top_level_mapping_with_default = { - "model_type": ("model_type", "transformer"), - "hidden_act": ("activation", "silu"), - "tie_word_embeddings": ("tied_embeddings", False), - "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)), - "max_position_embeddings": ("max_position_embeddings", 128_000), - } - # mapping top-level keys - for key, new_key in config_mapping.items(): - if key in config: - config[new_key] = config[key] - for new_key, (key, default_value) in top_level_mapping_with_default.items(): - config[new_key] = config.get(key, default_value) - # mapping MoE-specific keys - moe_config_map = { - "route_every_n": "moe_layer_freq", - "first_k_dense_replace": "first_k_dense_replace", - "num_experts_per_tok": "num_experts_per_tok", - "num_experts": "n_routed_experts", - "expert_hidden_dim": "moe_intermediate_size", - "routed_scale": "routed_scaling_factor", - "num_shared_experts": "n_shared_experts", - "num_expert_groups": "n_group", - "num_expert_groups_per_tok": "topk_group", - } - moe = config["moe"] - for key, new_key in moe_config_map.items(): - if key in moe: - config[new_key] = moe[key] - # provide missing values - config["topk_method"] = None - config["norm_topk_prob"] = True - config["scoring_func"] = "softmax" - - def set_vocab(self): - self._set_vocab_mistral() - - def set_gguf_parameters(self): - super().set_gguf_parameters() - MistralModel.set_mistral_config(self.gguf_writer, self.hparams) - yarn_params = self.hparams["yarn"] - self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"]) - - # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] - # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul - # ref https://github.com/ggml-org/llama.cpp/pull/17945 - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1 - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic - if name.endswith(".qscale_act"): - name = name.replace(".qscale_act", ".input_scale") - if name.endswith(".qscale_weight"): - name = name.replace(".qscale_weight", ".weight_scale") - if ".wkv_b." in name: - name = name.replace(".wkv_b.", ".kv_b_proj.") - if ".experts." in name: - name = name.replace(".experts.", ".mlp.experts.") - name = name.replace(".w1.", ".gate_proj.") - name = name.replace(".w2.", ".down_proj.") - name = name.replace(".w3.", ".up_proj.") - name = "model." + name - - return super().filter_tensors((name, gen)) - - -class PixtralModel(LlavaVisionModel): - model_name = "Pixtral" - hf_arch = "" - is_mistral_format = True - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) - - self.gguf_writer.add_vision_attention_layernorm_eps( - self.find_hparam(["norm_eps"]) - ) - self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) - - self.gguf_writer.add_vision_use_silu(True) - - # spatial_merge_size - if self.find_vparam(["mm_projector_id"], optional=True) == "patch_merge": - self.gguf_writer.add_vision_spatial_merge_size( - self.find_vparam(["spatial_merge_size"]) - ) - - def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: - if name == "vision_language_adapter.w_in.weight": - return "mm.1.weight" - elif name == "vision_language_adapter.w_in.bias": - return "mm.1.bias" - elif name == "vision_language_adapter.w_out.weight": - return "mm.2.weight" - elif name == "vision_language_adapter.w_out.bias": - return "mm.2.bias" - return super().map_tensor_name(name, try_suffixes) - - -@ModelBase.register("LightOnOCRForConditionalGeneration") -class LightOnOCRVisionModel(LlavaVisionModel): - is_mistral_format = False - use_break_tok = False - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - name = name.replace("model.vision_encoder.", "vision_tower.") - name = name.replace("model.vision_projection.", "multi_modal_projector.") - - return super().filter_tensors((name, gen)) - - -@ModelBase.register("KimiVLForConditionalGeneration") -class KimiVLModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = 64 * 14 # for compatibility - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_projector_scale_factor(2) - # eps is the same as pytorch's default value - assert self.hparams_vision is not None - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name - - if not is_vision_tensor: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if "pos_emb.weight" in name: - data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) - - if "wqkv" in name: - split_dim = 0 if "weight" in name else -1 - wq, wk, wv = data_torch.chunk(3, dim=split_dim) - yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid) - yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid) - yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid) - else: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("KimiK25ForConditionalGeneration") -class KimiK25Model(MmprojModel): - """Kimi-K2.5 with MoonViT3d vision encoder""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - assert self.hparams_vision is not None, "Kimi-K2.5 requires vision_config in model config" - - self.merge_kernel_size = tuple(self.hparams_vision.get("merge_kernel_size", [2, 2])) - self.patch_size = self.hparams_vision.get("patch_size", 14) - - # Set image_size for compatibility with base class - # Use position embedding dimensions as image_size reference - pos_emb_h = self.hparams_vision.get("init_pos_emb_height", 64) - self.hparams_vision["image_size"] = pos_emb_h * self.patch_size - - def set_gguf_parameters(self): - # Base class MmprojModel.set_gguf_parameters() already writes: - # - vision_block_count, vision_head_count, vision_embedding_length - # - vision_feed_forward_length, vision_patch_size, image_mean, image_std - # via find_vparam() which handles the vt_* prefixed keys in Kimi-K2.5's config - super().set_gguf_parameters() - assert self.hparams_vision is not None - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25) - - # Position embedding parameters (for interpolation) - self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64)) - self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64)) - self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4)) - - # Projector parameters - self.gguf_writer.add_vision_use_gelu(self.hparams_vision.get("projector_hidden_act", "gelu") == "gelu") - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5)) - self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0]) - - # Image size limits - # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet) - in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384) - min_patches = 8 # reasonable minimum - pixels_per_patch = self.patch_size ** 2 - self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch) - self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch) - - @staticmethod - def permute(weights: Tensor, n_head: int) -> Tensor: - out_dim, in_dim = weights.shape - head_dim = out_dim // n_head - w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim) - w = w.permute(0, 2, 1, 3, 4) - return w.reshape(out_dim, in_dim) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Only process vision and projector tensors - is_vision = any(x in name for x in ["vision_tower", "mm_projector"]) - - if not is_vision: - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - assert self.hparams_vision is not None - n_head = self.hparams_vision.get("num_attention_heads", 16) - - # Permute Q/K weights/biases from interleaved to split RoPE format - # This allows using build_rope_2d at runtime without post-permutation. - if "wqkv" in name: - out_dim = data_torch.shape[0] - qkv_dim = out_dim // 3 - head_dim = qkv_dim // n_head - - if "weight" in name: - wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2 * qkv_dim, :], data_torch[2 * qkv_dim:, :] - wq = self.permute(wq, n_head) - wk = self.permute(wk, n_head) - data_torch = torch.cat([wq, wk, wv], dim=0) - elif "bias" in name: - bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2 * qkv_dim], data_torch[2 * qkv_dim:] - bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) - bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) - data_torch = torch.cat([bq, bk, bv], dim=0) - - # Temporal embeddings: (T, 1, C) โ†’ (T, C) - if "pos_emb.time_weight" in name: - T, _, C = data_torch.shape - data_torch = data_torch.reshape(T, C) - - # PatchMergerMLP tensor name mapping - # proj.0.weight โ†’ proj.linear_1.weight - # proj.2.weight โ†’ proj.linear_2.weight - if "mm_projector.proj.0." in name: - name = name.replace(".proj.0.", ".proj.linear_1.") - elif "mm_projector.proj.2." in name: - name = name.replace(".proj.2.", ".proj.linear_2.") - - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("CogVLMForCausalLM") -class CogVLMVisionModel(MmprojModel): - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("model.vision."): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("CogVLMForCausalLM") -class CogVLMModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.COGVLM - - -@ModelBase.register("JanusForConditionalGeneration") -class JanusProModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip vision, aligner, and generation tensors - skip_prefixes = ( - 'model.vision_model.', - 'model.aligner.', - 'model.vqmodel.', - 'model.generation_embeddings.', - 'model.generation_aligner.', - 'model.generation_head.', - ) - if name.startswith(skip_prefixes): - return None - - return super().filter_tensors(item) - - -@ModelBase.register("JanusForConditionalGeneration") -class JanusProVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - if "intermediate_size" not in self.hparams_vision: - mlp_ratio = self.hparams_vision.get("mlp_ratio") - hidden_size = self.hparams_vision.get("hidden_size") - if mlp_ratio is not None and hidden_size is not None: - self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - assert self.hparams_vision is not None - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO) - - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6)) - - hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() - if hidden_act == "gelu": - self.gguf_writer.add_vision_use_gelu(True) - elif hidden_act == "silu": - self.gguf_writer.add_vision_use_silu(True) - - def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]: - """Map aligner tensors to projector format""" - suffix = ".bias" if name.endswith(".bias") else ".weight" - - if name.startswith("model.aligner."): - local_name = name[len("model.aligner."):] - elif name.startswith("aligner."): - local_name = name[len("aligner."):] - else: - raise ValueError(f"Unsupported Janus aligner prefix: {name}") - - if local_name.startswith("fc1."): - mm_index = 0 - elif local_name.startswith("hidden_layers."): - parts = local_name.split(".", 2) - if len(parts) < 3: - raise ValueError(f"Unexpected Janus aligner tensor name: {name}") - mm_index = int(parts[1]) + 1 - else: - raise ValueError(f"Unsupported Janus aligner tensor: {name}") - - tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix) - return [(tensor_name, data_torch)] - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip generation-related components - skip_generation_prefixes = ( - 'model.vqmodel.', - 'vqmodel.', - 'model.generation_embeddings.', - 'generation_embeddings.', - 'model.generation_aligner.', - 'generation_aligner.', - 'model.generation_head.', - 'generation_head.', - ) - if name.startswith(skip_generation_prefixes): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle aligner tensors - if name.startswith(('model.aligner.', 'aligner.')): - yield from self._map_aligner_tensor(data_torch, name) - return - - # Handle vision tensors - if name.startswith(('model.vision_model.', 'vision_model.')): - yield from super().modify_tensors(data_torch, name, bid) - return - - return - - -@ModelBase.register("YoutuVLForConditionalGeneration") -class YoutuVLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL) - self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) - - # Handle activation function - hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower() - if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"): - self.gguf_writer.add_vision_use_gelu(True) - elif hidden_act == "silu": - self.gguf_writer.add_vision_use_silu(True) - else: - raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}") - - self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2)) - - window_size = self.hparams.get("window_size") - if window_size is not None: - self.gguf_writer.add_vision_window_size(window_size) - # fullatt_block_indexes contains explicit layer indices that use full attention - # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention - # All other layers use window attention - fullatt_block_indexes = self.hparams.get("fullatt_block_indexes") - assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl" - # Store the explicit layer indices for YoutuVL (irregular pattern approach) - self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - # Skip language model tensors - skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.') - if name.startswith(skip_prefixes): - return None - - return super().filter_tensors(item) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Try to map the tensor using TensorNameMap (handles vision encoder and projector) - try: - yield from super().modify_tensors(data_torch, name, bid) - except ValueError: - # If mapping fails, log warning and skip - logger.warning(f"Cannot map tensor: {name}") - return - - -@ModelBase.register("SolarOpenForCausalLM") -class SolarOpenModel(Glm4MoeModel): - model_arch = gguf.MODEL_ARCH.GLM4_MOE - - def set_vocab(self): - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"]) # ty: ignore[unresolved-attribute] - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"]) # ty: ignore[unresolved-attribute] - special_vocab.add_to_gguf(self.gguf_writer) - - -@ModelBase.register("DotsOCRForCausalLM") -class DotsOCRVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert self.hparams_vision is not None - self.hparams_vision["image_size"] = 0 # dynamic resolution - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) - self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"])) - self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"])) - self.gguf_writer.add_vision_use_silu(True) - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - - if not name.startswith("vision_tower."): - return None - - if "vision_tower.blocks." in name and ".mlp." in name: - # note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here - # x = F.silu(self.fc1(x)) * self.fc3(x) - # x = self.fc2(x) - # fc1 -> gate, fc2 -> down, fc3 -> up - # mapping original names to Qwen2.5 naming scheme - name = name.replace("vision_tower.blocks.", "visual.blocks.") - name = name.replace(".fc1", ".gate_proj") - name = name.replace(".fc2", ".down_proj") - name = name.replace(".fc3", ".up_proj") - - return super().filter_tensors((name, gen)) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Sarashina2VisionForCausalLM") -class Sarashina2VLTextModel(LlamaModel): - model_arch = gguf.MODEL_ARCH.LLAMA - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, gen = item - if name.startswith("llm."): - name = name.replace("llm.", "", 1) - elif name.startswith("norm."): - return None - return super().filter_tensors((name, gen)) - - -@ModelBase.register("Sarashina2VisionForCausalLM") -class Sarashina2VLVisionModel(Qwen2VLVisionModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.global_config['model_type'] = "qwen2_vl" - - -###### CONVERSION LOGIC ###### - - -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.uint8: np.uint8, - } - - # only used when byteswapping data. Only correct size is needed - # TODO: uncomment uint64, uint32, and uint16, ref: https://github.com/pytorch/pytorch/issues/58734 - _dtype_byteswap_map: dict[torch.dtype, type] = { - torch.float64: np.float64, - torch.float32: np.float32, - torch.bfloat16: np.float16, - torch.float16: np.float16, - torch.int64: np.int64, - # torch.uint64: np.uint64, - torch.int32: np.int32, - # torch.uint32: np.uint32, - torch.int16: np.int16, - # torch.uint16: np.uint16, - torch.int8: np.int8, - torch.uint8: np.uint8, - torch.bool: np.uint8, - torch.float8_e4m3fn: np.uint8, - torch.float8_e5m2: np.uint8, - } - - # used for safetensors slices - # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 - _dtype_str_map: dict[str, torch.dtype] = { - "F64": torch.float64, - "F32": torch.float32, - "BF16": torch.bfloat16, - "F16": torch.float16, - # "U64": torch.uint64, - "I64": torch.int64, - # "U32": torch.uint32, - "I32": torch.int32, - # "U16": torch.uint16, - "I16": torch.int16, - "U8": torch.uint8, - "I8": torch.int8, - "BOOL": torch.bool, - "F8_E4M3": torch.float8_e4m3fn, - "F8_E5M2": torch.float8_e5m2, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - args=(self,), - func=(lambda s: s.numpy()) - ) - - @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") +import argparse +import logging +import os +import sys +from pathlib import Path - @classmethod - def from_safetensors_slice(cls, st_slice: Any) -> Tensor: - dtype = cls._dtype_str_map[st_slice.get_dtype()] - shape: tuple[int, ...] = tuple(st_slice.get_shape()) - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:]) - return cast(torch.Tensor, lazy) +import torch - @classmethod - def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor: - def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor: - def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: - if sys.byteorder == 'big': - # switch data back to big endian - tensor = tensor.view(dtype).byteswap(inplace=False) - return tensor - dtype = cls._dtype_str_map[tensor.dtype] - numpy_dtype = cls._dtype_byteswap_map[dtype] - return torch.from_numpy(byteswap_tensor(tensor.mmap_bytes(), numpy_dtype)).view(dtype).reshape(tensor.shape) - dtype = cls._dtype_str_map[t.dtype] - shape = t.shape - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r)) - return cast(torch.Tensor, lazy) +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf - @classmethod - def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): - def byteswap_tensor(tensor: np.ndarray, dtype: type) -> np.ndarray: - if sys.byteorder == 'big': - # switch data back to big endian - tensor = tensor.view(dtype).byteswap(inplace=False) - return tensor - dtype = cls._dtype_str_map[remote_tensor.dtype] - numpy_dtype = cls._dtype_byteswap_map[dtype] - shape = remote_tensor.shape - meta = cls.meta_with_dtype_and_shape(dtype, shape) - lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.from_numpy(byteswap_tensor(np.frombuffer(r.data(), dtype=numpy_dtype), numpy_dtype)).view(dtype).reshape(shape)) - return cast(torch.Tensor, lazy) +from conversion import ( + ModelBase, + ModelType, + get_model_architecture, + get_model_class, + logger, + print_registered_models, + _mistral_common_installed, + _mistral_import_error_msg, +) - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - if kwargs is None: - kwargs = {} +def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1000 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1000 * 1000 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1000 * 1000 * 1000 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - if func is torch.Tensor.numpy: - assert len(args) - return args[0].numpy() + if n < 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") - return cls._wrap_fn(func)(*args, **kwargs) + return n def parse_args() -> argparse.Namespace: @@ -14133,7 +115,15 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--mmproj", action="store_true", - help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + help="Export multimodal projector (mmproj) for vision models. This will only work on some vision models. An 'mmproj-' prefix will be added to the output file name.", + ) + parser.add_argument( + "--mtp", action="store_true", + help="Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. An 'mtp-' prefix will be added to the output file name.", + ) + parser.add_argument( + "--no-mtp", action="store_true", + help="Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, but even though the bundled default is more space-efficient overall, this allows differing quantization which may be more performant.", ) parser.add_argument( "--mistral-format", action="store_true", @@ -14158,6 +148,10 @@ def parse_args() -> argparse.Namespace: "--fuse-gate-up-exps", action="store_true", help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.", ) + parser.add_argument( + "--fp8-as-q8", action="store_true", + help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.", + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -14165,62 +159,12 @@ def parse_args() -> argparse.Namespace: return args -def split_str_to_n_bytes(split_str: str) -> int: - if split_str.endswith("K"): - n = int(split_str[:-1]) * 1000 - elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1000 * 1000 - elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1000 * 1000 * 1000 - elif split_str.isnumeric(): - n = int(split_str) - else: - raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G") - - if n < 0: - raise ValueError(f"Invalid split size: {split_str}, must be positive") - - return n - - -def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str: - # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders - # maybe we should fallback to text model's arch in that case, since not many models have both - text_config = hparams.get("text_config", {}) - if model_type == ModelType.TEXT and hparams.get("thinker_config", {}).get("text_config") is not None: - text_config = hparams["thinker_config"]["text_config"] - vision_config = hparams.get("vision_config", {}) - arch = None - if (arches := hparams.get("architectures")) is not None and len(arches) > 0: - arch = arches[0] - elif "ssm_cfg" in hparams: - # For non-hf Mamba and Mamba2 models - arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM" - - # Step3-VL keeps text config under text_config but uses a custom top-level architecture. - # For text conversion we route to a dedicated text-only class. - # TODO: refactor this later to avoid adding exception here - if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"): - return arch - - # if "architectures" is found in the sub-config, use that instead - if model_type == ModelType.TEXT and text_config.get("architectures") is not None: - arch = text_config["architectures"][0] - elif model_type == ModelType.TEXT and arch == "Qwen3ASRForConditionalGeneration" and text_config.get("model_type") == "qwen3": - arch = "Qwen3ASRForConditionalGeneration" - elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None: - arch = vision_config["architectures"][0] - if arch is None: - raise ValueError("Failed to detect model architecture") - return arch - - def main() -> None: args = parse_args() if args.print_supported_models: logger.error("Supported models:") - ModelBase.print_registered_models() + print_registered_models() sys.exit(0) if args.verbose: @@ -14286,18 +230,35 @@ def main() -> None: model_architecture = get_model_architecture(hparams, model_type) logger.info(f"Model architecture: {model_architecture}") try: - model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) + model_class = get_model_class(model_architecture, mmproj=(model_type == ModelType.MMPROJ)) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) elif args.mmproj: assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" + from conversion.pixtral import PixtralModel model_class = PixtralModel elif "moe" in hparams: + from conversion.mistral import MistralMoeModel model_class = MistralMoeModel else: + from conversion.mistral import MistralModel model_class = MistralModel + if args.mtp and args.no_mtp: + logger.error("--mtp and --no-mtp are mutually exclusive") + sys.exit(1) + + if args.mtp or args.no_mtp: + from conversion.qwen import _Qwen35MtpMixin + if not issubclass(model_class, _Qwen35MtpMixin): + logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today") + sys.exit(1) + if args.no_mtp: + model_class.no_mtp = True + if args.mtp: + model_class.mtp_only = True + model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, eager=args.no_lazy, @@ -14307,7 +268,8 @@ def main() -> None: small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, - fuse_gate_up_exps=args.fuse_gate_up_exps + fuse_gate_up_exps=args.fuse_gate_up_exps, + fp8_as_q8=args.fp8_as_q8, ) if args.vocab_only: diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 8d73b1f5546..827af277b92 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -19,7 +19,7 @@ logger = logging.getLogger("convert_hf_to_gguf_update") sess = requests.Session() -convert_py_pth = pathlib.Path("convert_hf_to_gguf.py") +convert_py_pth = pathlib.Path("conversion/base.py") convert_py = convert_py_pth.read_text(encoding="utf-8") hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token" hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None @@ -139,7 +139,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", }, {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", }, - {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"}, + {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2.5-350M", }, {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", }, {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", }, {"name": "modern-bert", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", }, @@ -156,6 +156,8 @@ class TOKENIZER_TYPE(IntEnum): {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", }, {"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", }, {"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", }, + {"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", }, + {"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"}, ] # some models are known to be broken upstream, so we will skip them as exceptions @@ -181,6 +183,8 @@ class TOKENIZER_TYPE(IntEnum): # jina-v2-de variants {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"}, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"}, + # lfm2 variants + {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2.5-8B-A1B", "chkhsh": "9e454714343b69b99b71795c1d27a68c2a1d15dab111f4d353109f966af29da7"}, ] @@ -374,7 +378,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: convert_py_pth.write_text(convert_py, encoding="utf-8") -logger.info("+++ convert_hf_to_gguf.py was updated") +logger.info(f"+++ {convert_py_pth} was updated") # generate tests for each tokenizer model diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ad4751bb96e..9a6437beab1 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -22,12 +22,11 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf - -# reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import LazyTorchTensor, ModelBase - from gguf.constants import GGUFValueType +# reuse model definitions from the conversion/ package +from conversion import LazyTorchTensor, ModelBase, get_model_class + logger = logging.getLogger("lora-to-gguf") @@ -209,6 +208,16 @@ def split(self, split_size: int | Sequence[int], dim: int = 0) -> tuple[LoraTorc def to(self, *args, **kwargs): return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs)) + def __mul__(self, other) -> LoraTorchTensor: + # Only output-side multiplication for now + # W = B @ A, so M_out * W == (M_out * B) @ A + if not isinstance(other, (int, float)) and other.shape and other.shape[-1] != 1: + raise NotImplementedError + return LoraTorchTensor(self._lora_A, self._lora_B * other) + + def __rmul__(self, other) -> LoraTorchTensor: + return self * other + @classmethod def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): del types # unused @@ -384,7 +393,7 @@ def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None] with torch.inference_mode(): try: - model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) + model_class = get_model_class(hparams["architectures"][0]) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) @@ -446,6 +455,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) + # filter base name, ignore tensor transformations for now + data_gen = lambda g=tensor: g # noqa: E731 + if (titem := self.filter_tensors((base_name, data_gen))) is None: + continue + base_name, _ = titem # note: mergekit-extract-lora also adds token embeddings to the adapter is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name diff --git a/docs/autoparser.md b/docs/autoparser.md index adc4d43ed67..33ede1a2282 100644 --- a/docs/autoparser.md +++ b/docs/autoparser.md @@ -459,7 +459,7 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend - Usage: `./bin/llama-template-analysis path/to/template.jinja` -**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2` +**Debug Logging**: Enable with `LLAMA_ARG_LOG_VERBOSITY=2` - Shows detailed analysis steps, pattern extraction results, and generated parser structure @@ -489,6 +489,7 @@ The following templates have active tests in `tests/test-chat.cpp`: | Qwen-QwQ-32B | Reasoning | Forced-open thinking | | NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper | | IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` | +| IBM Granite 4.0 | JSON_NATIVE | `<tool_call>` wrapper (same template used by 4.1) | | ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags | | Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format | | DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode | diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 155f933b805..e26cef2c37d 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -5,9 +5,10 @@ - [News](#news) - [OS](#os) - [Hardware](#hardware) +- [Performance Reference](#performance-reference) - [Docker](#docker) - [Linux](#linux) -- [Windows](#windows) +- [Windows](#windows-1) - [Environment Variable](#environment-variable) - [Design Rule](#design-rule) - [Known Issue](#known-issues) @@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on ## News -- 2026.04 - - - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0. +- 2026.04-05 + - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q6_K, Q8_0. - Fused MoE. - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package. @@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the NA +## Performance Reference + + +To get the supported LLMs, GPUs, and performance reference, please check [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313). + +You could update your test result in it directly. + ## Docker The docker build option is currently limited to *Intel GPU* targets. @@ -736,6 +743,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. | | GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. | | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. | +| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer | | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. | @@ -746,6 +754,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo | Name | Function | |-----------------|----------------------------------------------------------------------------------| | DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. | +| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. | ## Design Rule diff --git a/docs/backend/ZenDNN.md b/docs/backend/ZenDNN.md index add5805331c..b2f970d8c43 100644 --- a/docs/backend/ZenDNN.md +++ b/docs/backend/ZenDNN.md @@ -72,10 +72,13 @@ The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert- |:----------------------:|:-------:|:---------------------------------------------:| | FP32 | Support | Full precision floating point | | BF16 | Support | BFloat16 (best performance on Zen 4/Zen 5) | +| Q8_0 | Support | 8-bit quantized weights via [dynamic quantization](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md) | *Notes:* - **BF16** provides best performance on Zen 4 and Zen 5 EPYCโ„ข processors (Genoa, Turin). +- **Q8_0** is available for quantized model weights since ZenDNN supports dynamic quantization [LowOHA MatMul operator](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md). +- Other quantization formats fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend. ## Linux @@ -140,6 +143,15 @@ Download LLaMA 3.1 8B Instruct BF16 model: huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF --local-dir models/ ``` +You can also use a Q8_0 GGUF model: + +```sh +# Download a Q8_0 GGUF model from Hugging Face +huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF \ + Llama-3.1-8B-Instruct-Q8_0.gguf \ + --local-dir models/ +``` + #### 2. Start Server Run llama.cpp server with ZenDNN acceleration: @@ -176,6 +188,10 @@ export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended) For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details). +### Q8_0 Performance Notes + +Q8_0 support is mainly beneficial for prompt processing / prefill workloads where large matrix multiplications dominate execution. Token generation performance may remain close to the standard CPU backend depending on the model, batch size, number of threads, and CPU topology. + ### Profiling and Debugging For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md). @@ -184,6 +200,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen - **Limited operation support**: Currently matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations. - **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32. +- **Q8_0 support scope**: Q8_0 acceleration is available for supported matrix multiplication paths. Other quantization formats still fall back to the standard CPU backend. - **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance. ## Q&A @@ -202,7 +219,7 @@ A: ZenDNN is optimized specifically for AMD processors. While it may work on oth **Q: Does ZenDNN support quantized models?** -A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized model support is not available at this time. +A: Yes. The ZenDNN backend supports Q8_0 quantized models for supported matrix multiplication operations. FP32 and BF16 are also supported. Other quantization formats may fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend. **Q: Why is my inference not faster with ZenDNN?** diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json index c07bf5ca0c6..d37100764f1 100644 --- a/docs/backend/snapdragon/CMakeUserPresets.json +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -10,8 +10,8 @@ "ANDROID_ABI": "arm64-v8a", "ANDROID_PLATFORM": "android-31", "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", - "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", @@ -33,8 +33,8 @@ "name": "arm64-windows-snapdragon", "inherits": [ "base", "arm64-windows-llvm" ], "cacheVariables": { - "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", @@ -59,8 +59,8 @@ "toolset": { "value": "host=x86_64", "strategy": "external" }, "cacheVariables": { "CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake", - "CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md index 2414eeaf6a4..e9f0e215858 100644 --- a/docs/backend/snapdragon/README.md +++ b/docs/backend/snapdragon/README.md @@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. ``` -~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3 +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.7 [d]/> cd /workspace ``` @@ -24,7 +24,7 @@ Native Windows 11 arm64 builds has the following tools dependencies: - UCRT and Driver Kit - LLVM core libraries and Clang compiler (winget) - CMake, Git, Python (winget) -- Hexagon SDK Community Edition 6.4 or later (see windows.md) +- Hexagon SDK Community Edition 6.6 or later (see windows.md) - OpenCL SDK 2.3 or later (see windows.md) Note: The rest of the **Windows** build process assumes that you're running natively in Powershell. @@ -45,7 +45,7 @@ Preset CMake variables: GGML_HEXAGON="ON" GGML_OPENCL="ON" GGML_OPENMP="OFF" - HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2" + HEXAGON_SDK_ROOT="/opt/hexagon/6.6.0.0" ... -- Including OpenCL backend -- Including Hexagon backend diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md index 6307e1b69f1..aa731413c90 100644 --- a/docs/backend/snapdragon/windows.md +++ b/docs/backend/snapdragon/windows.md @@ -28,15 +28,15 @@ c:\Qualcomm\OpenCL_SDK\2.3.2 Either use the trimmed down version (optimized for CI) from - https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz + https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz Or download the complete official version from - https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2 + https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.6.0.0 Unzip/untar the archive into ``` -c:\Qualcomm\Hexagon_SDK\6.4.0.2 +c:\Qualcomm\Hexagon_SDK\6.6.0.0 ``` ## Install the latest Adreno GPU driver @@ -123,10 +123,10 @@ The overall Hexagon backend build procedure for Windows on Snapdragon is the sam However, additional settings are required for generating and signing HTP Ops libraries. ``` > $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2" -> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2" -> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04" +> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0" +> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0\tools\HEXAGON_Tools\19.0.07" > $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx" -> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64" +> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0" > cmake --preset arm64-windows-snapdragon-release -B build-wos ... diff --git a/docs/build-riscv64-spacemit.md b/docs/build-riscv64-spacemit.md index 66ea99a05e1..7a9a1f3ad4e 100644 --- a/docs/build-riscv64-spacemit.md +++ b/docs/build-riscv64-spacemit.md @@ -5,7 +5,7 @@ 1. Prepare Toolchain For RISCV ~~~ -wget https://github.com/spacemit-com/toolchain/releases/download/v1.1.2/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz +wget https://github.com/spacemit-com/toolchain/releases/download/v1.2.4/spacemit-toolchain-linux-glibc-x86_64-v1.2.4.tar.xz ~~~ 2. Build diff --git a/docs/build.md b/docs/build.md index a18479b3346..007b757d47d 100644 --- a/docs/build.md +++ b/docs/build.md @@ -22,6 +22,7 @@ The following sections describe how to build with different backends and options * [HIP](#hip) * [Vulkan](#vulkan) * [CANN](#cann) +* [ZenDNN](#zendnn) * [Armยฎ KleidiAIโ„ข](#arm-kleidiai) * [OpenCL](#opencl) * [Android](#android-1) @@ -735,7 +736,7 @@ ninja To read documentation for how to build on Android, [click here](./android.md) -## WebGPU [In Progress] +## WebGPU The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`. diff --git a/docs/function-calling.md b/docs/function-calling.md index 9ede914c04d..850b59ce7aa 100644 --- a/docs/function-calling.md +++ b/docs/function-calling.md @@ -291,6 +291,7 @@ Here are some models known to work (w/ chat template override when needed): llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M +llama-server --jinja -fa -hf ibm-granite/granite-4.1-3b-GGUF:Q4_K_M # Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it) diff --git a/docs/multimodal/granitevision.md b/docs/multimodal/granitevision.md index 3118fe0cdc1..7a63484ffe1 100644 --- a/docs/multimodal/granitevision.md +++ b/docs/multimodal/granitevision.md @@ -176,7 +176,7 @@ Note that currently you cannot quantize the visual encoder because granite visio ### 5. Running the Model in Llama cpp -Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner. +Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner. ```bash $ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \ diff --git a/docs/ops.md b/docs/ops.md index dace2d3d537..9ef5478de70 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -55,7 +55,7 @@ Legend: | GELU | โŒ | โœ… | โœ… | ๐ŸŸก | โœ… | ๐ŸŸก | โœ… | ๐ŸŸก | โœ… | โŒ | โŒ | | GELU_ERF | โŒ | โœ… | โœ… | ๐ŸŸก | โœ… | ๐ŸŸก | โœ… | ๐ŸŸก | โœ… | โŒ | โŒ | | GELU_QUICK | โŒ | โœ… | โœ… | ๐ŸŸก | โœ… | ๐ŸŸก | โœ… | ๐ŸŸก | โœ… | โŒ | โŒ | -| GET_ROWS | โŒ | ๐ŸŸก | โœ… | ๐ŸŸก | ๐ŸŸก | ๐ŸŸก | ๐ŸŸก | โœ… | ๐ŸŸก | โŒ | โŒ | +| GET_ROWS | โŒ | ๐ŸŸก | โœ… | ๐ŸŸก | ๐ŸŸก | ๐ŸŸก | โœ… | โœ… | ๐ŸŸก | โŒ | โŒ | | GET_ROWS_BACK | โŒ | โŒ | ๐ŸŸก | ๐ŸŸก | โŒ | โŒ | โŒ | โŒ | โŒ | โŒ | โŒ | | GROUP_NORM | โŒ | โœ… | โœ… | โœ… | โœ… | โœ… | โœ… | โœ… | โŒ | โŒ | โŒ | | HARDSIGMOID | โŒ | โœ… | โœ… | ๐ŸŸก | โœ… | โŒ | โœ… | ๐ŸŸก | โœ… | โŒ | โŒ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index 345129b5a6c..afb4fa78740 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -334,78 +334,78 @@ "SYCL0","GET_ROWS","type=q8_0,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" "SYCL0","GET_ROWS","type=q8_0,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" "SYCL0","GET_ROWS","type=q8_0,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" -"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=1,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=1,be2=1,v=1","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=7,be2=1,v=0","support","0","no","SYCL" -"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=7,be2=1,v=1","support","0","no","SYCL" +"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q3_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q4_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q5_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=q6_K,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xxs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_xs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq2_s,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_xxs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_s,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq1_m,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_nl,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq3_s,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=iq4_xs,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","SYCL" +"SYCL0","GET_ROWS","type=i32,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","SYCL" "SYCL0","GET_ROWS_BACK","type=f32,n=1,m=8,r=2,b=1,v=0","support","0","no","SYCL" "SYCL0","GET_ROWS_BACK","type=f32,n=256,m=5,r=4,b=1,v=0","support","0","no","SYCL" "SYCL0","GET_ROWS_BACK","type=f32,n=256,m=5,r=4,b=1,v=1","support","0","no","SYCL" @@ -5108,449 +5108,547 @@ "SYCL0","SET","type_src=i32,type_dst=i32,ne=[6,5,4,3],dim=2,inplace=1","support","0","no","SYCL" "SYCL0","SET","type_src=i32,type_dst=i32,ne=[6,5,4,3],dim=3,inplace=0","support","0","no","SYCL" "SYCL0","SET","type_src=i32,type_dst=i32,ne=[6,5,4,3],dim=3,inplace=1","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[128,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[128,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[128,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[384,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[384,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne=[384,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[128,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[128,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[128,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[192,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[192,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne=[192,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q4_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q5_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q5_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q5_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q5_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q8_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q8_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q1_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q1_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=mxfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=mxfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=nvfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=nvfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q2_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q3_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q4_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q5_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q6_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq2_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq2_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq2_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq3_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq1_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq1_m,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq4_nl,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq4_nl,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq3_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq4_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q5_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q5_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q5_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q5_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q8_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q8_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q1_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q1_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=mxfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=mxfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=nvfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=nvfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q2_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q3_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q4_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q5_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q6_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq2_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq2_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq2_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq3_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq1_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq1_m,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq4_nl,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq4_nl,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq3_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq4_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q4_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q5_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q5_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q5_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q5_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q8_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q8_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q1_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q1_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=mxfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=mxfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=nvfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=nvfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q2_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q3_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q4_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q5_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q6_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq2_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq2_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq2_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq3_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq1_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq1_m,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq4_nl,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq4_nl,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq3_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq4_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_0,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q4_1,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_0,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q5_1,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q8_0,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q1_0,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=mxfp4,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=nvfp4,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q2_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q3_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q4_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q5_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=q6_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xxs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_xs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq2_s,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_xxs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_s,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq1_m,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_nl,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq3_s,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=iq4_xs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f16,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=i32,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=i32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=i32,type_dst=f32,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=i32,type_dst=f32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","SYCL" -"SYCL0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" -"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","SYCL" -"SYCL0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","SYCL" -"SYCL0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" -"SYCL0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[128,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[128,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[128,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[384,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[384,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[384,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[128,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[128,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[128,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[192,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[192,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[192,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=bf16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=bf16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q4_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q4_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q4_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q4_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q5_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q5_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q5_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q5_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q8_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q8_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q1_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=mxfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=mxfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=nvfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=nvfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q2_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q3_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q4_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q5_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q6_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq2_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq2_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq2_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq3_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq1_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq1_m,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq4_nl,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq4_nl,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq3_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq4_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=f16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=f16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q4_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q4_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q4_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q4_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q5_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q5_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q5_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q5_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q8_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q8_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q1_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=mxfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=mxfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=nvfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=nvfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q2_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q3_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q4_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q5_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q6_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq2_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq2_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq2_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq3_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq1_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq1_m,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq4_nl,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq4_nl,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq3_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq4_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=bf16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=bf16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q4_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q4_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q4_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q4_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q5_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q5_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q5_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q5_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q8_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q8_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q1_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=mxfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=mxfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=nvfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=nvfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q2_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q3_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q4_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q5_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q6_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq2_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq2_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq2_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq3_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq1_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq1_m,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq4_nl,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq4_nl,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq3_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq4_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_1,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q5_1,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q8_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q1_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=mxfp4,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=nvfp4,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q2_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q3_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q4_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q5_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=q6_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xxs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_xs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq2_s,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_xxs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_s,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq1_m,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_nl,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq3_s,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=iq4_xs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f16,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=i32,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=i32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=i32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=i32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","SYCL" +"SYCL0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","SYCL" +"SYCL0","CPY","type_src=i32,type_dst=i32,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","SYCL" +"SYCL0","CPY","type_src=i32,type_dst=i32,ne_src=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,5,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,5,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,5,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,7,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,5,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,3,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,3,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,5,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,5,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,7,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,7,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,7,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f32,type_dst=f32,ne_src=[32,7,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,7,5,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,5,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,5,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,5,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,7,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,5,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,7,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,7,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" +"SYCL0","CPY","type_src=f16,type_dst=f16,ne_src=[32,7,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","SYCL" "SYCL0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","SYCL" "SYCL0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","SYCL" "SYCL0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","SYCL" @@ -6088,6 +6186,7 @@ "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=64,n=1,k=64,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=256,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=32,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=4,k=128,bs=[2,3],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" @@ -7041,6 +7140,15 @@ "SYCL0","MUL_MAT","type_a=iq2_xxs,type_b=f16,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=iq2_xxs,type_b=f16,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=iq2_xxs,type_b=f16,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=q4_1,type_b=f32,m=16,n=1,k=32,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=q4_1,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=q5_0,type_b=f32,m=16,n=1,k=32,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" @@ -9954,6 +10062,11 @@ "SYCL0","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" @@ -10008,126 +10121,281 @@ "SYCL0","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" @@ -10182,6 +10450,11 @@ "SYCL0","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" @@ -10236,126 +10509,281 @@ "SYCL0","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" +"SYCL0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=1","support","1","yes","SYCL" "SYCL0","ROPE","type=f32,ne_a=[128,32,2,3],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=1","support","1","yes","SYCL" @@ -10909,6 +11337,19 @@ "SYCL0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","SYCL" "SYCL0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","SYCL" "SYCL0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,tfrm=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[1024,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[1024,2,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[1024,16,1,1],pad_0=0,pad_1=1,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[1023,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[1023,8,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[1025,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[1025,8,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[2048,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[2048,4,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[2049,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[100,1,1,1],pad_0=100,pad_1=0,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[100,1,1,1],pad_0=0,pad_1=100,circular=0","support","1","yes","SYCL" +"SYCL0","PAD","type=f32,ne_a=[100,100,1,1],pad_0=50,pad_1=50,circular=0","support","1","yes","SYCL" "SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","SYCL" "SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","SYCL" "SYCL0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","SYCL" @@ -11210,36 +11651,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -11250,12 +11691,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11268,12 +11709,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11286,12 +11727,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11304,12 +11745,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11318,36 +11759,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -11358,12 +11799,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11376,12 +11817,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11394,12 +11835,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11412,12 +11853,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11426,36 +11867,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -11466,12 +11907,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11484,12 +11925,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11502,12 +11943,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11520,12 +11961,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11534,36 +11975,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -11574,12 +12015,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11592,12 +12033,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11610,12 +12051,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11628,12 +12069,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11642,324 +12083,324 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -11970,12 +12411,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -11988,12 +12429,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12006,12 +12447,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12024,12 +12465,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12038,36 +12479,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -12078,12 +12519,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12096,12 +12537,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12114,12 +12555,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12132,12 +12573,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12146,36 +12587,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -12186,12 +12627,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12204,12 +12645,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12222,12 +12663,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12240,12 +12681,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12254,36 +12695,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -12294,12 +12735,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12312,12 +12753,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12330,12 +12771,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12348,12 +12789,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -12362,900 +12803,900 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -13266,12 +13707,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13284,12 +13725,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13302,12 +13743,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13320,12 +13761,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13334,36 +13775,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -13374,12 +13815,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13392,12 +13833,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13410,12 +13851,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13428,12 +13869,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13442,180 +13883,180 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -13626,12 +14067,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13644,12 +14085,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13662,12 +14103,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13680,12 +14121,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13694,36 +14135,36 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -13734,12 +14175,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13752,12 +14193,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13770,12 +14211,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13788,12 +14229,12 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" @@ -13802,432 +14243,432 @@ "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" @@ -16070,10 +16511,10 @@ "SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q4_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=f16,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q8_0,permute=[0,1,2,3]","support","0","no","SYCL" -"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f32,permute=[0,1,2,3]","support","0","no","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q4_0,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=128,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL" @@ -16084,31 +16525,39 @@ "SYCL0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","SYCL" "SYCL0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","0","no","SYCL" "SYCL0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","0","no","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=1,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=64,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=127,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=256,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=65,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=100,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=200,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=127,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=64,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=33,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" -"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=100,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=1,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=64,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=127,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=256,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=65,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=100,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=200,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=127,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=64,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=33,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=100,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=2,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=2","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=128,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=4","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1,K=4","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=8,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=3","support","1","yes","SYCL" +"SYCL0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=16,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","SYCL" diff --git a/docs/speculative.md b/docs/speculative.md index fb6ef03067d..43d18185891 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -108,11 +108,12 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] - type of speculative decoding to use when no draft model is provided +--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] + comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) ---spec-default use default speculative decoding +--spec-default use default speculative decoding config + (enables ngram-mod) ``` ### Draft Model Parameters @@ -123,8 +124,9 @@ If a draft model is combined with a draftless decoding the draftless decoding ha (env: LLAMA_ARG_SPEC_DRAFT_MODEL) --spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant] HuggingFace repository for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) --spec-draft-n-max N - number of tokens to draft for speculative decoding (default: 16) + number of tokens to draft for speculative decoding (default: 3) (env: LLAMA_ARG_SPEC_DRAFT_N_MAX) --spec-draft-n-min N minimum number of draft tokens to use for speculative decoding (default: 0) @@ -133,18 +135,64 @@ If a draft model is combined with a draftless decoding the draftless decoding ha speculative decoding split probability (default: 0.10) (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) --spec-draft-p-min, --draft-p-min P - minimum speculative decoding probability (greedy) (default: 0.75) + minimum speculative decoding probability (greedy) (default: 0.00) (env: LLAMA_ARG_SPEC_DRAFT_P_MIN) ---spec-draft-ctx-size, -cd, --ctx-size-draft N - size of the prompt context for the draft model (default: 0, 0 = loaded from model) - (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE) --spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto) (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) --spec-draft-device, -devd, --device-draft <dev1,dev2,..> comma-separated list of devices to use for offloading the draft model ---spec-draft-replace, --spec-replace TARGET DRAFT - translate the string in TARGET into DRAFT if the draft model and main model are not compatible + (use --list-devices to see available devices) +``` + +### Draft Model CPU Scheduling Parameters + +``` +--spec-draft-threads, -td, --threads-draft N + number of CPU threads to use during generation +--spec-draft-threads-batch, -tbd, --threads-batch-draft N + number of threads to use during batch and prompt processing (default: same as --threads-draft) +--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M + Draft model CPU affinity mask. Complements cpu-range-draft +--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi + Ranges of CPUs for affinity. Complements --cpu-mask-draft +--spec-draft-cpu-strict, --cpu-strict-draft <0|1> + Use strict CPU placement for draft model (default: same as --cpu-strict) +--spec-draft-prio, --prio-draft N + set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll, --poll-draft <0|1> + Use polling to wait for draft model work (default: same as --poll) +--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M + Draft model CPU affinity mask for batch. Complements cpu-range-batch-draft +--spec-draft-cpu-range-batch, -Crbd, --cpu-range-batch-draft lo-hi + Ranges of CPUs for affinity for batch. Complements --cpu-mask-batch-draft +--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0|1> + Use strict CPU placement for draft model batch (default: --cpu-strict-draft) +--spec-draft-prio-batch, --prio-batch-draft N + set draft process/thread priority for batch : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll-batch, --poll-batch-draft <0|1> + Use polling to wait for draft model work for batch (default: --poll-draft) +``` + +### Draft Model KV Cache and Tensor Override Parameters + +``` +--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE + KV cache data type for K for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) +--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE + KV cache data type for V for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) +--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,... + override tensor buffer type for draft model +--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft + keep all Mixture of Experts (MoE) weights in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) +--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N + keep the MoE weights of the first N layers in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) ``` ### n-gram Mod Parameters @@ -193,11 +241,13 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### `--spec-type TYPE` -Specifies a type of speculative decoding without draft model. +Specifies a comma-separated list of speculative decoding types to use. | Type | Description | |------|-------------| | `none` | No speculative decoding (default) | +| `draft-simple` | Use a simple draft model for speculation | +| `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys | @@ -209,6 +259,11 @@ Specifies a type of speculative decoding without draft model. ./llama-server [...] --spec-type ngram-simple ``` +**Example:** Multiple speculative implementations. +```bash +./llama-server [...] --spec-type ngram-mod,ngram-map-k4v +``` + ### `--spec-ngram-*-size-n N` Sets the size N of the lookup n-gram for n-gram map based speculative decoding. @@ -268,3 +323,8 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens) - `#acc tokens`: number of tokens accepted by the main model - `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance). + +## Benchmarking + +To measure the end-to-end effect of speculative decoding (throughput, latency, and draft acceptance) across diverse prompts, see the SPEED-Bench client in [tools/server/bench/speed-bench](../tools/server/bench/speed-bench/README.md). +It runs against a running `llama-server` and can compare a baseline run against a speculative-decoding run. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a29dc707c3d..39f802d250e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -27,7 +27,6 @@ else() add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(retrieval) - add_subdirectory(save-load-state) add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c4ec5c524e9..5c9305b1237 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None: def main(args_in: list[str] | None = None) -> None: output_choices = ["f32", "f16"] - if np.uint32(1) == np.uint32(1).newbyteorder("<"): + dummy_val = np.uint32(1) + if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")): # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file") diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index a2948273512..4bdd239c007 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -44,6 +44,7 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f GRADER_PATTERNS = { "aime": r'\boxed{(\d+)}|\b(\d+)\b', "aime2025": r'\boxed{(\d+)}|\b(\d+)\b', + "aime2026": r'\boxed{(\d+)}|\b(\d+)\b', "gsm8k": r'\b(\d+)\b', } @@ -58,6 +59,11 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f "-123", "999" ], + "aime2026": [ + "42", + "-123", + "999" + ], "gsm8k": [ "42", "-123", @@ -81,6 +87,12 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f {question} +Remember to put your answer inside \\boxed{{}}. +""", + "aime2026": """Solve the following math problem step by step. Put your answer inside \\boxed{{}}. + +{question} + Remember to put your answer inside \\boxed{{}}. """, "gsm8k": """{question} @@ -137,6 +149,8 @@ class TaskState: t_gen_ms: Optional[float] = None reasoning_content: Optional[str] = None server_name: Optional[str] = None + chunk_idx: int = 0 + problem_idx: int = 0 class EvalState: @@ -166,6 +180,8 @@ def load_dataset(self, seed: int = 1234): self.dataset = AimeDataset() elif self.dataset_type == "aime2025": self.dataset = Aime2025Dataset() + elif self.dataset_type == "aime2026": + self.dataset = Aime2026Dataset() elif self.dataset_type == "gsm8k": self.dataset = Gsm8kDataset() elif self.dataset_type == "gpqa": @@ -219,7 +235,9 @@ def add_result( tps_gen: Optional[float] = None, t_gen_ms: Optional[float] = None, reasoning_content: Optional[str] = None, - server_name: Optional[str] = None + server_name: Optional[str] = None, + chunk_idx: int = 0, + problem_idx: int = 0, ): with self._lock: if "cases" not in self.task_states: @@ -238,7 +256,9 @@ def add_result( "tps_gen": tps_gen, "t_gen_ms": t_gen_ms, "reasoning_content": reasoning_content, - "server_name": server_name + "server_name": server_name, + "chunk_idx": chunk_idx, + "problem_idx": problem_idx, } self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False)) @@ -275,6 +295,9 @@ def dump(self): all_cases = {} for i, task_id in tasks_to_save: question_text, prompt, expected = self.get_case(i) + # Extract chunk_idx from task_id for pending cases + _parts = task_id.rsplit("_", 2) + _chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 if task_id in self.task_states.get("cases", {}): all_cases[task_id] = self.task_states["cases"][task_id] else: @@ -292,7 +315,9 @@ def dump(self): "tps_gen": None, "t_gen_ms": None, "reasoning_content": None, - "server_name": None + "server_name": None, + "chunk_idx": _chunk_idx, + "problem_idx": i, } ci_lower, ci_upper = self.accuracy_ci() @@ -368,11 +393,12 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) escaped_server = self._escape_html(server_name) + answer_class = status_class if status == "ok" else "" rows.append(f"""<tr class="task-row" onclick="toggleDetails('{task_id}')"> <td>{task_id}</td> <td class="{status_class}">{status_text}</td> <td>{self._escape_html(expected)}</td> - <td>{self._escape_html(answer)}</td> + <td class="{answer_class}">{self._escape_html(answer)}</td> <td>{tokens_str}</td> <td>{tps_str}</td> <td>{t_gen_str}</td> @@ -391,6 +417,53 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A rows_html = "\n".join(rows) + # ---- per-problem summary table ---- + problem_groups: Dict[int, List[Dict[str, Any]]] = {} + for _tid, _case in cases.items(): + if _case.get("status") != "ok": + continue + _pidx = _case.get("problem_idx") + if _pidx is None: + _p_parts = _tid.rsplit("_", 2) + _pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0 + problem_groups.setdefault(_pidx, []).append(_case) + + summary_rows_html = "" + if problem_groups: + def _stat(v, fmt=".1f", avg_fmt=None): + if not v: + return ("โ€“", "โ€“", "โ€“") + af = fmt if avg_fmt is None else avg_fmt + return (f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}") + + summary_data = [] + for pidx, g in problem_groups.items(): + runs = len(g) + n_ok = sum(1 for c in g if c.get("correct", False)) + toks = [c["tokens"] for c in g if c.get("tokens") is not None] + tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None] + tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None] + summary_data.append(( + pidx, runs, n_ok, + _stat(toks, "d", ".0f"), + _stat(tps), + _stat(tg), + )) + + summary_data.sort(key=lambda r: r[0]) # sort by problem index ascending + + summary_rows_html = "\n".join( + f"""<tr class="summary-row"> + <td>{p:03d}</td> + <td>{r}</td> + <td>{n}/{r}</td> + <td>{tk[0]}</td><td>{tk[1]}</td><td>{tk[2]}</td> + <td>{tp[0]}</td><td>{tp[1]}</td><td>{tp[2]}</td> + <td>{tg[0]}</td><td>{tg[1]}</td><td>{tg[2]}</td> + </tr>""" + for p, r, n, tk, tp, tg in summary_data + ) + html_content = f"""<!DOCTYPE html> <html> <head> @@ -398,10 +471,10 @@ def dump_html(self, tasks_to_save: List[Tuple[int, str]], all_cases: Dict[str, A <title>{self.dataset_type.upper()} Eval
- {self.dataset_type.upper()} - Model: {self.model_name or 'N/A'} - Accuracy: {accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%] - Correct: {n_correct} / {len(completed)} - Pending: {n_pending} - Time: {self.total_time:.1f}s - Sampling: {sampling_str} +
Dataset
{self.dataset_type.upper()}
+
Model
{self.model_name or 'N/A'}
+
Accuracy
{accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]
+
Correct
{n_correct} / {len(completed)}
+
Pending
{n_pending}
+
Time
{self.total_time:.1f}s
+
Sampling
{sampling_str}
+
+
+ + +
+
+ + + + + + + + + + + + + + + {rows_html} + +
IDGoldAnswerTokensT/sGen sServer
+
+
+ + + + + + + + + + + + + + + + + + + + + {summary_rows_html} + +
ProblemRunsCorrectTokensT/sGen s
minavgmaxminavgmaxminavgmax
- - - - - - - - - - - - - - - {rows_html} - -
IDGoldAnswerTokensT/sGen sServer
""" @@ -679,6 +803,47 @@ def get_prompt(self, question: Dict) -> str: question=self.get_question_text(question), ) +class Aime2026Dataset(BaseDataset): + def __init__(self): + self.questions = [] + self._load_dataset() + + def _load_dataset(self): + print(f"Loading AIME2026 dataset...") + from datasets import load_dataset + + cache_path = cache_dir / "MathArena___aime_2026" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = load_dataset("MathArena/aime_2026", "default", split="train", cache_dir=str(cache_path)) + else: + ds = load_dataset("MathArena/aime_2026", "default", split="train") + + self.questions = [] + for row in ds: + question = dict(row) + question["dataset_type"] = "aime2026" + self.questions.append(question) + + print(f"AIME2026 dataset loaded: {len(self.questions)} questions") + + def get_question(self, index: int) -> Dict: + """Get question by index""" + return self.questions[index] + + def get_question_text(self, question: Dict) -> str: + """Get question string""" + return question["problem"] + + def get_answer(self, question: Dict) -> str: + return str(question["answer"]) + + def get_prompt(self, question: Dict) -> str: + """Get formatted prompt for the question""" + return TEMPLATE_REGISTRY["aime2026"].format( + question=self.get_question_text(question), + ) + class Gsm8kDataset(BaseDataset): def __init__(self, split: str = "test"): self.split = split @@ -1007,12 +1172,19 @@ def _process_single_case( ) -> TaskState: question_text, prompt, expected = eval_state.get_case(i) + # Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}" + _parts = task_id.rsplit("_", 2) + chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 + problem_idx = i + task_state = TaskState( task_id=task_id, prompt=prompt, expected=expected, question_text=question_text, - server_name=server_config.name + server_name=server_config.name, + chunk_idx=chunk_idx, + problem_idx=problem_idx, ) try: @@ -1030,7 +1202,8 @@ def _process_single_case( eval_state.add_result( task_id, prompt, expected, result, None, {"finish_reason": finish_reason}, False, task_state.status, - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() return task_state @@ -1053,7 +1226,8 @@ def _process_single_case( eval_state.add_result( task_id, prompt, expected, result, answer, grader_log, is_correct, "ok", - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() @@ -1188,7 +1362,7 @@ def main(): "--dataset", type=str, default="aime", - choices=["aime", "aime2025", "gsm8k", "gpqa"], + choices=["aime", "aime2025", "aime2026", "gsm8k", "gpqa"], help="Dataset type (default: aime)" ) parser.add_argument( diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py index 2f9cdc5450d..e64ba89335d 100755 --- a/examples/llama-eval/llama-server-simulator.py +++ b/examples/llama-eval/llama-server-simulator.py @@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]: return int(match.group(0)) class AimeDataset: - def __init__(self, split: str = "train"): + def __init__(self, split: str = "train", dataset_type: str = "aime"): self.split = split + self.dataset_type = dataset_type self.questions: List[Dict] = [] self._load_dataset() - def _load_dataset(self): - print(f"Loading AIME dataset (split: {self.split})...") + def _get_question_text(self, question: Dict) -> str: + """Get question text, handling different dataset field names.""" + return question.get("problem", question.get("question", "")) - cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" - if cache_path.exists(): - print(f"Using cached dataset from {cache_path}") - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + def _load_dataset(self): + if self.dataset_type == "aime": + print(f"Loading AIME dataset (split: {self.split})...") + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + elif self.dataset_type == "aime2025": + print(f"Loading AIME2025 dataset...") + ds_list = [] + for config_name in ["AIME2025-I", "AIME2025-II"]: + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test") + ds_list.extend(ds) + ds = ds_list else: - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + raise ValueError(f"Unknown dataset type: {self.dataset_type}") self.questions = list(ds) - print(f"AIME dataset loaded: {len(self.questions)} questions") + print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions") def find_question(self, request_text: str) -> Optional[Dict]: + # Strip common template prefixes to get the actual question text + # Templates include things like "Solve the following math problem step by step..." + # The actual question usually follows a blank line or after the template instruction + cleaned = request_text + # Split on double newline and take the part that looks like the problem + parts = cleaned.split('\n\n') + if len(parts) > 1: + # Find the part that's longest (likely the actual problem text) + problem_parts = [p for p in parts if len(p.strip()) > 100] + if problem_parts: + cleaned = max(problem_parts, key=lambda x: len(x)) + best_match = None best_distance = -1 best_index = -1 for i, question in enumerate(self.questions): - question_text = question["problem"] - request_lower = request_text.lower() + question_text = self._get_question_text(question) + request_lower = cleaned.lower() question_lower = question_text.lower() + # Check if question text is contained in the cleaned request + if question_lower in request_lower or request_lower in question_lower: + debug_log(f"DEBUG: Found substring match at index {i}") + return question + # Exact match if question_lower == request_lower: debug_log(f"DEBUG: Found exact match at index {i}") @@ -118,7 +154,7 @@ def find_question(self, request_text: str) -> Optional[Dict]: debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}") return best_match - debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...") + debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...") return None def get_answer(self, question: Dict) -> str: @@ -134,15 +170,16 @@ def __init__( port: int = 8033, host: str = "localhost", success_rate: float = 0.8, - dataset_split: str = "train" + dataset_split: str = "train", + dataset_type: str = "aime" ): self.port = port self.host = host self.success_rate = success_rate - self.dataset = AimeDataset(dataset_split) + self.dataset = AimeDataset(dataset_split, dataset_type) self.eval_state = EvalState( - id="aime-2025", - tasks=["aime"], + id=dataset_type, + tasks=[dataset_type], task_states={}, sampling_config={"temperature": 0, "max_tokens": 2048} ) @@ -159,6 +196,10 @@ def _generate_response( else: response_text = self._generate_wrong_answer(question) + comp_tokens = random.randint(10000, 60000) + tps_gen = random.uniform(90.0, 110.0) + t_gen_ms = comp_tokens / tps_gen * 1000 + return { "id": f"chatcmpl-{int(time.time())}", "object": "chat.completion", @@ -176,8 +217,12 @@ def _generate_response( ], "usage": { "prompt_tokens": 100, - "completion_tokens": 50, - "total_tokens": 150 + "completion_tokens": comp_tokens, + "total_tokens": 100 + comp_tokens + }, + "timings": { + "predicted_ms": t_gen_ms, + "predicted_per_second": tps_gen } } @@ -218,6 +263,12 @@ def _process_request(self, request_data: Dict) -> Dict: return response class RequestHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/v1/models": + self._send_json({"data": [{"id": "llama", "object": "model"}]}, 200) + return + self._send_json({"error": "Not found"}, 404) + def do_POST(self): if self.path != "/v1/chat/completions": self._send_json({"error": "Not found"}, 404) @@ -280,6 +331,13 @@ def main(): default=0.8, help="Success rate 0-1 (default: 0.8)" ) + parser.add_argument( + "--dataset", + type=str, + default="aime", + choices=["aime", "aime2025"], + help="Dataset type (default: aime)" + ) parser.add_argument( "--dataset-split", type=str, @@ -294,7 +352,8 @@ def main(): port=args.port, host=args.host, success_rate=args.success_rate, - dataset_split=args.dataset_split + dataset_split=args.dataset_split, + dataset_type=args.dataset ) server = HTTPServer((args.host, args.port), RequestHandler) @@ -304,7 +363,7 @@ def main(): print("\n=== llama-server-simulator ===") print(f"Server running on http://{args.host}:{args.port}") print(f"Success rate: {args.success_rate}") - print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions") + print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions") print("\nPress Ctrl+C to stop\n") try: diff --git a/examples/llama.android/lib/build.gradle.kts b/examples/llama.android/lib/build.gradle.kts index 9b290d6d4a7..ae95f41a838 100644 --- a/examples/llama.android/lib/build.gradle.kts +++ b/examples/llama.android/lib/build.gradle.kts @@ -25,6 +25,7 @@ android { arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON" arguments += "-DBUILD_SHARED_LIBS=ON" + arguments += "-DLLAMA_BUILD_APP=OFF" arguments += "-DLLAMA_BUILD_COMMON=ON" arguments += "-DLLAMA_OPENSSL=OFF" diff --git a/examples/model-conversion/README.md b/examples/model-conversion/README.md index c43e642fee7..344f0ac94a9 100644 --- a/examples/model-conversion/README.md +++ b/examples/model-conversion/README.md @@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO ## HuggingFace utilities The following targets are useful for creating collections and model repositories -on Hugging Face in the the ggml-org. These can be used when preparing a release +on Hugging Face in the ggml-org. These can be used when preparing a release to script the process for new model releases. For the following targets a `HF_TOKEN` environment variable is required. diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py index 614c1a86b9a..001d5889655 100755 --- a/examples/model-conversion/scripts/embedding/run-original-model.py +++ b/examples/model-conversion/scripts/embedding/run-original-model.py @@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device print("Using SentenceTransformer to apply all numbered layers") model = SentenceTransformer(model_path) tokenizer = model.tokenizer - config = model[0].auto_model.config + config = model[0].auto_model.config # ty: ignore[unresolved-attribute] else: tokenizer = AutoTokenizer.from_pretrained(model_path) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt deleted file mode 100644 index 78024672e77..00000000000 --- a/examples/save-load-state/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-save-load-state) -add_executable(${TARGET} save-load-state.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp deleted file mode 100644 index e6f5e9802ab..00000000000 --- a/examples/save-load-state/save-load-state.cpp +++ /dev/null @@ -1,320 +0,0 @@ -#include "arg.h" -#include "common.h" -#include "llama.h" - -#include -#include -#include - - -int main(int argc, char ** argv) { - std::setlocale(LC_NUMERIC, "C"); - - common_params params; - - params.prompt = "The quick brown fox"; - params.sampling.seed = 1234; - - const std::string_view state_file = "dump_state.bin"; - - common_init(); - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { - return 1; - } - - if (params.n_parallel == 1) { - // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache - printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__); - params.kv_unified = true; - } - - if (params.n_predict < 0) { - params.n_predict = 16; - } - - auto n_past = 0; - - std::string result0; - std::string result1; - std::string result2; - std::string result3; - - // init - - ggml_backend_load_all(); - - auto llama_init = common_init_from_params(params); - - auto * model = llama_init->model(); - auto * ctx = llama_init->context(); - - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } - - auto sparams = llama_sampler_chain_default_params(); - - llama_sampler * smpl = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); - - // tokenize prompt - auto tokens = common_tokenize(ctx, params.prompt, true); - - const bool save_state = true; - if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) { - return 1; - } - - // first run - printf("\nfirst run: %s", params.prompt.c_str()); - - llama_batch batch = llama_batch_init(1, 0, 1); - - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = common_token_to_piece(ctx, next_token); - - printf("%s", next_token_str.c_str()); - result0 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - printf("\n\n"); - - // make new context - llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params)); - - llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed)); - - printf("\nsecond run: %s", params.prompt.c_str()); - - // load state from file - std::vector unused_sts(tokens.size()); // unused session tokens. - size_t n_token_count_out = 0; - - if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx2, tokens.back(), n_past)) { - return 1; - } - ++n_past; - - // second run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = common_token_to_piece(ctx2, next_token); - - printf("%s", next_token_str.c_str()); - result1 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); - - if (llama_decode(ctx2, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - printf("\n\n"); - - if (result0 != result1) { - fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); - return 1; - } - - // make new context - auto params_ctx3 = common_context_params_to_llama(params); - params_ctx3.n_seq_max = 2; - llama_context * ctx3 = llama_init_from_model(model, params_ctx3); - - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); - - printf("\nsingle seq run: %s", params.prompt.c_str()); - - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; - - if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx3, tokens.back(), n_past)) { - return 1; - } - ++n_past; - - // save seq 0 and load into seq 1 - { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); - if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); - - // erase whole kv - llama_memory_clear(llama_get_memory(ctx3), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); - - // restore kv into seq 1 - const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); - if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); - } - - // third run with seq 1 instead of 0 - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = common_token_to_piece(ctx3, next_token); - - printf("%s", next_token_str.c_str()); - result2 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); - - if (llama_decode(ctx3, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - // test on-device state save/load - auto params_ctx4 = common_context_params_to_llama(params); - params_ctx4.n_seq_max = 2; - llama_context * ctx4 = llama_init_from_model(model, params_ctx4); - - llama_sampler * smpl4 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed)); - - printf("\nsingle seq run: %s", params.prompt.c_str()); - - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; - - if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx4, tokens.back(), n_past)) { - return 1; - } - ++n_past; - - // save seq 0 and load into seq 1 - { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); - const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); - - // erase whole kv - llama_memory_clear(llama_get_memory(ctx4), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); - - // restore kv into seq 0 - const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); - } - - // forth run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl4, ctx4, -1); - auto next_token_str = common_token_to_piece(ctx4, next_token); - - printf("%s", next_token_str.c_str()); - result3 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); - - if (llama_decode(ctx4, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - printf("\n"); - - llama_sampler_free(smpl); - llama_sampler_free(smpl2); - llama_sampler_free(smpl3); - llama_sampler_free(smpl4); - - llama_batch_free(batch); - - // this one is managed by common_init_result - //llama_free(ctx); - - llama_free(ctx2); - llama_free(ctx3); - llama_free(ctx4); - - if (result0 != result2) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); - return 1; - } - - if (result0 != result3) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); - return 1; - } - - fprintf(stderr, "\n%s : success\n", __func__); - - return 0; -} diff --git a/examples/sycl/start-svr.sh b/examples/sycl/start-svr.sh index b7ea3096486..ce31ec51d2b 100755 --- a/examples/sycl/start-svr.sh +++ b/examples/sycl/start-svr.sh @@ -111,7 +111,6 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then echo "Use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}" - export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}" echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}" else echo "Use all Intel GPUs, including iGPU & dGPU" diff --git a/examples/sycl/test.sh b/examples/sycl/test.sh index 38d2e926896..116047cd2ea 100755 --- a/examples/sycl/test.sh +++ b/examples/sycl/test.sh @@ -119,7 +119,6 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then echo "Use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}" - export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}" echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}" else echo "Use all Intel GPUs, including iGPU & dGPU" diff --git a/examples/sycl/win-start-svr.bat b/examples/sycl/win-start-svr.bat index 4d850cbaa6f..13b5159e002 100644 --- a/examples/sycl/win-start-svr.bat +++ b/examples/sycl/win-start-svr.bat @@ -164,7 +164,6 @@ if not "%GGML_SYCL_DEVICE%"=="-1" ( echo Use %GGML_SYCL_DEVICE% as main GPU REM Use single GPU only. set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%" - set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%" echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR% ) else ( echo Use all Intel GPUs, including iGPU ^& dGPU diff --git a/examples/sycl/win-test.bat b/examples/sycl/win-test.bat index 781d17705db..39640908b07 100644 --- a/examples/sycl/win-test.bat +++ b/examples/sycl/win-test.bat @@ -186,7 +186,6 @@ if not "%GGML_SYCL_DEVICE%"=="-1" ( echo Use %GGML_SYCL_DEVICE% as main GPU REM Use single GPU only. set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%" - set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%" echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR% ) else ( echo Use all Intel GPUs, including iGPU ^& dGPU diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index de765ef3e24..dc8899b46ef 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -4,7 +4,7 @@ project("ggml" C CXX ASM) ### GGML Version set(GGML_VERSION_MAJOR 0) -set(GGML_VERSION_MINOR 11) +set(GGML_VERSION_MINOR 13) set(GGML_VERSION_PATCH 1) set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}") @@ -353,7 +353,7 @@ if (GGML_STANDALONE) @ONLY) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc - DESTINATION share/pkgconfig) + DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) endif() # diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in index 91c9d5cd343..23a3066f56d 100644 --- a/ggml/cmake/ggml-config.cmake.in +++ b/ggml/cmake/ggml-config.cmake.in @@ -6,6 +6,7 @@ include(CMakeFindDependencyMacro) find_dependency(Threads) if (NOT GGML_SHARED_LIB) + set(GGML_BASE_INTERFACE_LINK_LIBRARIES "") set(GGML_CPU_INTERFACE_LINK_LIBRARIES "") set(GGML_CPU_INTERFACE_LINK_OPTIONS "") @@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB) if (GGML_OPENMP_ENABLED) find_dependency(OpenMP) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "") + if (TARGET OpenMP::OpenMP_C) + list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C) + endif() + if (TARGET OpenMP::OpenMP_CXX) + list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX) + endif() + list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES}) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES}) endif() if (GGML_CPU_HBM) @@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml) add_library(ggml::ggml-base UNKNOWN IMPORTED) set_target_properties(ggml::ggml-base PROPERTIES - IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") + IMPORTED_LOCATION "${GGML_BASE_LIBRARY}" + INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}") set(_ggml_all_targets "") if (NOT GGML_BACKEND_DL) diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 78aa059dde3..a7926a21a9a 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -76,6 +76,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i // Utils // Create a buffer and allocate all the tensors in a ggml_context // ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft +// ggml_backend_alloc_ctx_tensors_from_buft returns NULL on failure or if all tensors in ctx are already allocated or zero-sized GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index b6f73739809..2924fdbe988 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -381,11 +381,15 @@ extern "C" { // - most tensors have n_segments == 1 and a contiguous slice of the tensor data // - some tensors have an inhomogenenous data layout along the split axis, // those tensors are divided into segments which are each individually split across devices - // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis, - // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1], + // - ne has one entry per segment and device and that segment repeats nr times, + // in total when accounting for repetitions the segments add up to ggml_tensor::ne for that axis, + // the outer/inner loops are over segments/devices like [seg0_dev0_r0, seg0_dev1_r0, seg0_dev0_r1, seg0_dev1_r1, seg1_dev0_r0, seg1_dev1_r0], // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments - // that each need to be split individually across devices so that each device gets a slice of Q, K, and V + // that each need to be split individually across devices so that each device gets a slice of Q, K, and V, + // the Q matrix can be larger than the K and V matrices so this can either be expressed as 3 segments or as 2 segments + // where the segment for K/V repeats twice int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES]; + uint32_t nr[16]; uint32_t n_segments; }; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3357a0d9985..f6725265504 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1189,8 +1189,8 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); - // a - x - // b - dy + // a - dy + // b - x GGML_API struct ggml_tensor * ggml_silu_back( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2541,6 +2541,11 @@ extern "C" { // TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST] // ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306 + // + // state is a 3D tensor of shape (S_v*S_v*H, K, n_seqs): + // K == 1: output carries the final state only. + // K > 1: output carries K snapshot slots; the kernel writes the last min(n_tokens, K) + // per-token snapshots into the trailing slots GGML_API struct ggml_tensor * ggml_gated_delta_net( struct ggml_context * ctx, struct ggml_tensor * q, diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 02d5f221c03..67851ba6f16 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -76,10 +76,16 @@ extern "C" { struct ggml_context ** ctx; }; + // callback to simulate or wrap a FILE pointer - read up to `len` bytes at `offset` into `output` and return the number of bytes read + typedef size_t (*gguf_reader_callback_t)(void * userdata, void * output, uint64_t offset, size_t len); + GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); + GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params); + + // max_chunk_read is the maximum number of bytes that the GGUF code will read at once from the callback, a value of 0 means no limit + GGML_API struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params); GGML_API void gguf_free(struct gguf_context * ctx); @@ -87,7 +93,7 @@ extern "C" { GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx); GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); - GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); // padded to gguf_get_alignment if and only if the gguf_context contains at least one tensor GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx); GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 3e48860bfc8..c26c3f1470d 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC) target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC) endif() +if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "") + else() + set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "") + message(WARNING "OpenMP not found") + endif() +else() + set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "") +endif() + +if (GGML_OPENMP_ENABLED) + target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP) + target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) +endif() + add_library(ggml ggml-backend-dl.cpp ggml-backend-reg.cpp) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index a4b01ccf8a1..3bda9abbe03 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) { // shift all elements after idx by 1 to the left, overwriting the element at idx - for (int i = idx; i < chunk->n_free_blocks; i++) { + for (int i = idx; i < chunk->n_free_blocks - 1; i++) { chunk->free_blocks[i] = chunk->free_blocks[i+1]; } chunk->n_free_blocks--; diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index c0ffd9a048b..8c44c3e44ae 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -392,64 +393,103 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type( // meta backend buffer // +// Container to hold the tensor slices per simple ggml backend buffer. +struct ggml_backend_meta_simple_tensor_container { + std::vector ctxs; + std::map> simple_tensors; + + ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) { + ctxs.reserve(n_simple); + for (int i = 0; i < n_simple; i++) { + ctxs.emplace_back(ggml_init(params)); + } + } + ggml_backend_meta_simple_tensor_container() {} +}; + struct ggml_backend_meta_buffer_context { + // FIXME + // Most tensors can simply be stored statically in their own buffer. + // Externally created views however also need a mapping to simple tensors but they use the buffer of the view source. + // If external views are simply using that buffer they will slowly deplete its memory. + // Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp. + // Long-term: tie the lifetime of external views to the meta backend executing the graph instead, + // currently not possible due to graph-external operations in the backend scheduler. + ggml_backend_meta_simple_tensor_container stc_static; + ggml_backend_meta_simple_tensor_container stc_compute[2]; + int stc_compute_index = 0; + int stc_compute_index_next = 0; + std::vector bufs; + + // FIXME + // The size of the split state cache is unbounded and can theoretically grow infinitely large. + // However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive. static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding); - std::map, std::pair> split_state_cache; - std::map< const ggml_tensor *, std::vector> simple_tensors; - - struct buffer_config { - ggml_context * ctx; - ggml_backend_buffer_t buf; - - buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {} - }; - std::vector buf_configs; int debug; - ggml_backend_meta_buffer_context() { + ggml_backend_meta_buffer_context( + ggml_backend_meta_simple_tensor_container & stc_static, + ggml_backend_meta_simple_tensor_container & stc_compute_0, + ggml_backend_meta_simple_tensor_container & stc_compute_1, + const std::vector & bufs) + : stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} { + this->bufs.reserve(bufs.size()); + for (ggml_backend_buffer_t buf : bufs) { + this->bufs.emplace_back(buf); + } const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG"); debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0; } + + ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) { + if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) { + return stc_static; + } + return stc_compute[stc_compute_index]; + } }; static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) { GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; - for (auto & [ctx, buf] : buf_ctx->buf_configs) { - ggml_backend_buffer_free(buf); - ggml_free(ctx); - } delete buf_ctx; } static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) { GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context; - return buf_ctx->buf_configs.size(); + return buf_ctx->bufs.size(); } static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) { GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context; - GGML_ASSERT(index < buf_ctx->buf_configs.size()); - return buf_ctx->buf_configs[index].buf; + GGML_ASSERT(index < buf_ctx->bufs.size()); + return buf_ctx->bufs[index].get(); } static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) { GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; - GGML_ASSERT(index < buf_ctx->buf_configs.size()); + GGML_ASSERT(index < buf_ctx->bufs.size()); - auto it = buf_ctx->simple_tensors.find(tensor); - if (it == buf_ctx->simple_tensors.end()) { + ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor); + auto it = stc.simple_tensors.find(tensor); + if (it == stc.simple_tensors.end()) { return nullptr; } return it->second[index]; } -static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) { +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync); + +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state( + ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) { + // FIXME Currently this function preserves/erases the information in n_segments and nr in an inconsistent way. + // Since the operations in question are developed specifically for llama.cpp this currently does not manifest as a bug there. + // However, in a broader ggml context with arbitrary ggml graphs this can lead to unexpected results. const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; @@ -460,11 +500,11 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co for (size_t j = 0; j < n_bufs; j++) { int64_t sum_a = 0; for (size_t s = 0; s < a.n_segments; s++) { - sum_a += a.ne[s*n_bufs + j]; + sum_a += a.ne[s*n_bufs + j] * a.nr[s]; } int64_t sum_b = 0; for (size_t s = 0; s < b.n_segments; s++) { - sum_b += b.ne[s*n_bufs + j]; + sum_b += b.ne[s*n_bufs + j] * b.nr[s]; } if (sum_a != sum_b) { return false; @@ -474,7 +514,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co }; auto handle_generic = [&](const std::vector & src_ss, bool scalar_only) -> ggml_backend_meta_split_state { - ggml_backend_meta_split_state ret = {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, 1}; + ggml_backend_meta_split_state ret = {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, {1}, 1}; for (size_t i = 0; i < GGML_MAX_SRC; i++) { if (tensor->src[i] == nullptr || tensor->src[i] == tensor) { continue; @@ -482,15 +522,15 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co if (ret.axis == GGML_BACKEND_SPLIT_AXIS_NONE) { ret = src_ss[i]; } else if (!split_states_equal(src_ss[i], ret)) { - ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; break; } } if (ret.axis == GGML_BACKEND_SPLIT_AXIS_NONE) { - ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; } if (scalar_only && ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) { - ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; } GGML_ASSERT(ret.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN); return ret; @@ -534,42 +574,24 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co auto handle_mul_mat = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) { - return {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1}; + return {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, {1}, 1}; } if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1 && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) { ggml_backend_meta_split_state ret = src_ss[0]; ret.axis = GGML_BACKEND_SPLIT_AXIS_0; + ret.nr[0] = 1; ret.n_segments = 1; return ret; } if (src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_1 && src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) { - ggml_backend_meta_split_state ret = src_ss[1]; - ret.n_segments = 1; - return ret; + return src_ss[1]; } if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_0 && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_0) { GGML_ASSERT(split_states_equal(src_ss[0], src_ss[1])); - return {assume_sync ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_PARTIAL, {0}, 1}; + return {assume_sync ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_PARTIAL, {0}, {1}, 1}; } GGML_ABORT("fatal error"); - //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; - }; - - auto handle_cpy = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { - if (src_ss[0].axis >= 0 && src_ss[0].axis < GGML_MAX_DIMS) { - int64_t ne_split_src = tensor->src[0]->ne[0]; - for (int dim = 1; dim <= src_ss[0].axis; dim++) { - ne_split_src *= tensor->src[0]->ne[dim]; - } - int64_t ne_split_dst = 1; - for (int dim = 0; dim < GGML_MAX_DIMS; dim++) { - ne_split_dst *= tensor->ne[dim]; - if (ne_split_dst == ne_split_src) { - return {ggml_backend_meta_split_axis(dim), {0}, 1}; - } - } - } - return handle_generic(src_ss, /*scalar_only =*/ false); + //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; }; auto handle_reshape = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { @@ -578,33 +600,25 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co case GGML_BACKEND_SPLIT_AXIS_1: case GGML_BACKEND_SPLIT_AXIS_2: case GGML_BACKEND_SPLIT_AXIS_3: { - GGML_ASSERT(!ggml_is_permuted(tensor) && !ggml_is_permuted(tensor->src[0])); - if (src_ss[0].axis == ggml_n_dims(tensor->src[0]) - 1) { - return {ggml_backend_meta_split_axis(ggml_n_dims(tensor) - 1), {0}, 1}; + GGML_ASSERT(src_ss[0].n_segments == 1); + if (src_ss[0].axis == ggml_n_dims(tensor->src[0]) - 1 && src_ss[0].nr[0] == 1) { + return {ggml_backend_meta_split_axis(ggml_n_dims(tensor) - 1), {0}, {1}, 1}; } - std::vector base_ne_in; - base_ne_in.reserve(GGML_MAX_DIMS - src_ss[0].axis); - { - base_ne_in.push_back(1); - int dim = 0; - for (; dim <= src_ss[0].axis; dim++) { - base_ne_in[0] *= tensor->src[0]->ne[dim]; - } - for (; dim <= GGML_MAX_DIMS; dim++) { - base_ne_in.push_back(base_ne_in.back() * tensor->src[0]->ne[dim]); - } + int64_t base_ne_in = tensor->src[0]->ne[0]; + for (int dim = 1; dim <= src_ss[0].axis; dim++) { + base_ne_in *= tensor->src[0]->ne[dim]; } + base_ne_in /= src_ss[0].nr[0]; int64_t base_ne_out = 1; for (int dim = 0; dim < GGML_MAX_DIMS; dim++) { const int64_t base_ne_out_next = base_ne_out *= tensor->ne[dim]; - for (const int64_t & bni : base_ne_in) { - if (bni == base_ne_out_next) { - return {ggml_backend_meta_split_axis(dim), {0}, 1}; - } + if (base_ne_out_next % base_ne_in == 0) { + return {ggml_backend_meta_split_axis(dim), {0}, {uint32_t(base_ne_out_next/base_ne_in)}, 1}; } - if (base_ne_out_next > base_ne_in[0]) { - GGML_ASSERT(dim + 1 < GGML_MAX_DIMS); - return {ggml_backend_meta_split_axis(dim + 1), {0}, 1}; + if (base_ne_out_next > base_ne_in) { + GGML_ASSERT(src_ss[0].n_segments == 1); + GGML_ASSERT(src_ss[0].nr[0] == 1); + return {ggml_backend_meta_split_axis(dim), {0}, {1}, 1}; } base_ne_out = base_ne_out_next; } @@ -616,11 +630,18 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co } default: { GGML_ABORT("fatal error"); - //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; } } }; + auto handle_cpy = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { + if (src_ss[0].axis >= 0 && src_ss[0].axis < GGML_MAX_DIMS) { + return handle_reshape(src_ss); + } + return handle_generic(src_ss, /*scalar_only =*/ false); + }; + auto handle_view = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { if (ggml_is_contiguous(tensor) && ggml_is_contiguous(tensor->src[0])) { return handle_reshape(src_ss); @@ -644,7 +665,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co if (!ggml_is_permuted(tensor) && !ggml_is_permuted(tensor->src[0]) && axis >= 0 && axis < GGML_MAX_DIMS-1) { for (int dim = 0; dim < GGML_MAX_DIMS-1; dim++) { if (tensor->nb[dim+1] == tensor->src[0]->nb[axis+1]) { - return {ggml_backend_meta_split_axis(dim), {0}, 1}; + return {ggml_backend_meta_split_axis(dim), {0}, {1}, 1}; } } GGML_ABORT("fatal error"); @@ -653,7 +674,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co return src_ss[0]; } GGML_ABORT("view of permuted tensor not implemented"); - //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; }; auto handle_permute = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { @@ -662,7 +683,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co case GGML_BACKEND_SPLIT_AXIS_1: case GGML_BACKEND_SPLIT_AXIS_2: case GGML_BACKEND_SPLIT_AXIS_3: { - return {ggml_backend_meta_split_axis(tensor->op_params[src_ss[0].axis]), {0}, 1}; + GGML_ASSERT(src_ss[0].n_segments == 1 || src_ss[0].nr[0] == 1); + return {ggml_backend_meta_split_axis(tensor->op_params[src_ss[0].axis]), {0}, {src_ss[0].nr[0]}, 1}; } case GGML_BACKEND_SPLIT_AXIS_MIRRORED: case GGML_BACKEND_SPLIT_AXIS_PARTIAL: { @@ -670,7 +692,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co } default: { GGML_ABORT("fatal error"); - //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; } } }; @@ -679,7 +701,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co switch (src_ss[0].axis) { case GGML_BACKEND_SPLIT_AXIS_0: case GGML_BACKEND_SPLIT_AXIS_1: { - return {ggml_backend_meta_split_axis(int(src_ss[0].axis) ^ 1), {0}, 1}; + GGML_ASSERT(src_ss[0].n_segments == 1 || src_ss[0].nr[0] == 1); + return {ggml_backend_meta_split_axis(int(src_ss[0].axis) ^ 1), {0}, {src_ss[0].nr[0]}, 1}; } case GGML_BACKEND_SPLIT_AXIS_2: case GGML_BACKEND_SPLIT_AXIS_3: @@ -689,7 +712,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co } default: { GGML_ABORT("fatal error"); - //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + //return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; } } }; @@ -727,16 +750,16 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co GGML_ASSERT( src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_2); GGML_ASSERT(tensor->src[4] == nullptr || src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED); GGML_ASSERT(tensor->src[4] == nullptr || src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_0); - return {GGML_BACKEND_SPLIT_AXIS_1, {0}, 1}; + return {GGML_BACKEND_SPLIT_AXIS_1, {0}, {1}, 1}; }; auto handle_ssm_conv = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { if (src_ss[0].axis == src_ss[1].axis) { if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_0) { - return {GGML_BACKEND_SPLIT_AXIS_1, {0}, 1}; + return {GGML_BACKEND_SPLIT_AXIS_1, {0}, {1}, 1}; } if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1) { - return {GGML_BACKEND_SPLIT_AXIS_0, {0}, 1}; + return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1}; } } return handle_generic(src_ss, /*scalar_only =*/ false); @@ -744,8 +767,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co auto handle_gated_delta_net = [&](const std::vector & src_ss) -> ggml_backend_meta_split_state { if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && - src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && - src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) { + src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && + src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) { return src_ss[0]; } GGML_ASSERT(src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1); @@ -753,13 +776,15 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co GGML_ASSERT(src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_1); GGML_ASSERT(src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_1); GGML_ASSERT(src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_1); - GGML_ASSERT(src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_2); - return {GGML_BACKEND_SPLIT_AXIS_0, {0}, 1}; + // state shape is (S_v*S_v*H, K, n_seqs); the heads dim is nested inside axis 0, + // so a head-aligned split on the input cache reshapes to axis 0 here (not axis 2). + GGML_ASSERT(src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_2 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_1 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_0); + return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1}; }; auto calculate_split_state = [&]() -> ggml_backend_meta_split_state { if (ggml_nelements(tensor) == 0) { - return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; } if (ggml_backend_buffer_get_usage(tensor->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE && tensor->view_src == nullptr) { ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(tensor->buffer)); @@ -768,29 +793,31 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co if (ret.axis >= 0 && ret.axis <= GGML_MAX_DIMS) { const int64_t granularity = ret.axis == GGML_BACKEND_SPLIT_AXIS_0 ? ggml_blck_size(tensor->type) : 1; int64_t ne_sum = 0; - for (size_t sj = 0; sj < ret.n_segments*n_bufs; sj++) { - GGML_ASSERT(ret.ne[sj] % granularity == 0); - ne_sum += ret.ne[sj]; + for (size_t s = 0; s < ret.n_segments; s++) { + for (size_t j = 0; j < n_bufs; j++) { + GGML_ASSERT(ret.ne[s*n_bufs + j] % granularity == 0); + ne_sum += ret.ne[s*n_bufs + j] * ret.nr[s]; + } } GGML_ASSERT(ne_sum == tensor->ne[ret.axis]); } return ret; } - std::vector src_ss(GGML_MAX_SRC, {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, 1}); + std::vector src_ss(GGML_MAX_SRC, {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, {1}, 1}); for (size_t i = 0; i < GGML_MAX_SRC; i++) { if (tensor->src[i] == nullptr || tensor->src[i] == tensor) { - src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; continue; } - src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true); + src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true); GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN); } ggml_backend_meta_split_state split_state; switch (tensor->op) { case GGML_OP_NONE: { - split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1}; + split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, {1}, 1}; } break; case GGML_OP_DUP: { split_state = handle_generic(src_ss, /*scalar_only =*/ true); @@ -977,7 +1004,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co } break; default: { GGML_ABORT("ggml op not implemented: %s", ggml_op_name(tensor->op)); - split_state = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; + split_state = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1}; } break; } if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) { @@ -995,23 +1022,25 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co split_state.ne[s*n_bufs + j] = 0; } for (size_t s = 0; s < src_ss[i].n_segments; s++) { - split_state.ne[j] += src_ss[i].ne[s*n_bufs + j]; + split_state.ne[j] += src_ss[i].ne[s*n_bufs + j] * src_ss[i].nr[s]; } split_state.ne[j] *= tensor->ne[split_state.axis]; if (split_state.ne[j] != 0 || tensor->src[i]->ne[src_ss[i].axis] != 0) { - GGML_ASSERT(split_state.ne[j] % tensor->src[i]->ne[src_ss[i].axis] == 0); - split_state.ne[j] /= tensor->src[i]->ne[src_ss[i].axis]; + const int64_t div = tensor->src[i]->ne[src_ss[i].axis] * split_state.nr[0]; + GGML_ASSERT(split_state.ne[j] % div == 0); + split_state.ne[j] /= div; } } } else { + GGML_ASSERT(split_state.n_segments == 1); for (size_t j = 0; j < n_bufs; j++) { + // Assert that ratio is consistent: int64_t sum = 0; for (size_t s = 0; s < src_ss[i].n_segments; s++) { - sum += src_ss[i].ne[s*n_bufs + j]; + sum += src_ss[i].ne[s*n_bufs + j] * src_ss[i].nr[s]; } - // Assert that ratio is consistent: - GGML_ASSERT(split_state.ne[j] * tensor->src[i]->ne[src_ss[i].axis] - == sum * tensor->ne[split_state.axis]); + GGML_ASSERT(split_state.ne[j]*split_state.nr[0] * tensor->src[i]->ne[src_ss[i].axis] + == sum * tensor->ne[split_state.axis]); } } first_src_split_by_axis = false; @@ -1041,13 +1070,14 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co srcs_info += ", "; } const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor->src[0], true); + GGML_ASSERT(split_state.n_segments == 1); const char * axis_name = ggml_backend_meta_split_axis_name(split_state.axis); std::string ne_info; for (size_t j = 0; j < n_bufs; j++) { if (!ne_info.empty()) { ne_info += ", "; } - ne_info += std::to_string(split_state.ne[j]); + ne_info += std::to_string(split_state.ne[j]) + "x" + std::to_string(split_state.nr[0]); } srcs_info += std::string(tensor->src[i]->name) + "[" + ggml_op_name(tensor->src[i]->op) + ", " + axis_name + ", {" + ne_info + "}]"; } @@ -1056,7 +1086,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co if (!ne_info.empty()) { ne_info += ", "; } - ne_info += std::to_string(buf_ctx->split_state_cache[key].first.ne[j]); + const ggml_backend_meta_split_state & ss = buf_ctx->split_state_cache[key].first; + ne_info += std::to_string(ss.ne[j]) + "x" + std::to_string(ss.nr[0]); } GGML_LOG_DEBUG("SPLIT_STATE: {%s} -> %s[%s, %s, {%s}]\n", srcs_info.c_str(), tensor->name, ggml_op_name(tensor->op), ggml_backend_meta_split_axis_name(buf_ctx->split_state_cache[key].first.axis), ne_info.c_str()); @@ -1068,8 +1099,10 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co #ifndef NDEBUG if (ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) { int64_t ne_ret = 0; - for (size_t sj = 0; sj < ret.n_segments*n_bufs; sj++) { - ne_ret += ret.ne[sj]; + for (size_t s = 0; s < ret.n_segments; s++) { + for (size_t j = 0; j < n_bufs; j++) { + ne_ret += ret.ne[s*n_bufs + j] * ret.nr[s]; + } } assert(ne_ret == tensor->ne[int(ret.axis)]); } @@ -1077,17 +1110,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co return ret; } +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) { + GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; + return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync); +} + static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return (void *) 0x1000000000000000; // FIXME } -static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); - ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; - const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer); +static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) { + GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; + const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer); - const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true); + const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true); GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN); GGML_ASSERT(split_state.n_segments <= 16); @@ -1102,15 +1141,15 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer std::vector simple_tensors; simple_tensors.reserve(n_simple_bufs); for (size_t j = 0; j < n_simple_bufs; j++) { - ggml_context * simple_ctx = buf_ctx->buf_configs[j].ctx; - ggml_backend_buffer_t simple_buf = buf_ctx->buf_configs[j].buf; + ggml_context * simple_ctx = stc.ctxs[j].get(); + ggml_backend_buffer_t simple_buf = buf_ctx->bufs[j].get(); if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) { // TODO: the following assert fails for llama-parallel even though the results are correct: // GGML_ASSERT(ggml_is_contiguously_allocated(tensor)); ne[split_dim] = 0; for (size_t s = 0; s < split_state.n_segments; s++) { - ne[split_dim] += split_state.ne[s*n_simple_bufs + j]; + ne[split_dim] += split_state.ne[s*n_simple_bufs + j] * split_state.nr[s]; } for (int i = 0; i < GGML_MAX_DIMS; i++) { if (tensor->nb[i] > tensor->nb[split_dim]) { @@ -1156,7 +1195,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs; } else if (simple_buf != nullptr) { t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf) - + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer)); + + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer)); } t_ij->extra = tensor->extra; for (int i = 0; i < GGML_MAX_SRC; i++) { @@ -1184,7 +1223,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer for (size_t j = 0; j < n_simple_bufs; j++) { int64_t ne_sum = 0; for (size_t s = 0; s < split_state_src.n_segments; s++) { - ne_sum += split_state_src.ne[s*n_simple_bufs + j]; + ne_sum += split_state_src.ne[s*n_simple_bufs + j] * split_state_src.nr[s]; } if (ne_sum == 0) { simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE; @@ -1192,19 +1231,27 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer } } - buf_ctx->simple_tensors[tensor] = simple_tensors; + stc.simple_tensors[tensor] = simple_tensors; return GGML_STATUS_SUCCESS; } +static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; + buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next; + return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor); +} + static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer); GGML_ASSERT(ggml_is_contiguous(tensor)); const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false); - if (split_state.n_segments != 1) { + if (split_state.n_segments != 1 || split_state.nr[0] != 1) { GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS); + GGML_ASSERT(split_state.nr[0] != 0); GGML_ASSERT(tensor->ne[3] == 1); size_t offset_data = 0; @@ -1215,24 +1262,26 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg const size_t row_stride = tensor->nb[1]; GGML_ASSERT(offset % row_stride == 0); GGML_ASSERT(size % row_stride == 0); - const int64_t r_start = offset / row_stride; - const int64_t r_count = size / row_stride; - GGML_ASSERT(r_start + r_count <= tensor->ne[1]); + const int64_t row_start = offset / row_stride; + const int64_t row_count = size / row_stride; + GGML_ASSERT(row_start + row_count <= tensor->ne[1]); const int64_t blck_size = ggml_blck_size(tensor->type); for (size_t s = 0; s < split_state.n_segments; s++) { - for (size_t j = 0; j < n_bufs; j++) { - ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); - GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0); - const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0]; - ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, - simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes, - r_count, simple_tensor->nb[1], tensor->nb[1]); - offset_data += nbytes; - simple_offsets[j] += nbytes; + for (size_t r = 0; r < split_state.nr[s]; r++) { + for (size_t j = 0; j < n_bufs; j++) { + ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); + GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0); + const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0]; + ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, + simple_offsets[j] + row_start * simple_tensor->nb[1], nbytes, + row_count, simple_tensor->nb[1], tensor->nb[1]); + offset_data += nbytes; + simple_offsets[j] += nbytes; + } } } - GGML_ASSERT(offset_data*r_count == size); + GGML_ASSERT(offset_data*row_count == size); return; } GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1); @@ -1240,22 +1289,24 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg const size_t row_stride = tensor->nb[2]; GGML_ASSERT(offset % row_stride == 0); GGML_ASSERT(size % row_stride == 0); - const int64_t r_start = offset / row_stride; - const int64_t r_count = size / row_stride; - GGML_ASSERT(r_start + r_count <= tensor->ne[2]); + const int64_t row_start = offset / row_stride; + const int64_t row_count = size / row_stride; + GGML_ASSERT(row_start + row_count <= tensor->ne[2]); for (size_t s = 0; s < split_state.n_segments; s++) { - for (size_t j = 0; j < n_bufs; j++) { - ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); - const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1]; - ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, - simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes, - r_count, simple_tensor->nb[2], tensor->nb[2]); - offset_data += nbytes; - simple_offsets[j] += nbytes; + for (size_t r = 0; r < split_state.nr[s]; r++) { + for (size_t j = 0; j < n_bufs; j++) { + ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); + const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1]; + ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, + simple_offsets[j] + row_start * simple_tensor->nb[2], nbytes, + row_count, simple_tensor->nb[2], tensor->nb[2]); + offset_data += nbytes; + simple_offsets[j] += nbytes; + } } } - GGML_ASSERT(offset_data*r_count == size); + GGML_ASSERT(offset_data*row_count == size); return; } @@ -1273,6 +1324,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg for (size_t j = 0; j < n_bufs; j++) { ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } const size_t simple_offset = i_start * chunk_size_j; ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; @@ -1310,8 +1364,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false); - if (split_state.n_segments != 1) { + if (split_state.n_segments != 1 || split_state.nr[0] != 1) { GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS); + GGML_ASSERT(split_state.nr[0] != 0); GGML_ASSERT(tensor->ne[3] == 1); size_t offset_data = 0; @@ -1322,24 +1377,26 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co const size_t row_stride = tensor->nb[1]; GGML_ASSERT(offset % row_stride == 0); GGML_ASSERT(size % row_stride == 0); - const int64_t r_start = offset / row_stride; - const int64_t r_count = size / row_stride; - GGML_ASSERT(r_start + r_count <= tensor->ne[1]); + const int64_t row_start = offset / row_stride; + const int64_t row_count = size / row_stride; + GGML_ASSERT(row_start + row_count <= tensor->ne[1]); const int64_t blck_size = ggml_blck_size(tensor->type); for (size_t s = 0; s < split_state.n_segments; s++) { - for (size_t j = 0; j < n_bufs; j++) { - const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); - GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0); - const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0]; - ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, - simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes, - r_count, simple_tensor->nb[1], tensor->nb[1]); - offset_data += nbytes; - simple_offsets[j] += nbytes; + for (size_t r = 0; r < split_state.nr[s]; r++) { + for (size_t j = 0; j < n_bufs; j++) { + const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); + GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0); + const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0]; + ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, + simple_offsets[j] + row_start * simple_tensor->nb[1], nbytes, + row_count, simple_tensor->nb[1], tensor->nb[1]); + offset_data += nbytes; + simple_offsets[j] += nbytes; + } } } - GGML_ASSERT(offset_data*r_count == size); + GGML_ASSERT(offset_data*row_count == size); return; } GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1); @@ -1347,22 +1404,24 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co const size_t row_stride = tensor->nb[2]; GGML_ASSERT(offset % row_stride == 0); GGML_ASSERT(size % row_stride == 0); - const int64_t r_start = offset / row_stride; - const int64_t r_count = size / row_stride; - GGML_ASSERT(r_start + r_count <= tensor->ne[2]); + const int64_t row_start = offset / row_stride; + const int64_t row_count = size / row_stride; + GGML_ASSERT(row_start + row_count <= tensor->ne[2]); for (size_t s = 0; s < split_state.n_segments; s++) { - for (size_t j = 0; j < n_bufs; j++) { - const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); - const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1]; - ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, - simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes, - r_count, simple_tensor->nb[2], tensor->nb[2]); - offset_data += nbytes; - simple_offsets[j] += nbytes; + for (size_t r = 0; r < split_state.nr[s]; r++) { + for (size_t j = 0; j < n_bufs; j++) { + const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); + const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1]; + ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, + simple_offsets[j] + row_start * simple_tensor->nb[2], nbytes, + row_count, simple_tensor->nb[2], tensor->nb[2]); + offset_data += nbytes; + simple_offsets[j] += nbytes; + } } } - GGML_ASSERT(offset_data*r_count == size); + GGML_ASSERT(offset_data*row_count == size); return; } @@ -1380,6 +1439,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co for (size_t j = 0; j < n_bufs; j++){ const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } const size_t simple_offset = i_start * chunk_size_j; ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; @@ -1405,8 +1467,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) { - const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer); - for (size_t i = 0; i < n_buffers; i++) { + GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; + for (size_t i = 0; i < buf_ctx->bufs.size(); i++) { ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i)); } } @@ -1432,20 +1495,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) { static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); - ggml_init_params params = { - /*.mem_size =*/ 1024*1024*1024, // FIXME + const ggml_init_params params = { + /*.mem_size =*/ 1024*1024*ggml_tensor_overhead(), // FIXME /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; + ggml_backend_meta_simple_tensor_container stc_static; + ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts); - ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(); size_t max_size = 0; - buf_ctx->buf_configs.reserve(n_simple_bufts); + std::vector bufs; + bufs.reserve(n_simple_bufts); for (size_t i = 0; i < n_simple_bufts; i++) { - ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size); - max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf)); - buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf); + bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size)); + GGML_ASSERT(bufs.back() != nullptr); + max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back())); } + ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs); return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size); } @@ -1453,28 +1520,53 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); - ggml_init_params params = { - /*.mem_size =*/ 1024*1024*1024, // FIXME + constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals. + const ggml_init_params params_static = { + /*.mem_size =*/ ggml_get_mem_size(ctx), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + const ggml_init_params params_compute = { + /*.mem_size =*/ compute_headroom*ggml_get_mem_size(ctx), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; + ggml_backend_meta_simple_tensor_container stc_static (params_static, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts); - ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(); - meta_buf_ctx->buf_configs.reserve(n_simple_bufts); - for (size_t i = 0; i < n_simple_bufts; i++) { - meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr); - } + std::vector bufs(n_simple_bufts, nullptr); + ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs); ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { t->buffer = meta_buf; - ggml_backend_meta_buffer_init_tensor(meta_buf, t); + ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t); t->data = (void *) 0x2000000000000000; // FIXME } for (size_t i = 0; i < n_simple_bufts; i++) { - meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft( - meta_buf_ctx->buf_configs[i].ctx, ggml_backend_meta_buft_simple_buft(buft, i)); - meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf)); + ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get(); + ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i); + + // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL. + // For those edge cases, allocate a dummy buffer instead. + bool any_nonzero_slice = false; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (ggml_nelements(t) != 0) { + any_nonzero_slice = true; + break; + } + } + if (any_nonzero_slice) { + meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft)); + } else { + meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0)); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = meta_buf_ctx->bufs[i].get(); + } + } + GGML_ASSERT(meta_buf_ctx->bufs[i]); + meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get())); } return meta_buf; } @@ -1587,6 +1679,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false); GGML_ASSERT(split_state.n_segments == 1); + GGML_ASSERT(split_state.nr[0] == 1); switch (split_state.axis) { case GGML_BACKEND_SPLIT_AXIS_0: @@ -1603,6 +1696,9 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j); ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; @@ -1628,6 +1724,7 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false); GGML_ASSERT(split_state.n_segments == 1); + GGML_ASSERT(split_state.nr[0] == 1); switch (split_state.axis) { case GGML_BACKEND_SPLIT_AXIS_0: @@ -1644,6 +1741,9 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j); const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j); const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1]; + if (chunk_size_j == 0) { + continue; + } ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full); offset_j += chunk_size_j; @@ -1690,6 +1790,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, } if (needs_rebuild) { + std::set used_buffers; + for (int i = 0; i < cgraph->n_leafs; i++) { + if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) { + used_buffers.emplace(cgraph->leafs[i]->buffer); + } + } + for (int i = 0; i < cgraph->n_nodes; i++) { + if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) { + used_buffers.emplace(cgraph->nodes[i]->buffer); + } + } + for (ggml_backend_buffer_t buf : used_buffers) { + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context; + buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1; + ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next]; + for (ggml_context_ptr & ctx : stc.ctxs) { + ggml_reset(ctx.get()); + } + stc.simple_tensors.clear(); + } size_t n_subgraphs = 0; size_t max_tmp_size = 0; @@ -1875,7 +1995,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads); const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads); const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead(); - ggml_init_params params = { + const ggml_init_params params = { /*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, @@ -1962,6 +2082,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, node_zero->src[0] = node; ggml_set_op_params_f32(node_zero, 0, 0.0f); node_zero->data = node->data; + node_zero->buffer = node->buffer; node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE; step_cgraphs[j] = get_cgraph_aux(); @@ -2140,4 +2261,3 @@ ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, siz const ggml_backend_meta_context * backend_ctx = (const ggml_backend_meta_context *) meta_backend->context; return backend_ctx->backend_configs[index].backend; } - diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 4e36909f45e..87615921c09 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -306,7 +306,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_ GGML_ASSERT(tensor); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) { + if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) { for (size_t i = 0; i < n_copies; i++) { ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size); } @@ -317,7 +317,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_ } GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data); } @@ -379,7 +379,7 @@ void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); - if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) { + if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) { for (size_t i = 0; i < n_copies; i++) { ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size); } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index f3eccff7d72..8c735a045b3 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() endif() - if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "") - target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - - target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "") - message(WARNING "OpenMP not found") - endif() + if (GGML_OPENMP_ENABLED) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) endif() if (GGML_LLAMAFILE) diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index 74e0c086c6d..9c43da6cf89 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -977,6 +977,35 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumf = hsum_float_8(acc); *s = sumf; + +#elif defined(__loongarch_sx) + + __m128 acc = (__m128)__lsx_vldi(0); + + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d); + const __m128i qx_0 = __lsx_vld((const __m128i *)x[ib].qs, 0); + const __m128i qx_1 = __lsx_vld((const __m128i *)x[ib].qs + 1, 0); + const __m128i qy_0 = __lsx_vld((const __m128i *)y[ib].qs, 0); + const __m128i qy_1 = __lsx_vld((const __m128i *)y[ib].qs + 1, 0); + + const __m128i p16_0 = lsx_maddubs_h(qx_0, qy_0); + const __m128i p16_1 = lsx_maddubs_h(qx_1, qy_1); + + // Sum int16 pairs โ†’ int32 + const __m128i s_0 = __lsx_vaddwev_w_h(p16_0, p16_1); + const __m128i s_1 = __lsx_vaddwod_w_h(p16_0, p16_1); + + const __m128 q = __lsx_vffint_s_w(__lsx_vadd_w(s_0, s_1)); + acc = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(d), q, acc); + } + + __m128 res = lsx_hadd_s(acc, acc); + res = lsx_hadd_s(res, res); + sumf = ((v4f32)res)[0]; + + *s = sumf; + #else UNUSED(nb); UNUSED(ib); @@ -1443,6 +1472,99 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = hsum_float_8(acc); +#elif defined(__loongarch_sx) + + const __m128i m32s = __lsx_vreplgr2vr_b(32); + + __m128 acc_0 = (__m128)__lsx_vldi(0); + __m128 acc_1 = (__m128)__lsx_vldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i scale_i8 = __lsx_vld(x[i].scales, 0); + const __m128i scales_lo = __lsx_vsllwil_h_b(scale_i8, 0); + const __m128i scales_hi = __lsx_vsllwil_h_b(__lsx_vbsrl_v(scale_i8, 8), 0); + + __m128i sumi_0 = __lsx_vldi(0); + __m128i sumi_1 = __lsx_vldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i q4bitsH_0 = __lsx_vld((const __m128i*)qh, 0); qh += 16; + const __m128i q4bitsH_1 = __lsx_vld((const __m128i*)qh, 0); qh += 16; + + const __m128i q4h_0 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3), 4); + const __m128i q4h_1 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3), 4); + const __m128i q4h_2 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_0, 3 << 2), 2); + const __m128i q4h_3 = __lsx_vslli_b(__lsx_vandi_b(q4bitsH_1, 3 << 2), 2); + const __m128i q4h_4 = __lsx_vandi_b(q4bitsH_0, 3 << 4); + const __m128i q4h_5 = __lsx_vandi_b(q4bitsH_1, 3 << 4); + const __m128i q4h_6 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_0, 3 << 6), 2); + const __m128i q4h_7 = __lsx_vsrli_b(__lsx_vandi_b(q4bitsH_1, 3 << 6), 2); + + const __m128i q4bits1_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16; + const __m128i q4bits1_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16; + const __m128i q4bits2_0 = __lsx_vld((const __m128i*)q4, 0); q4 += 16; + const __m128i q4bits2_1 = __lsx_vld((const __m128i*)q4, 0); q4 += 16; + + const __m128i q4_0 = __lsx_vor_v(__lsx_vandi_b(q4bits1_0, 0xf), q4h_0); + const __m128i q4_1 = __lsx_vor_v(__lsx_vandi_b(q4bits1_1, 0xf), q4h_1); + const __m128i q4_2 = __lsx_vor_v(__lsx_vandi_b(q4bits2_0, 0xf), q4h_2); + const __m128i q4_3 = __lsx_vor_v(__lsx_vandi_b(q4bits2_1, 0xf), q4h_3); + const __m128i q4_4 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_0, 4), q4h_4); + const __m128i q4_5 = __lsx_vor_v(__lsx_vsrli_b(q4bits1_1, 4), q4h_5); + const __m128i q4_6 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_0, 4), q4h_6); + const __m128i q4_7 = __lsx_vor_v(__lsx_vsrli_b(q4bits2_1, 4), q4h_7); + + const __m128i q8_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8_2 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8_3 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8_4 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8_5 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8_6 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8_7 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + + __m128i p16_0 = lsx_maddubs_h(__lsx_vsub_b(q4_0, m32s), q8_0); + __m128i p16_1 = lsx_maddubs_h(__lsx_vsub_b(q4_1, m32s), q8_1); + __m128i p16_2 = lsx_maddubs_h(__lsx_vsub_b(q4_2, m32s), q8_2); + __m128i p16_3 = lsx_maddubs_h(__lsx_vsub_b(q4_3, m32s), q8_3); + __m128i p16_4 = lsx_maddubs_h(__lsx_vsub_b(q4_4, m32s), q8_4); + __m128i p16_5 = lsx_maddubs_h(__lsx_vsub_b(q4_5, m32s), q8_5); + __m128i p16_6 = lsx_maddubs_h(__lsx_vsub_b(q4_6, m32s), q8_6); + __m128i p16_7 = lsx_maddubs_h(__lsx_vsub_b(q4_7, m32s), q8_7); + + const __m128i sc_vec = j == 0 ? scales_lo : scales_hi; + + p16_0 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 0), p16_0); + p16_1 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 1), p16_1); + p16_2 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 2), p16_2); + p16_3 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 3), p16_3); + p16_4 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 4), p16_4); + p16_5 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 5), p16_5); + p16_6 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 6), p16_6); + p16_7 = lsx_madd_h(__lsx_vreplvei_h(sc_vec, 7), p16_7); + + sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_0, p16_2)); + sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_1, p16_3)); + sumi_0 = __lsx_vadd_w(sumi_0, __lsx_vadd_w(p16_4, p16_6)); + sumi_1 = __lsx_vadd_w(sumi_1, __lsx_vadd_w(p16_5, p16_7)); + } + + __m128 p_0 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_0)); + __m128 p_1 = __lsx_vfmul_s(__lsx_vreplfr2vr_s(d), __lsx_vffint_s_w(sumi_1)); + acc_0 = __lsx_vfadd_s(p_0, acc_0); + acc_1 = __lsx_vfadd_s(p_1, acc_1); + } + + *s = hsum_float_4x4(acc_0, acc_1, (__m128)__lsx_vldi(0), (__m128)__lsx_vldi(0)); + #else UNUSED(x); UNUSED(y); @@ -2149,6 +2271,35 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v *s = hsum_float_8(accum); +#elif defined(__loongarch_sx) + + const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); + + __m128 accum = (__m128)__lsx_vldi(0); + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m128i sumi = __lsx_vldi(0); + for (int ib = 0; ib < QK_K/32; ++ib) { + const __m128i q4bits = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m128i q8b_0 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q8b_1 = __lsx_vld((const __m128i*)q8, 0); q8 += 16; + const __m128i q4b_0 = __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits, 0xf)); + const __m128i q4b_1 = __lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits, 4)); + const __m128i p16_0 = lsx_maddubs_h(q4b_0, q8b_0); + const __m128i p16_1 = lsx_maddubs_h(q4b_1, q8b_1); + const int16_t ls = (((x[ibl].scales_l[ib/2] >> ((ib & 1) * 4)) & 0xf) | ((sh & 0x3) << 4)) - 32; + sh >>= 2; + sumi = __lsx_vadd_w(lsx_madd_h(p16_0, __lsx_vreplgr2vr_h(ls)), sumi); + sumi = __lsx_vadd_w(lsx_madd_h(p16_1, __lsx_vreplgr2vr_h(ls)), sumi); + } + const float ds = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + accum = __lsx_vfadd_s(__lsx_vfmul_s(__lsx_vreplfr2vr_s(ds), __lsx_vffint_s_w(sumi)), accum); + } + + *s = ((v4f32)lsx_hadd_s(lsx_hadd_s(accum, accum), lsx_hadd_s(accum, accum)))[0]; + #else UNUSED(x); UNUSED(y); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7b05edf6b75..cd5c61a8187 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2943,7 +2943,9 @@ struct ggml_cplan ggml_graph_plan( case GGML_OP_GATED_DELTA_NET: { const int64_t S_v = node->src[2]->ne[0]; - cur = S_v * sizeof(float) * n_tasks; + const int64_t K = node->src[5]->ne[1]; // state is (D, K, n_seqs) + const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0); + cur = per_thread * sizeof(float) * n_tasks; } break; case GGML_OP_COUNT: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 6bc8dc150ce..dc73696ad9f 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -2235,8 +2235,42 @@ static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, gg } } +static void ggml_compute_forward_fill_f16(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_fp16_t c = GGML_CPU_FP32_TO_FP16(ggml_get_op_params_f32(dst, 0)); + + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + + const auto [ir0, ir1] = get_thread_range(params, dst); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne2*ne1); + const int64_t i02 = (ir - i03*ne2*ne1)/ne1; + const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1); + + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1); + + ggml_vec_set_f16(ne0, dst_ptr, c); + } +} + void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) { - ggml_compute_forward_fill_f32(params, dst); + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_fill_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_fill_f16(params, dst); + } break; + default: + { + GGML_ABORT("unsupported type for ggml_compute_forward_fill: %s", ggml_type_name(src0->type)); + } + } } // ggml_compute_tri @@ -10513,19 +10547,30 @@ static void ggml_compute_forward_gated_delta_net_one_chunk( const bool kda = (neg0 == S_v); - // scratch layout per thread: [delta(S_v)] - const int64_t scratch_per_thread = S_v; + // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count. + const int64_t K = src_state->ne[1]; + GGML_ASSERT(K >= 1); + // per-seq stride in floats (slot 0 of seq s lives at state + s * seq_stride) + const int64_t state_seq_stride = src_state->nb[2] / sizeof(float); + + const int64_t per_thread = S_v + (K > 1 ? S_v * S_v : 0); const int ith = params->ith; - float * delta = (float *)params->wdata + ith * scratch_per_thread + CACHE_LINE_SIZE_F32; + float * delta = (float *)params->wdata + ith * per_thread + CACHE_LINE_SIZE_F32; + float * state_work = K > 1 ? (delta + S_v) : nullptr; // output layout: [attn_scores | new_states] - // attn_scores: S_v * H * n_tokens * n_seqs floats - // new_states: S_v * S_v * H * n_seqs floats - const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs; + // attn_scores: S_v * H * n_tokens * n_seqs floats + // new_states: S_v * S_v * H * n_seqs * K floats (K snapshot slots; last min(n_tokens, K)) + const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs; + const int64_t state_size_per_snap = S_v * S_v * H * n_seqs; float * attn_out_base = (float *)dst->data; float * state_out_base = (float *)dst->data + attn_score_elems; + // snapshot slot mapping: target_slot = t - shift. When n_tokens < K only the last + // n_tokens slots are written; earlier slots are left untouched (caller-owned). + const int64_t shift = n_tokens - K; + const float * state_in_base = (const float *)src_state->data; //const int64_t rq1 = nev1 / neq1; @@ -10545,10 +10590,15 @@ static void ggml_compute_forward_gated_delta_net_one_chunk( const int64_t iq3 = iv3 / rq3; const int64_t ik3 = iv3 / rk3; - float * s_out = state_out_base + (iv3 * H + iv1) * S_v * S_v; + // For K=1, write directly to the single output slot to avoid an extra memcpy at the end. + // For K>1, work in scratch and copy out per-token when the slot is in range. + float * s_out = (K > 1) + ? state_work + : state_out_base + (iv3 * H + iv1) * S_v * S_v; - // copy input state into output buffer and operate in-place - const float * s_in = state_in_base + (iv3 * H + iv1) * S_v * S_v; + // copy input state into the working buffer and operate in-place + // state layout (D, K, n_seqs): slot 0 of seq iv3 starts at iv3 * state_seq_stride. + const float * s_in = state_in_base + iv3 * state_seq_stride + iv1 * S_v * S_v; memcpy(s_out, s_in, S_v * S_v * sizeof(float)); // attn output pointer for first token of this (head, seq) @@ -10598,6 +10648,15 @@ static void ggml_compute_forward_gated_delta_net_one_chunk( } attn_data += S_v * H; // advance to next token + + if (K > 1) { + const int64_t target_slot = t - shift; + if (target_slot >= 0 && target_slot < K) { + float * curr_state_o = state_out_base + target_slot * state_size_per_snap + + (iv3 * H + iv1) * S_v * S_v; + memcpy(curr_state_o, s_out, S_v * S_v * sizeof(float)); + } + } } } } diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 0deda930985..62e687201ef 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -1125,25 +1125,12 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { #define GGML_F16_EPR 4 static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { - float tmp[4]; - - tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); - tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); - tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); - tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); - - return (__m128)__lsx_vld(tmp, 0); + return __lsx_vfcvtl_s_h(__lsx_vld((const void *)x, 0)); } static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { - float arr[4]; - - __lsx_vst(y, arr, 0); - - x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); - x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); - x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); - x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); + __m128i a = __lsx_vfcvt_h_s(y, y); + memcpy(x, &a, sizeof(ggml_fp16_t) * 4); } #define GGML_F32Cx4 __m128 diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index d0e4001338a..67b6b05cac8 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -273,67 +273,51 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G #if defined(GGML_SIMD) #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; //get vector length - const int ggml_f16_epr = sve_register_length / 16; // running when 16 - const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers - - const int np= (n & ~(ggml_f16_step - 1)); - svfloat16_t sum1 = svdup_n_f16(0.0f); - svfloat16_t sum2 = svdup_n_f16(0.0f); - svfloat16_t sum3 = svdup_n_f16(0.0f); - svfloat16_t sum4 = svdup_n_f16(0.0f); - - svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; - svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; - for (int i = 0; i < np; i += ggml_f16_step) { - ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); - ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); - sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1); - - ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); - ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); - sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2); - - ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); - ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); - sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3); - - ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); - ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); - sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4); - - ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); - ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); - sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5); + const int ggml_f16_epr = svcnth(); + const int ggml_f16_step = 8 * ggml_f16_epr; + const int np = n - (n % ggml_f16_step); + const int np2 = n - (n % ggml_f16_epr); + + svfloat32_t sum1_lo = svdup_n_f32(0.0f); + svfloat32_t sum1_hi = svdup_n_f32(0.0f); + svfloat32_t sum2_lo = svdup_n_f32(0.0f); + svfloat32_t sum2_hi = svdup_n_f32(0.0f); + svfloat32_t sum3_lo = svdup_n_f32(0.0f); + svfloat32_t sum3_hi = svdup_n_f32(0.0f); + svfloat32_t sum4_lo = svdup_n_f32(0.0f); + svfloat32_t sum4_hi = svdup_n_f32(0.0f); - ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); - ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); - sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6); - - ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); - ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); - sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7); - - ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); - ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8); + for (int i = 0; i < np; i += ggml_f16_step) { + ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0), GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0)); + ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1), GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1)); + ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2), GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2)); + ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3), GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3)); + ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4), GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4)); + ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5), GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5)); + ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6), GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6)); + ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7), GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7)); } - const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8 - for (int k = np; k < np2; k += ggml_f16_epr) { - svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); - svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); - sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry); + for (int i = np; i < np2; i += ggml_f16_epr) { + ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i, 0), GGML_F16x_VEC_LOAD(y + i, 0)); } if (np2 < n) { - svbool_t pg = svwhilelt_b16(np2, n); - svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); - svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); + const svbool_t pg = svwhilelt_b16(np2, n); + const svfloat16_t rx = svld1_f16(pg, (const __fp16 *)(x + np2)); + const svfloat16_t ry = svld1_f16(pg, (const __fp16 *)(y + np2)); - sum1 = svmad_f16_x(pg, hx, hy, sum1); + ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, rx, ry); } - GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4); + + sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum2_lo); + sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum2_hi); + sum3_lo = svadd_f32_m(DEFAULT_PG32, sum3_lo, sum4_lo); + sum3_hi = svadd_f32_m(DEFAULT_PG32, sum3_hi, sum4_hi); + sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum3_lo); + sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum3_hi); + + sumf = ggml_sve_sum_f32x2(sum1_lo, sum1_hi); #elif defined(__riscv_v_intrinsic) #if defined(__riscv_zvfh) int vl = __riscv_vsetvlmax_e32m2(); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index bcd68da9aa9..5de9cb5b7e0 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -14,6 +14,35 @@ // floating point type used to accumulate sums typedef double ggml_float; +#if defined(__ARM_FEATURE_SVE) +inline static void ggml_sve_f16_fma_widened( + svfloat32_t * acc_lo, + svfloat32_t * acc_hi, + svfloat16_t x, + svfloat16_t y) { +#if defined(__ARM_FEATURE_SVE2) + *acc_lo = svmlalb_f32(*acc_lo, x, y); + *acc_hi = svmlalt_f32(*acc_hi, x, y); +#else + // Plain SVE fallback path if SVE2 instructions not available + svfloat16_t x_even = svtrn1_f16(x, x); + svfloat16_t x_odd = svtrn2_f16(x, x); + + svfloat16_t y_even = svtrn1_f16(y, y); + svfloat16_t y_odd = svtrn2_f16(y, y); + + svbool_t pg = svptrue_b32(); + + *acc_lo = svmla_f32_x(pg, *acc_lo, svcvt_f32_f16_x(pg, x_even), svcvt_f32_f16_x(pg, y_even)); + *acc_hi = svmla_f32_x(pg, *acc_hi, svcvt_f32_f16_x(pg, x_odd), svcvt_f32_f16_x(pg, y_odd)); +#endif +} + +inline static ggml_float ggml_sve_sum_f32x2(svfloat32_t sum_lo, svfloat32_t sum_hi) { + return (ggml_float) (svaddv_f32(svptrue_b32(), sum_lo) + svaddv_f32(svptrue_b32(), sum_hi)); +} +#endif + #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -122,108 +151,61 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG #if defined(GGML_SIMD) #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; - const int ggml_f16_epr = sve_register_length / 16; // running when 16 - const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers - - int np = (n & ~(ggml_f16_step - 1)); - - svfloat16_t sum_00 = svdup_n_f16(0.0f); - svfloat16_t sum_01 = svdup_n_f16(0.0f); - svfloat16_t sum_02 = svdup_n_f16(0.0f); - svfloat16_t sum_03 = svdup_n_f16(0.0f); + const int ggml_f16_epr = svcnth(); + const int ggml_f16_step = 2 * ggml_f16_epr; + int np = n - (n % ggml_f16_step); + int np2 = n - (n % ggml_f16_epr); - svfloat16_t sum_10 = svdup_n_f16(0.0f); - svfloat16_t sum_11 = svdup_n_f16(0.0f); - svfloat16_t sum_12 = svdup_n_f16(0.0f); - svfloat16_t sum_13 = svdup_n_f16(0.0f); - - svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; - svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; + svfloat32_t sum_0_0_lo = svdup_n_f32(0.0f); + svfloat32_t sum_0_0_hi = svdup_n_f32(0.0f); + svfloat32_t sum_0_1_lo = svdup_n_f32(0.0f); + svfloat32_t sum_0_1_hi = svdup_n_f32(0.0f); + svfloat32_t sum_1_0_lo = svdup_n_f32(0.0f); + svfloat32_t sum_1_0_hi = svdup_n_f32(0.0f); + svfloat32_t sum_1_1_lo = svdup_n_f32(0.0f); + svfloat32_t sum_1_1_hi = svdup_n_f32(0.0f); for (int i = 0; i < np; i += ggml_f16_step) { - ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements - - ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements - sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1 - ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements - sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1); - - ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements + const svfloat16_t ay0 = GGML_F16x_VEC_LOAD(y + i, 0); + const svfloat16_t ax00 = GGML_F16x_VEC_LOAD(x[0] + i, 0); + const svfloat16_t ax01 = GGML_F16x_VEC_LOAD(x[1] + i, 0); - ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements - sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2); - ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1); - sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2); + ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax00, ay0); + ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax01, ay0); - ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); + const svfloat16_t ay1 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 0); + const svfloat16_t ax10 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 0); + const svfloat16_t ax11 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 0); - ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2); - sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3); - ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2); - sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3); - - ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); - - ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3); - sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4); - ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3); - sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4); - - ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); - - ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4); - - sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5); - ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4); - sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5); - - ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); - - ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5); - - sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6); - ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5); - sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6); - - ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); - - ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6); - - sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7); - ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6); - sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7); - - ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - - ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7); - - sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8); - ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7); - sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8); + ggml_sve_f16_fma_widened(&sum_0_1_lo, &sum_0_1_hi, ax10, ay1); + ggml_sve_f16_fma_widened(&sum_1_1_lo, &sum_1_1_hi, ax11, ay1); } - const int np2 = (n & ~(ggml_f16_epr - 1)); - for (int k = np; k < np2; k += ggml_f16_epr) { - svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); + for (int i = np; i < np2; i += ggml_f16_epr) { + const svfloat16_t ry = GGML_F16x_VEC_LOAD(y + i, 0); + const svfloat16_t rx0 = GGML_F16x_VEC_LOAD(x[0] + i, 0); + const svfloat16_t rx1 = GGML_F16x_VEC_LOAD(x[1] + i, 0); - svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0); - sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry); - rx = GGML_F16x_VEC_LOAD(x[1] + k, 0); - sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry); + ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, rx0, ry); + ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, rx1, ry); } if (np2 < n) { - svbool_t pg = svwhilelt_b16(np2, n); - svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2)); - svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2)); - svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); + const svbool_t pg = svwhilelt_b16(np2, n); + const svfloat16_t ay = svld1_f16(pg, (const __fp16 *)(y + np2)); + const svfloat16_t ax0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2)); + const svfloat16_t ax1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2)); - sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00); - sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10); + ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax0, ay); + ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax1, ay); } - GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03); - GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13); + + svfloat32_t sum_0_lo = svadd_f32_x(DEFAULT_PG32, sum_0_0_lo, sum_0_1_lo); + svfloat32_t sum_0_hi = svadd_f32_x(DEFAULT_PG32, sum_0_0_hi, sum_0_1_hi); + svfloat32_t sum_1_lo = svadd_f32_x(DEFAULT_PG32, sum_1_0_lo, sum_1_1_lo); + svfloat32_t sum_1_hi = svadd_f32_x(DEFAULT_PG32, sum_1_0_hi, sum_1_1_hi); + sumf[0] = ggml_sve_sum_f32x2(sum_0_lo, sum_0_hi); + sumf[1] = ggml_sve_sum_f32x2(sum_1_lo, sum_1_hi); np = n; #elif defined(__riscv_v_intrinsic) #if defined(__riscv_zvfh) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index b54d4a6b107..d3953eee962 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -15,6 +15,7 @@ if (CUDAToolkit_FOUND) # 80 == Ampere, asynchronous data loading, faster tensor core instructions # 86 == RTX 3000, needs CUDA v11.1 # 89 == RTX 4000, needs CUDA v11.8 + # 90 == Hopper H100/200, needs CUDA v11.8 # 120 == Blackwell, needs CUDA v12.8, FP4 tensor cores # # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run @@ -33,7 +34,7 @@ if (CUDAToolkit_FOUND) list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real) if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8") - list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real) + list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-virtual) endif() if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index adb4d5f0cb9..c25f42b32bb 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -2,6 +2,9 @@ #include #include +template +using type_for_index = T; + static __device__ __forceinline__ float op_repeat(const float a, const float b) { return b; GGML_UNUSED(a); @@ -52,6 +55,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const int s12, const int s13, src1_ptrs... src1s) { + ggml_cuda_pdl_lc(); const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x; const uint32_t i1 = (blockDim.y * blockIdx.y + threadIdx.y); const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3); @@ -72,6 +76,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; + ggml_cuda_pdl_sync(); for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) { const uint32_t i10 = fastmodulo(i0, ne10); @@ -141,6 +146,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const int i10 = fastmodulo(i0, ne10); + ggml_cuda_pdl_sync(); float result = src0_row ? (float) src0_row[i0*s00] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10]))); @@ -282,35 +288,24 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1); const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2); - if constexpr (sizeof...(I) > 0) { - k_bin_bcast_unravel<<>>( + { + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)block_num, block_size, 0, stream); + ggml_cuda_kernel_launch(k_bin_bcast_unravel...>, launch_params, src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13, /*s0,*/ s1, s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); - } else { - k_bin_bcast_unravel - <<>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, - ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13, - /*s0,*/ s1, s2, s3, - s00, s01, s02, s03, - s10, s11, s12, s13); } } else { const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3); - if constexpr (sizeof...(I) > 0) { - k_bin_bcast<<>>( + { + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(k_bin_bcast...>, launch_params, src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, /*s0,*/ s1, s2, s3, - s00 ,s01, s02, s03, - s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); - } else { - k_bin_bcast<<>>( - src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, - /*s0,*/ s1, s2, s3, s00, s01, s02, s03, - s10, s11, s12, s13); + s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } } } @@ -333,6 +328,7 @@ static __global__ void k_repeat_back( } T sum = 0; + ggml_cuda_pdl_sync(); for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) { for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) { for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 10817505d9f..73755470fbc 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -5,7 +5,9 @@ #include "ggml-cuda.h" #include +#include #include +#include #if defined(GGML_USE_HIP) #define GGML_COMMON_DECL_HIP @@ -27,6 +29,7 @@ #include #include #include +#include #include #if defined(GGML_USE_HIP) @@ -50,6 +53,7 @@ #define GGML_CUDA_CC_TURING 750 #define GGML_CUDA_CC_AMPERE 800 #define GGML_CUDA_CC_ADA_LOVELACE 890 +#define GGML_CUDA_CC_HOPPER 900 // While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see // https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms #define GGML_CUDA_CC_BLACKWELL 1200 @@ -107,6 +111,27 @@ # define GGML_CUDA_USE_CUB #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070 +// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8. +// However, this has been bugged in CTK < 12.3 for MSVC builds, see +// https://github.com/ggml-org/llama.cpp/pull/22522#discussion_r3302393293 +// __CUDA_ARCH__ is undefined in host passes; GPU arch check happens in device-side code. +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && \ + (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080)) +# define GGML_CUDA_USE_PDL +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080)) + +static __device__ __forceinline__ void ggml_cuda_pdl_sync() { +#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER +} + +static __device__ __forceinline__ void ggml_cuda_pdl_lc() { +#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER +} + #ifdef __CUDA_ARCH_LIST__ constexpr bool ggml_cuda_has_arch_impl(int) { return false; @@ -165,6 +190,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) + #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { return cublasGetStatusString(err); @@ -1487,3 +1513,122 @@ struct ggml_cuda_mm_fusion_args_device { const void * gate_bias = nullptr; ggml_glu_op glu_op; }; + +struct ggml_cuda_kernel_launch_params { + dim3 block_nums; + dim3 block_dims; + size_t shmem; + cudaStream_t stream; + + // size_t shmem + ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const size_t shmem_, const cudaStream_t stream_) + : block_nums(block_nums_), block_dims(block_dims_), shmem(shmem_), stream(stream_) {} + + // Some call sites pass ints instead of the required size_t. This 2nd constructor casts int->size_t to avoid these -Wnarrowing warnings. + ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const int shmem_, const cudaStream_t stream_) + : block_nums(block_nums_), block_dims(block_dims_), shmem((size_t)shmem_), stream(stream_) {} +}; + +#if defined(GGML_CUDA_USE_PDL) +struct ggml_cuda_pdl_config { + cudaLaunchAttribute attr; + cudaLaunchConfig_t cfg; + + ggml_cuda_pdl_config(const ggml_cuda_kernel_launch_params & params) { + attr.id = cudaLaunchAttributeProgrammaticStreamSerialization; + attr.val.programmaticStreamSerializationAllowed = 1; + + cfg = {}; + cfg.gridDim = params.block_nums; + cfg.blockDim = params.block_dims; + cfg.dynamicSmemBytes = params.shmem; + cfg.stream = params.stream; + cfg.attrs = &attr; + cfg.numAttrs = 1; + } + + // Delete due to &attr + ggml_cuda_pdl_config(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete; + +}; + +static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) { + const int device = ggml_cuda_get_device(); + + struct cache_key { + int device; + const void * kernel; + + bool operator==(const cache_key & other) const { return device == other.device && kernel == other.kernel; } + }; + + struct cache_key_hash { + // MurmurHash3 mixing function for better hash distribution (vs. just std::hash which in some implementations simply returns the identity) + static size_t hash_mix(size_t x) { + std::uint64_t y = x; + const std::uint64_t m = 0xe9846af9b1a615d; + + y ^= y >> 32; + y *= m; + y ^= y >> 32; + y *= m; + y ^= y >> 28; + + return static_cast(y); + } + + size_t operator()(const cache_key & key) const { + // Use a nonzero seed to avoid mapping all-zero keys to zero + size_t h = 42; + h = hash_mix(h + key.device); + h = hash_mix(h + reinterpret_cast(key.kernel)); + return h; + } + }; + + static std::mutex cache_mutex; + static std::unordered_map cache; + + const cache_key key = { device, kernel }; + std::lock_guard lock(cache_mutex); + const auto it = cache.find(key); + if (it != cache.end()) { + return it->second; + } + + cudaFuncAttributes attr = {}; + CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel)); + + // PDL device-side primitives are emitted only for PTX versions >= 90. + // We have to guard on a loaded kernel's PTX version so a kernel forward-JIT'ed + // from pre-Hopper PTX to a Hopper-or-newer GPU does not opt into PDL. + const bool can_use_pdl = attr.ptxVersion >= 90; + cache.emplace(key, can_use_pdl); + return can_use_pdl; +} + +#endif //defined(GGML_CUDA_USE_PDL) + + +template +static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) { +#if defined(GGML_CUDA_USE_PDL) + + static const bool env_pdl_enabled = []() { + const char * env = getenv("GGML_CUDA_PDL"); + return env == nullptr || std::atoi(env) != 0; + }(); + + if (env_pdl_enabled && ggml_cuda_kernel_can_use_pdl(reinterpret_cast(kernel))) { + auto pdl_cfg = ggml_cuda_pdl_config(launch_params); + + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward(args)... )); + return; + } +#endif //defined(GGML_CUDA_USE_PDL) + + kernel<<>>(std::forward(args)... ); + CUDA_CHECK(cudaGetLastError()); +} diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index 102f944f924..adba4d522a4 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -15,6 +15,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont const int64_t n = ne0 * ne1 * ne2; + ggml_cuda_pdl_sync(); for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) { if constexpr (dim == 0) { const int64_t row = i / ne0; @@ -64,8 +65,8 @@ static void concat_f32_cuda(const float * x, const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; if (dim == 0) { - concat_f32_cont<0> - <<>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); return; } if (dim == 1) { diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index d208acf2d5f..121472ec228 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -16,6 +16,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13) { + ggml_cuda_pdl_lc(); const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { @@ -36,6 +37,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; + ggml_cuda_pdl_sync(); cpy_1(cx + x_offset, cdst + dst_offset); } @@ -59,6 +61,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const __shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1]; int cur_tile_buf = 0; + ggml_cuda_pdl_sync(); #pragma unroll for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) { @@ -142,6 +145,7 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne, const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + ggml_cuda_pdl_sync(); cpy_blck(cx + x_offset, cdst + dst_offset); } @@ -168,6 +172,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne, const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + ggml_cuda_pdl_sync(); cpy_blck(cx + x_offset, cdst + dst_offset); } @@ -182,6 +187,7 @@ static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const const src_t * x = (const src_t *) cx; dst_t * dst = (dst_t *) cdst; + ggml_cuda_pdl_sync(); dst[i] = ggml_cuda_cast(x[i]); } @@ -192,8 +198,8 @@ cudaStream_t stream) { const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; GGML_ASSERT(num_blocks < UINT_MAX); - cpy_scalar_contiguous<<>> - (cx, cdst, ne); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar_contiguous, launch_params, cx, cdst, ne); } template @@ -223,13 +229,15 @@ static void ggml_cpy_scalar_cuda( GGML_ASSERT(grid_z < USHRT_MAX); dim3 dimGrid(grid_x, grid_y, grid_z); dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); - cpy_scalar_transpose<<>> - (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar_transpose, launch_params, + cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } else { const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; GGML_ASSERT(num_blocks < UINT_MAX); - cpy_scalar><<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar>, launch_params, + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } } diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index beeb5238946..d650b5fbd0f 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -636,6 +636,7 @@ static __global__ void flash_attn_mask_to_KV_max( if (tid < WARP_SIZE) { buf_iw[tid] = 1; } + ggml_cuda_pdl_sync(); __syncthreads(); int KV_max_sj = (ne30 - 1) * FATTN_KQ_STRIDE; @@ -687,6 +688,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform( const uint3 fd_iter_j_z, const uint3 fd_iter_j) { constexpr int ncols = ncols1*ncols2; + ggml_cuda_pdl_lc(); const int tile_idx = blockIdx.x; // One block per output tile. const int j = blockIdx.y; @@ -718,6 +720,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform( dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt_Q*D + (j*ne02 + c)*D + tid; + ggml_cuda_pdl_sync(); // Load the partial result that needs a fixup float dst_val = *dst; float max_val; @@ -809,6 +812,7 @@ static __global__ void flash_attn_stream_k_fixup_general( float dst_val = 0.0f; float max_val = 0.0f; float rowsum = 0.0f; + ggml_cuda_pdl_sync(); { dst_val = *dst; @@ -867,6 +871,7 @@ static __global__ void flash_attn_combine_results( const float2 * __restrict__ VKQ_meta, float * __restrict__ dst, const int parallel_blocks) { + ggml_cuda_pdl_lc(); // Dimension 0: threadIdx.x // Dimension 1: blockIdx.x // Dimension 2: blockIdx.y @@ -890,6 +895,7 @@ static __global__ void flash_attn_combine_results( __builtin_assume(tid < D); extern __shared__ float2 meta[]; + ggml_cuda_pdl_sync(); for (int i = tid; i < 2*parallel_blocks; i += D) { ((float *) meta)[i] = ((const float *)VKQ_meta) [i]; } @@ -1146,7 +1152,9 @@ void launch_fattn( const uint3 ne01 = init_fastdiv_values(Q->ne[1]); GGML_ASSERT(block_dim.x % warp_size == 0); - fattn_kernel<<>>( + + // disabled PDL enrollment for now due to a compiler bug. + fattn_kernel<<>>( (const char *) Q->data, K_data, V_data, @@ -1176,9 +1184,9 @@ void launch_fattn( const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine = {(unsigned)ntiles_dst, ncols1, ncols2}; - flash_attn_stream_k_fixup_uniform - <<>> - ((float *) KQV->data, dst_tmp_meta.ptr, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream); + ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_uniform, launch_params, + (float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[2], nblocks_sk, gqa_ratio, bpt, fd0, fd1, fd2); } else if (ntiles_dst % blocks_num.x != 0) { @@ -1193,9 +1201,9 @@ void launch_fattn( const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2}; - flash_attn_stream_k_fixup_general - <<>> - ((float *) KQV->data, dst_tmp_meta.ptr, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream); + ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_general, launch_params, + (float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], gqa_ratio, total_work, fd_k_j_z_ne12, fd_k_j_z, fd_k_j, fd_k); } @@ -1204,9 +1212,9 @@ void launch_fattn( const dim3 blocks_num_combine(Q->ne[1], Q->ne[2], Q->ne[3]); const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2); - flash_attn_combine_results - <<>> - (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream); + ggml_cuda_kernel_launch(flash_attn_combine_results, launch_params, + dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks); } CUDA_CHECK(cudaGetLastError()); } diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index a25e912c4d2..ac5abb13367 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -472,7 +472,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask( const int i = 8 * (threadIdx.x % (nbatch_fa/8)); - cp_async_cg_16(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + j_vram*stride_mask + i); + cp_async_cg_16(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + int64_t(j_vram)*stride_mask + i); } } else if constexpr (oob_check) { #pragma unroll @@ -488,7 +488,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask( for (int i0 = 0; i0 < nbatch_fa; i0 += warp_size) { const int i = i0 + threadIdx.x; - tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[j_vram*stride_mask + i] : half(0.0f); + tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[int64_t(j_vram)*stride_mask + i] : half(0.0f); } } } else if constexpr (nbatch_fa < 2*warp_size) { @@ -505,7 +505,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask( const int i = threadIdx.x % (warp_size/cols_per_warp); - ggml_cuda_memcpy_1(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + j_vram*stride_mask + 2*i); + ggml_cuda_memcpy_1(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + int64_t(j_vram)*stride_mask + 2*i); } } else { #pragma unroll @@ -521,7 +521,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask( for (int i0 = 0; i0 < nbatch_fa; i0 += 2*warp_size) { const int i = i0 + 2*threadIdx.x; - ggml_cuda_memcpy_1(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + j_vram*stride_mask + i); + ggml_cuda_memcpy_1(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + int64_t(j_vram)*stride_mask + i); } } } @@ -568,7 +568,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( constexpr bool Q_in_reg = ggml_cuda_fattn_mma_get_Q_in_reg (DKQ, DV, ncols); constexpr int nstages = ggml_cuda_fattn_mma_get_nstages (DKQ, DV, ncols1, ncols2); - constexpr int stride_tile_Q = DKQ/2 + 4; constexpr int stride_tile_K = nbatch_K2 + 4; constexpr int stride_tile_V = V_is_K_view ? stride_tile_K : nbatch_V2 + 4; @@ -604,9 +603,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( #pragma unroll for (int k0_start = (DKQ/2-1) - (DKQ/2-1) % nbatch_K2; k0_start >= 0; k0_start -= nbatch_K2) { const int k0_stop = k0_start + nbatch_K2 < DKQ/2 ? k0_start + nbatch_K2 : DKQ/2; - const int k0_diff = k0_stop - k0_start; if constexpr (nstages <= 1) { + const int k0_diff = k0_stop - k0_start; constexpr bool use_cp_async = nstages == 1; flash_attn_ext_f16_load_tile (K_h2 + int64_t(k_VKQ_0)*stride_K + k0_start, tile_K, k0_diff, stride_K, k_VKQ_sup); @@ -640,6 +639,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( } } } else { + constexpr int stride_tile_Q = DKQ/2 + 4; #pragma unroll for (int k_KQ_0 = k0_start; k_KQ_0 < k0_stop; k_KQ_0 += T_A_KQ::J) { load_ldmatrix(Q_B[0], tile_Q + (threadIdx.y / np)*(T_B_KQ::I*stride_tile_Q) + k_KQ_0, stride_tile_Q); @@ -954,9 +954,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int i0_start = 0; i0_start < DV; i0_start += 2*nbatch_V2) { static_assert(DV % (2*nbatch_V2) == 0, "bad loop size"); const int i0_stop = i0_start + 2*nbatch_V2; - const int i0_diff = i0_stop - i0_start; if constexpr (nstages <= 1) { + const int i0_diff = i0_stop - i0_start; if (!V_is_K_view || i0_stop > 2*nbatch_K2) { constexpr bool use_cp_async = nstages == 1; flash_attn_ext_f16_load_tile @@ -1724,6 +1724,7 @@ static __global__ void flash_attn_ext_f16( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { + ggml_cuda_pdl_sync(); // TODO optimize placement #if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)) // Skip unused kernel variants for faster compilation: diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh index 7b0a5e5cf49..fac76f13593 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cuh +++ b/ggml/src/ggml-cuda/fattn-tile.cuh @@ -894,6 +894,8 @@ static __global__ void flash_attn_tile( } float KQ_sum[cpw] = {0.0f}; + ggml_cuda_pdl_sync(); + // Load Q data, convert to FP16 if fast: #pragma unroll for (int jc0 = 0; jc0 < cpw; ++jc0) { diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index f0bd42a5761..b0a6cf67f1a 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -40,6 +40,7 @@ static __global__ void flash_attn_ext_vec( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { + ggml_cuda_pdl_lc(); #ifdef FLASH_ATTN_AVAILABLE // Skip unused kernel variants for faster compilation: @@ -136,6 +137,8 @@ static __global__ void flash_attn_ext_vec( #endif // V_DOT2_F32_F16_AVAILABLE int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)]; float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)]; + + ggml_cuda_pdl_sync(); if constexpr (Q_q8_1) { #pragma unroll for (int j0 = 0; j0 < ncols; j0 += nwarps) { diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index f19defbff93..4b6f6501094 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -86,6 +86,7 @@ static __global__ void flash_attn_ext_f16( constexpr int kqs_padded = FATTN_KQ_STRIDE + 8; constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half); + ggml_cuda_pdl_sync(); const int sequence = blockIdx.z / ne02; const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. diff --git a/ggml/src/ggml-cuda/fwht.cu b/ggml/src/ggml-cuda/fwht.cu new file mode 100644 index 00000000000..184dc254c72 --- /dev/null +++ b/ggml/src/ggml-cuda/fwht.cu @@ -0,0 +1,101 @@ +#include "common.cuh" +#include "fwht.cuh" + +template +__launch_bounds__(4*ggml_cuda_get_physical_warp_size(), 1) +__global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows, const float scale) { + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + + const int64_t r = (int64_t) blockIdx.x * blockDim.y + threadIdx.y; + + if (r >= n_rows) { + return; + } + + src += r * N; + dst += r * N; + + static constexpr int el_w = N / warp_size; + float reg[el_w]; + const int lane = threadIdx.x; + + ggml_cuda_pdl_sync(); +#pragma unroll + for (int i = 0; i < el_w; ++i) { + reg[i] = src[i * warp_size + lane] * scale; + } + +#pragma unroll + for (int h = 1; h < warp_size; h *= 2) { +#pragma unroll + for (int j = 0; j < el_w; j++) { + const float val = reg[j]; + const float val2 = __shfl_xor_sync(0xFFFFFFFF, val, h, warp_size); + + reg[j] = (lane & h) == 0 ? val + val2 : val2 - val; + } + } + +#pragma unroll + for (int h = warp_size; h < N; h *= 2) { + const int step = h / warp_size; +#pragma unroll + for (int j = 0; j < el_w; j += 2 * step) { +#pragma unroll + for (int k = 0; k < step; k++) { + const float x = reg[j + k]; + const float y = reg[j + k + step]; + + reg[j + k] = x + y; + reg[j + k + step] = x - y; + } + } + } + +#pragma unroll + for (int i = 0; i < el_w; ++i) { + dst[i * warp_size + lane] = reg[i]; + } +} + +bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src, dst)); + if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) { + return false; + } + const int n = src->ne[0]; + const int64_t rows = ggml_nrows(src); + + const float * src_d = (const float *) src->data; + float * dst_d = (float *) dst->data; + + const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size; + const int rows_per_block = 4; + + const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block; + + cudaStream_t stream = ctx.stream(); + dim3 grid_dims(num_blocks, 1, 1); + dim3 block_dims(warp_size, rows_per_block, 1); + const ggml_cuda_kernel_launch_params launch_params = + ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream); + + const float scale = 1 / sqrtf(n); + + switch (n) { + case 64: + ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale); + return true; + case 128: + ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale); + return true; + case 256: + ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale); + return true; + case 512: + ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale); + return true; + default: + return false; + } +} diff --git a/ggml/src/ggml-cuda/fwht.cuh b/ggml/src/ggml-cuda/fwht.cuh new file mode 100644 index 00000000000..cf3df94cafa --- /dev/null +++ b/ggml/src/ggml-cuda/fwht.cuh @@ -0,0 +1,4 @@ +#include "common.cuh" + +// Returns whether the Fast Walsh-Hadamard transform could be used. +bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu index 6b44bec7317..7cfda652367 100644 --- a/ggml/src/ggml-cuda/gated_delta_net.cu +++ b/ggml/src/ggml-cuda/gated_delta_net.cu @@ -1,6 +1,7 @@ #include "gated_delta_net.cuh" +#include "ggml-cuda/common.cuh" -template +template __global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2) gated_delta_net_cuda(const float * q, const float * k, @@ -23,7 +24,8 @@ gated_delta_net_cuda(const float * q, int64_t sb3, const uint3 neqk1_magic, const uint3 rq3_magic, - float scale) { + float scale, + int K) { const uint32_t h_idx = blockIdx.x; const uint32_t sequence = blockIdx.y; // each warp owns one column, using warp-level primitives to reduce across rows @@ -37,9 +39,12 @@ gated_delta_net_cuda(const float * q, float * attn_data = dst; float * state = dst + attn_score_elems; - const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v; - state += state_offset; - curr_state += state_offset + col * S_v; + // input state layout (D, K, n_seqs) โ€” seq stride is K * D = K * H * S_v * S_v. + // output state layout (per-slot D * n_seqs) โ€” same per-(seq,head) offset as before. + const int64_t state_in_offset = sequence * K * H * S_v * S_v + h_idx * S_v * S_v; + const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v; + state += state_out_offset; + curr_state += state_in_offset + col * S_v; attn_data += (sequence * n_tokens * H + h_idx) * S_v; constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v; @@ -48,6 +53,7 @@ gated_delta_net_cuda(const float * q, float s_shard[rows_per_lane]; // state is stored transposed: M[col][i] = S[i][col], row col is contiguous + ggml_cuda_pdl_sync(); #pragma unroll for (int r = 0; r < rows_per_lane; r++) { const int i = r * warp_size + lane; @@ -135,17 +141,35 @@ gated_delta_net_cuda(const float * q, } attn_data += S_v * H; + + if constexpr (keep_rs_t) { + // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots + // are written; earlier slots are left untouched (caller-owned). + const int shift = (int) n_tokens - K; + + const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output + const int target_slot = t - shift; + if (target_slot >= 0 && target_slot < K) { + float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset; +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + curr_state[col * S_v + i] = s_shard[r]; + } + } + } } - // Write state back to global memory (transposed layout) + if constexpr (!keep_rs_t) { #pragma unroll - for (int r = 0; r < rows_per_lane; r++) { - const int i = r * warp_size + lane; - state[col * S_v + i] = s_shard[r]; + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + state[col * S_v + i] = s_shard[r]; + } } } -template +template static void launch_gated_delta_net( const float * q_d, const float * k_d, const float * v_d, const float * g_d, const float * b_d, const float * s_d, @@ -155,7 +179,7 @@ static void launch_gated_delta_net( int64_t sv1, int64_t sv2, int64_t sv3, int64_t sb1, int64_t sb2, int64_t sb3, int64_t neqk1, int64_t rq3, - float scale, cudaStream_t stream) { + float scale, int K, cudaStream_t stream) { //TODO: Add chunked kernel for even faster pre-fill const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size; const int num_warps = 4; @@ -167,31 +191,32 @@ static void launch_gated_delta_net( int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream); switch (S_v) { case 16: - gated_delta_net_cuda<16, KDA><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); break; case 32: - gated_delta_net_cuda<32, KDA><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); break; case 64: { - gated_delta_net_cuda<64, KDA><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); break; } case 128: { - gated_delta_net_cuda<128, KDA><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); break; } default: @@ -261,13 +286,29 @@ void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * cudaStream_t stream = ctx.stream(); + // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count. + const int K = (int) src_state->ne[1]; + const bool keep_rs = K > 1; + if (kda) { - launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, - S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1, rq3, scale, stream); + if (keep_rs) { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } else { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } } else { - launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, - S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1, rq3, scale, stream); + if (keep_rs) { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } else { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } } } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 36b840e8148..457b695eb2a 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -11,6 +11,7 @@ static __global__ void k_get_rows( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { + ggml_cuda_pdl_sync(); for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) { for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. @@ -48,6 +49,8 @@ static __global__ void k_get_rows_float( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { + ggml_cuda_pdl_lc(); + ggml_cuda_pdl_sync(); for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) { for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. @@ -83,6 +86,7 @@ static __global__ void k_get_rows_back_float( float sum = 0.0f; + ggml_cuda_pdl_sync(); for (int64_t i = 0; i < nrows_grad; ++i) { if (rows[i] != dst_row) { continue; @@ -156,7 +160,8 @@ static void get_rows_cuda_float( GGML_ASSERT(ne11 <= std::numeric_limits::max() / ne12); const uint3 ne12_fdv = init_fastdiv_values(ne12); - k_get_rows_float<<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{block_nums, block_dims, 0, stream}; + ggml_cuda_kernel_launch(k_get_rows_float, launch_params, src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10,*/ ne11, ne12_fdv, /*ne13,*/ diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e25be3592fd..18aaa098398 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -24,6 +24,7 @@ #include "ggml-cuda/diagmask.cuh" #include "ggml-cuda/diag.cuh" #include "ggml-cuda/fattn.cuh" +#include "ggml-cuda/fwht.cuh" #include "ggml-cuda/getrows.cuh" #include "ggml-cuda/im2col.cuh" #include "ggml-cuda/mmf.cuh" @@ -2569,6 +2570,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0); use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false); use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]); + use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } } else { @@ -2577,6 +2579,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0); use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false); use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]); + use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } @@ -2594,6 +2597,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc); bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32; + const int32_t hint = ggml_get_op_params_i32(dst, 1); + if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) { + return; + } + if (!split && use_mul_mat_vec_f) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) @@ -4986,8 +4994,14 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * } static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { - GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; + ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context; + + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device)); + + return prop.integrated + ? GGML_BACKEND_DEVICE_TYPE_IGPU + : GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu index 49af5389957..a8f6046e46d 100644 --- a/ggml/src/ggml-cuda/mean.cu +++ b/ggml/src/ggml-cuda/mean.cu @@ -67,9 +67,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132 if ((nrows / nsm) < 2) { const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } else { const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } } diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index c2a8d54c95a..d55cc1ec7b5 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -91,7 +91,7 @@ static __global__ void mul_mat_f( const int row0 = blockIdx.x * rows_per_block; int expert_idx = 0; - int col_base = 0; + [[maybe_unused]] int col_base = 0; const int channel_dst = has_ids ? 0 : blockIdx.y; @@ -122,12 +122,12 @@ static __global__ void mul_mat_f( ids += col_offset * stride_row_id; } - const float2 * y2 = (const float2 *) y; + [[maybe_unused]] const float2 * y2 = (const float2 *) y; extern __shared__ char data_mmv[]; char * shmem_base = data_mmv; - int * slot_map = (int *) shmem_base; + [[maybe_unused]] int * slot_map = (int *) shmem_base; char * compute_base = has_ids ? (shmem_base + GGML_PAD(cols_per_block, 16) * sizeof(int)) : shmem_base; tile_C C[ntA][ntB]; diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index d9147202429..3d6de64b775 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -21,6 +21,7 @@ static __global__ void mul_mat_vec_f( int channel_y; int sample_dst; + ggml_cuda_pdl_sync(); if constexpr (is_multi_token_id) { // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case token_idx = blockIdx.z; @@ -79,9 +80,8 @@ static __global__ void mul_mat_vec_f( gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; } - const int channel_bias = ids ? channel_x : channel_dst; - if constexpr (has_fusion) { + const int channel_bias = ids ? channel_x : channel_dst; if (use_bias) { x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst; } @@ -94,7 +94,7 @@ static __global__ void mul_mat_vec_f( extern __shared__ char data_mmv[]; float * buf_iw = (float *) data_mmv; - float * buf_iw_gate = nullptr; + [[maybe_unused]] float * buf_iw_gate = nullptr; if constexpr (has_fusion) { buf_iw_gate = (float *) (data_mmv + warp_size*sizeof(float)); } @@ -122,7 +122,7 @@ static __global__ void mul_mat_vec_f( if constexpr (std::is_same_v) { const float2 * x2 = (const float2 *) x; - const float2 * gate_x2 = nullptr; + [[maybe_unused]] const float2 * gate_x2 = nullptr; if constexpr (has_fusion) { if (use_gate) { gate_x2 = (const float2 *) gate_x; @@ -154,7 +154,7 @@ static __global__ void mul_mat_vec_f( } } else if constexpr (std::is_same_v) { const half2 * x2 = (const half2 *) x; - const half2 * gate_x2 = nullptr; + [[maybe_unused]] const half2 * gate_x2 = nullptr; if constexpr (has_fusion) { if (use_gate) { gate_x2 = (const half2 *) gate_x; @@ -265,7 +265,7 @@ static __global__ void mul_mat_vec_f( } #else const nv_bfloat162 * x2 = (const nv_bfloat162 *) x; - const nv_bfloat162 * gate_x2 = nullptr; + [[maybe_unused]] const nv_bfloat162 * gate_x2 = nullptr; if constexpr (has_fusion) { if (use_gate) { gate_x2 = (const nv_bfloat162 *) gate_x; @@ -273,7 +273,7 @@ static __global__ void mul_mat_vec_f( } for (int col2 = tid; col2 < ncols2; col2 += block_size) { const nv_bfloat162 tmpx = x2[col2]; - nv_bfloat162 tmpx_gate; + [[maybe_unused]] nv_bfloat162 tmpx_gate; if constexpr (has_fusion) { if (use_gate) { tmpx_gate = gate_x2[col2]; @@ -298,6 +298,7 @@ static __global__ void mul_mat_vec_f( static_assert(std::is_same_v, "unsupported type"); } + ggml_cuda_pdl_lc(); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { sumf[j] = warp_reduce_sum(sumf[j]); @@ -382,11 +383,13 @@ static void mul_mat_vec_f_switch_fusion( const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) { + const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, nbytes_shared, stream}; + const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, + ggml_cuda_kernel_launch(mul_mat_vec_f, launch_params, + x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; @@ -395,8 +398,8 @@ static void mul_mat_vec_f_switch_fusion( GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, + ggml_cuda_kernel_launch(mul_mat_vec_f, launch_params, + x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index da48f313a38..86b4a493019 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -63,6 +63,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) { enum mmvq_parameter_table_id { MMVQ_PARAMETERS_GENERIC = 0, + MMVQ_PARAMETERS_TURING, MMVQ_PARAMETERS_GCN, MMVQ_PARAMETERS_RDNA2, MMVQ_PARAMETERS_RDNA3_0, @@ -78,6 +79,8 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() { return MMVQ_PARAMETERS_RDNA2; #elif defined(GCN) || defined(CDNA) return MMVQ_PARAMETERS_GCN; +#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && __CUDA_ARCH__ < GGML_CUDA_CC_AMPERE + return MMVQ_PARAMETERS_TURING; #else return MMVQ_PARAMETERS_GENERIC; #endif @@ -96,6 +99,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) { return MMVQ_PARAMETERS_GCN; } + if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING && ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_AMPERE) { + return MMVQ_PARAMETERS_TURING; + } return MMVQ_PARAMETERS_GENERIC; } @@ -271,6 +277,53 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) { return MMVQ_MAX_BATCH_SIZE; } +bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) { + if (GGML_CUDA_CC_IS_CDNA(cc)) { + if (GGML_CUDA_CC_IS_CDNA1(cc)) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return ne11 <= 7; + case GGML_TYPE_Q5_1: + return ne11 <= 7; + case GGML_TYPE_Q8_0: + return ne11 <= 6; + case GGML_TYPE_Q2_K: + return ne11 <= 4; + case GGML_TYPE_Q3_K: + return ne11 <= 3; + case GGML_TYPE_Q4_K: + return ne11 <= 2; + case GGML_TYPE_Q5_K: + return ne11 <= 3; + case GGML_TYPE_Q6_K: + return ne11 <= 4; + case GGML_TYPE_IQ1_S: + return ne11 <= 5; + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_XS: + return ne11 <= 6; + default: + return ne11 <= MMVQ_MAX_BATCH_SIZE; + } + } + switch (type) { // tuned for CDNA2 + case GGML_TYPE_Q2_K: + return ne11 <= 5; + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return ne11 <= 3; + case GGML_TYPE_Q6_K: + return ne11 <= 5; + default: + return ne11 <= MMVQ_MAX_BATCH_SIZE; + } + } + return ne11 <= MMVQ_MAX_BATCH_SIZE; +} + // Device constexpr: returns the max batch size for the current arch+type at compile time. template static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() { @@ -359,7 +412,9 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_K: + return 8; case GGML_TYPE_Q6_K: + return 2; case GGML_TYPE_IQ4_NL: return 8; default: @@ -368,11 +423,38 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d } return 1; } + if (table_id == MMVQ_PARAMETERS_TURING) { + if (ncols_dst == 1) { + switch (type) { + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return 2; + default: + return 4; + } + } + switch (ncols_dst) { + case 2: + case 3: + case 4: + return 4; + case 5: + case 6: + case 7: + case 8: + return 2; + default: + return 1; + } + } return 1; } static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id, bool small_k = false, int nwarps = 1) { - if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) { + if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN || table_id == MMVQ_PARAMETERS_TURING) { switch (ncols_dst) { case 1: return small_k ? nwarps : 1; @@ -422,6 +504,7 @@ static __global__ void mul_mat_vec_q( uint32_t channel_y; uint32_t sample_dst; + ggml_cuda_pdl_sync(); channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio); channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst; sample_dst = blockIdx.z; @@ -432,7 +515,7 @@ static __global__ void mul_mat_vec_q( bool use_gate = false; bool use_bias = false; bool use_gate_bias = false; - const void * vgate = nullptr; + [[maybe_unused]] const void * vgate = nullptr; const float * x_bias = nullptr; const float * gate_bias = nullptr; ggml_glu_op active_glu; @@ -448,8 +531,8 @@ static __global__ void mul_mat_vec_q( } - float x_biases[ncols_dst] = { 0.0f }; - float gate_biases[ncols_dst] = { 0.0f }; + [[maybe_unused]] float x_biases[ncols_dst] = { 0.0f }; + [[maybe_unused]] float gate_biases[ncols_dst] = { 0.0f }; if constexpr (has_fusion) { const uint32_t channel_bias = ids ? channel_x : channel_dst; if (use_bias) { @@ -506,12 +589,7 @@ static __global__ void mul_mat_vec_q( } __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; - __shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; - if constexpr (!has_fusion) { - (void) tmp_shared_gate; - } else if (!use_gate) { - (void) tmp_shared_gate; - } + [[maybe_unused]] __shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; if (threadIdx.y > 0) { #pragma unroll @@ -681,8 +759,9 @@ static void mul_mat_vec_q_switch_fusion( const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (c_ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_q<<>> - (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream); + ggml_cuda_kernel_launch(mul_mat_vec_q, launch_params, + vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; @@ -691,8 +770,9 @@ static void mul_mat_vec_q_switch_fusion( GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_q<<>> - (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream); + ggml_cuda_kernel_launch(mul_mat_vec_q, launch_params, + vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); } diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh index 6bf0a8e8677..5605bf7a4e6 100644 --- a/ggml/src/ggml-cuda/mmvq.cuh +++ b/ggml/src/ggml-cuda/mmvq.cuh @@ -2,6 +2,8 @@ #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. +bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11); + // Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID, // based on the quantization type and GPU architecture (compute capability). int get_mmvq_mmid_max_batch(ggml_type type, int cc); diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index ef98f675aa7..09d9f3a7d62 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -18,6 +18,7 @@ static __global__ void norm_f32( float2 mean_var = make_float2(0.0f, 0.0f); + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; mean_var.x += xi; @@ -46,6 +47,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr float tmp = 0.0f; // partial sum for thread in warp + ggml_cuda_pdl_sync(); for (int j = start; j < end; j += block_size) { tmp += x[j]; } @@ -95,6 +97,7 @@ static __global__ void rms_norm_f32(const float * x, const uint3 add_nrows_packed = make_uint3(0, 0, 0), const uint3 add_nchannels_packed = make_uint3(0, 0, 0), const uint3 add_nsamples_packed = make_uint3(0, 0, 0)) { + ggml_cuda_pdl_lc(); const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -124,6 +127,7 @@ static __global__ void rms_norm_f32(const float * x, float tmp = 0.0f; // partial sum for thread in warp + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; @@ -163,6 +167,7 @@ static __global__ void rms_norm_back_f32( float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xfi = xf[col]; sum_xx += xfi * xfi; @@ -253,6 +258,7 @@ static __global__ void l2_norm_f32( float tmp = 0.0f; // partial sum for thread in warp + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; @@ -261,6 +267,7 @@ static __global__ void l2_norm_f32( // sum up partial sums extern __shared__ float s_sum[]; tmp = block_reduce(tmp, s_sum); + ggml_cuda_pdl_lc(); // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html const float scale = rsqrtf(fmaxf(tmp, eps * eps)); @@ -300,10 +307,19 @@ static void rms_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = {blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, false>, launch_params, + x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, false>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } } @@ -346,14 +362,20 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, - mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); + mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, - mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); + mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } } else { const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols); @@ -367,14 +389,16 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims,block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, true, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed, add_nchannels_packed, add_nsamples_packed); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, true, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, true, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed, @@ -399,10 +423,12 @@ static void l2_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); - l2_norm_f32<<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, 0, stream}; + ggml_cuda_kernel_launch(l2_norm_f32, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } else { const dim3 block_dims(1024, 1, 1); - l2_norm_f32<1024><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(l2_norm_f32<1024>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 52f664719ae..49516965cad 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -6,6 +6,7 @@ static __global__ void quantize_q8_1( const float * __restrict__ x, void * __restrict__ vy, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, const int64_t ne0, const uint32_t ne1, const uint3 ne2) { + ggml_cuda_pdl_lc(); const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= ne0) { @@ -28,6 +29,7 @@ static __global__ void quantize_q8_1( const int64_t ib = i_cont / QK8_1; // block index const int64_t iqs = i_cont % QK8_1; // quant index + ggml_cuda_pdl_sync(); const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -196,6 +198,7 @@ static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x, const int64_t i2 = blockIdx.z % ne2; const int64_t i3 = blockIdx.z / ne2; + ggml_cuda_pdl_sync(); const int64_t i01 = ids ? ids[i1] : i1; const int64_t i02 = i2; const int64_t i03 = i3; @@ -288,6 +291,7 @@ static __global__ void quantize_mmq_q8_1( const int64_t i3 = blockIdx.z / ne2; const int64_t i00 = i0; + ggml_cuda_pdl_sync(); const int64_t i01 = ids ? ids[i1] : i1; const int64_t i02 = i2; const int64_t i03 = i3; @@ -378,7 +382,8 @@ void quantize_row_q8_1_cuda( const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, block_size, 0, stream); + ggml_cuda_kernel_launch(quantize_q8_1, launch_params, x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv); GGML_UNUSED(type_src0); } diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh index de240fd4413..5895d3bf8e5 100644 --- a/ggml/src/ggml-cuda/reduce_rows.cuh +++ b/ggml/src/ggml-cuda/reduce_rows.cuh @@ -10,6 +10,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r const int num_unroll = 8; float temp[num_unroll]; float sum_temp[num_unroll] = { 0.0f }; + + ggml_cuda_pdl_sync(); for (int i = col; i < ncols;) { for (int j = 0; j < num_unroll; ++j) { if (i < ncols) { diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 45a49a5dc2a..e20a5cb6bed 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -134,6 +134,7 @@ static __global__ void rope_neox(const T * x, const float * freq_factors, const int64_t * row_indices, const int set_rows_stride) { + ggml_cuda_pdl_lc(); const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne00) { @@ -148,6 +149,7 @@ static __global__ void rope_neox(const T * x, int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; + ggml_cuda_pdl_sync(); // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. @@ -216,6 +218,7 @@ static __global__ void rope_multi(const T * x, int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; + ggml_cuda_pdl_sync(); if (i0 >= n_dims) { dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; dst[idst + i0/2 + 1] = x[ix + i0/2 + 1]; @@ -300,6 +303,7 @@ static __global__ void rope_vision(const T * x, int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; + ggml_cuda_pdl_sync(); const int sect_dims = sections.v[0] + sections.v[1]; const int sec_w = sections.v[1] + sections.v[0]; const int sector = (i0 / 2) % sect_dims; @@ -399,13 +403,14 @@ static void rope_neox_cuda(const T * x, const dim3 block_nums(nr, n_blocks_x, 1); const float theta_scale = powf(freq_base, -2.0f / n_dims); + const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, 0, stream}; if (freq_factors == nullptr) { - rope_neox<<>>( + ggml_cuda_kernel_launch(rope_neox, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } else { - rope_neox<<>>( + ggml_cuda_kernel_launch(rope_neox, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } @@ -443,11 +448,13 @@ static void rope_multi_cuda(const T * x, const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { - rope_multi<<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(rope_multi, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } else { - rope_multi<<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(rope_multi, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index 0ddeff6a175..7b2e59a4383 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -3,9 +3,11 @@ #define MAX_GRIDDIM_X 0x7FFFFFFF static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) { + ggml_cuda_pdl_lc(); int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x; int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x; + ggml_cuda_pdl_sync(); for (int64_t i = tid; i < nelements; i += stride) { dst[i] = scale * x[i] + bias; } @@ -13,7 +15,8 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) { const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; - scale_f32<<>>(x, dst, scale, bias, nelements); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(scale_f32, launch_params, x, dst, scale, bias, nelements); } void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 631de7e8fa5..e14f96b824c 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -53,6 +53,7 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0, const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd); const int64_t i10 = i01; + ggml_cuda_pdl_sync(); const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; @@ -157,7 +158,9 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0, const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd); const int64_t i10 = i01; + ggml_cuda_pdl_sync(); const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + ggml_cuda_pdl_lc(); const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3; @@ -203,9 +206,11 @@ static void set_rows_cuda( const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11); const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12); - k_set_rows<<>>(src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, - s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd, - ne11_fd, ne12_fd); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_size, block_size, 0, stream); + ggml_cuda_kernel_launch(k_set_rows, launch_params, + src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, + s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd, + ne11_fd, ne12_fd); } } diff --git a/ggml/src/ggml-cuda/softcap.cu b/ggml/src/ggml-cuda/softcap.cu index 40dfe45d65c..9f0fa1051cf 100644 --- a/ggml/src/ggml-cuda/softcap.cu +++ b/ggml/src/ggml-cuda/softcap.cu @@ -1,18 +1,21 @@ #include "softcap.cuh" static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) { + ggml_cuda_pdl_lc(); const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { return; } + ggml_cuda_pdl_sync(); dst[i] = tanhf(scale * x[i]) * softcap; } static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE; - softcap_f32<<>>(x, dst, scale, softcap, k); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(softcap_f32, launch_params, x, dst, scale, softcap, k); } // fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 4841389fbc8..48787b4b890 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -1,3 +1,4 @@ +#include "common.cuh" #include "ssm-conv.cuh" #include "unary.cuh" @@ -7,6 +8,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t n_t) { + ggml_cuda_pdl_lc(); GGML_UNUSED(src0_nb0); const int tid = threadIdx.x; const int bidx = blockIdx.x; @@ -23,6 +25,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float float x[d_conv] = { 0.0f }; float w[d_conv] = { 0.0f }; + ggml_cuda_pdl_sync(); #pragma unroll for (size_t j = 0; j < d_conv; j++) { w[j] = w_block[tid * stride_w + j]; @@ -128,8 +131,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa constexpr int kNC = decltype(NC)::value; if (n_t <= 32) { const dim3 blocks(n_s, (nr + threads - 1) / threads, 1); - ssm_conv_f32<<>>(src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1, - dst, dst_nb0, dst_nb1, dst_nb2, n_t); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream); + ggml_cuda_kernel_launch(ssm_conv_f32, launch_params, src0, src1, bias, src0_nb0, src0_nb1, + src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { const int64_t split_n_t = 32; dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); @@ -140,11 +144,12 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa }; switch (nc) { - case 3: launch_kernel(std::integral_constant{}); break; - case 4: launch_kernel(std::integral_constant{}); break; - case 5: launch_kernel(std::integral_constant{}); break; - case 9: launch_kernel(std::integral_constant{}); break; - default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now."); + case 3: launch_kernel(std::integral_constant{}); break; + case 4: launch_kernel(std::integral_constant{}); break; + case 5: launch_kernel(std::integral_constant{}); break; + case 9: launch_kernel(std::integral_constant{}); break; + case 15: launch_kernel(std::integral_constant{}); break; + default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9, 15 right now."); } } diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index c1d4e2bc8df..412980376ac 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -26,6 +26,7 @@ __global__ void __launch_bounds__(splitD, 1) const int64_t s_off, const int64_t d_inner, const int64_t L_param) { const size_t L = L_template == 0 ? L_param : L_template; + ggml_cuda_pdl_sync(); const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2); const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float)); const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float)); @@ -135,6 +136,7 @@ __global__ void __launch_bounds__(d_state, 1) const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float); + ggml_cuda_pdl_sync(); // TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state); const float * x_warp = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float))); @@ -206,7 +208,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa constexpr int num_warps = threads/WARP_SIZE; const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1); - ssm_scan_f32_group<128/WARP_SIZE, 128><<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream); + ggml_cuda_kernel_launch(ssm_scan_f32_group<128/WARP_SIZE, 128>, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok); @@ -215,7 +218,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa constexpr int num_warps = threads/WARP_SIZE; const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1); - ssm_scan_f32_group<256/WARP_SIZE, 256><<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream); + ggml_cuda_kernel_launch(ssm_scan_f32_group<256/WARP_SIZE, 256>, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok); @@ -231,58 +235,59 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1); const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float); if (d_state == 16) { + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, smem_size, stream); switch (n_tok) { case 1: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 2: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 3: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 4: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 5: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 6: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 7: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 8: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; default: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu index 4025771aadb..0003658ca95 100644 --- a/ggml/src/ggml-cuda/sumrows.cu +++ b/ggml/src/ggml-cuda/sumrows.cu @@ -7,10 +7,12 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int const dim3 block_nums(nrows, 1, 1); if ((nrows / nsm) < 2) { const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(x, dst, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, x, dst, ncols); } else { const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(x, dst, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, x, dst, ncols); } } @@ -34,10 +36,12 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if ((nrows / nsm) < 2) { // Increase num threads to 512 for small nrows to better hide the latency const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } else { // Enough active SMs to hide latency, use smaller blocks to allow better scheduling const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } } diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu index 59ce36fb1c9..db1d39e2dc7 100644 --- a/ggml/src/ggml-cuda/top-k.cu +++ b/ggml/src/ggml-cuda/top-k.cu @@ -5,6 +5,7 @@ # include # if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2) # define CUB_TOP_K_AVAILABLE +# include using namespace cub; # endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2 #endif // GGML_CUDA_USE_CUB diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index 3020e5c7433..c4253bfa43b 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -105,6 +105,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * wt[i] = -INFINITY; } + ggml_cuda_pdl_sync(); #pragma unroll for (int i = 0; i < n_experts; i += WARP_SIZE) { const int expert = i + threadIdx.x; @@ -133,7 +134,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * // selection_wt is only needed when bias is present (selection uses wt + bias) // when no bias, we use wt directly for both selection and weight values - float selection_wt[has_bias ? experts_per_thread : 1]; + [[maybe_unused]] float selection_wt[has_bias ? experts_per_thread : 1]; if constexpr (has_bias) { #pragma unroll @@ -161,6 +162,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * output_weights[i] = 0.f; } + ggml_cuda_pdl_lc(); for (int k = 0; k < n_expert_used; k++) { float max_val = wt[0]; int max_expert = threadIdx.x; @@ -271,51 +273,52 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx, dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1); dim3 block_dims(WARP_SIZE, rows_per_block, 1); cudaStream_t stream = ctx.stream(); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream); switch (n_expert) { case 1: - topk_moe_cuda<1, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<1, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 2: - topk_moe_cuda<2, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<2, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 4: - topk_moe_cuda<4, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<4, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 8: - topk_moe_cuda<8, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<8, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 16: - topk_moe_cuda<16, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<16, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 32: - topk_moe_cuda<32, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<32, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 64: - topk_moe_cuda<64, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<64, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 128: - topk_moe_cuda<128, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<128, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 256: - topk_moe_cuda<256, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<256, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 512: - topk_moe_cuda<512, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<512, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 576: - topk_moe_cuda<576, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<576, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; default: GGML_ASSERT(false && "fatal error"); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 2aeba26f414..4cb805fa601 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -116,19 +116,22 @@ static __device__ __forceinline__ float op_trunc(float x) { template static __global__ void unary_op_kernel(const T * x, T * dst, const int k) { + ggml_cuda_pdl_lc(); const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { return; } + ggml_cuda_pdl_sync(); dst[i] = (T)op((float)x[i]); } template static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE; - unary_op_kernel<<>>(x, dst, k); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(unary_op_kernel, launch_params, x, dst, k); } template @@ -258,6 +261,7 @@ void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { template static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) { + ggml_cuda_pdl_lc(); const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x; if (i >= k) { @@ -268,13 +272,15 @@ static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t j0 = (i / n) * o0 + (i % n); const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + ggml_cuda_pdl_sync(); dst[i] = (T)(op((float)x[j0]) * (float)g[j1]); } template static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) { const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE; - unary_gated_op_kernel<<>>(x, g, dst, k, n, o0, o1); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(unary_gated_op_kernel, launch_params, x, g, dst, k, n, o0, o1); } template diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3d1c9da8329..48ded82e83c 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -39,7 +39,7 @@ #include "ggml-hexagon.h" #include "ggml-impl.h" #include "ggml-quants.h" -#include "op-desc.h" +#include "htp-opnode.h" #include "htp-ops.h" #include "htp_iface.h" #include "htp-drv.h" @@ -68,6 +68,7 @@ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C } static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE; static int opt_opbatch = 1024; // max number of ops in a batch static int opt_opqueue = 16; // max number of pending batches +static int opt_oppoll = 0; // polling for batch completions static std::regex* opt_opfilter = NULL; // regex of ops to not claim @@ -101,23 +102,23 @@ static const char * status_to_str(uint32_t status) { // ** debug helpers -static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const ggml_tensor * op, const uint32_t req_flags) { +static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_opnode & node, const uint32_t req_flags) { if (!opt_verbose) return; - op_desc desc(op); + htp_opformat fmt(node); GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, req_flags); + node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags); } static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) { if (!opt_verbose) return; - op_desc desc(op); + htp_opformat fmt(htp_opformat(htp_opnode{const_cast(op), {}, HTP_OP_INVALID})); GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, supp ? "yes" : "no"); + ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no"); } -static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op, +static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node, uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) { if (!opt_profile) return; @@ -128,15 +129,16 @@ static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_t pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]); } - op_desc desc(op); + htp_opformat fmt(node); GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str); + node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str); } // ** backend sessions struct ggml_hexagon_opbatch; struct ggml_hexagon_opqueue; +struct htp_opnode; struct ggml_hexagon_session { std::string name; @@ -166,7 +168,7 @@ struct ggml_hexagon_session { void allocate(int dev_id) noexcept(false); void release() noexcept(true); - void enqueue_op(htp_op_code opcode, const ggml_tensor *op); + void enqueue_op(const htp_opnode & node); void flush(bool all = true); void flush_pending(bool all = false); @@ -550,7 +552,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) size_t row_size = ggml_row_size(t->type, t->ne[0]); size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. @@ -611,7 +613,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) size_t row_size = ggml_row_size(t->type, t->ne[0]); size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) // Ensure we don't try to copy more data than the tensor actually contains. const size_t total_tensor_size = (size_t)nrows * row_size; @@ -660,6 +662,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) ggml_aligned_free(buf_rp, row_size_rp); } +static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) { + static const int qk = QK4_1; + + for (unsigned int i = 0; i < qk / 2; ++i) { + const int x0 = (x->qs[i] & 0x0F); + const int x1 = (x->qs[i] >> 4); + qs[bi * qk + i + 0] = x0; + qs[bi * qk + i + qk / 2] = x1; + } +} + +static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) { + static const int qk = QK4_1; + + for (unsigned int i = 0; i < qk / 2; ++i) { + const uint8_t x0 = qs[bi * qk + i + 0]; + const uint8_t x1 = qs[bi * qk + i + qk / 2]; + x->qs[i] = x0 | (x1 << 4); + } +} + +static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) { + static const int qk = QK_Q4_0x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + const int nloe = k % qk; // leftovers + + const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes + const int qblk_size = qk / 2; // int4 = 128 bytes + const int qrow_size = k / 2; // int4 (not padded to blocks) + + uint8_t * y_q = y + 0; // quants first + uint8_t * y_d = y + qrow_size; // then scales/offsets + + // Repack the quants + for (int i = 0; i < nb; i++) { + uint8_t qs[QK_Q4_0x4x2]; // unpacked quants + unpack_q4_1_quants(qs, &x[i * 8 + 0], 0); + unpack_q4_1_quants(qs, &x[i * 8 + 1], 1); + unpack_q4_1_quants(qs, &x[i * 8 + 2], 2); + unpack_q4_1_quants(qs, &x[i * 8 + 3], 3); + unpack_q4_1_quants(qs, &x[i * 8 + 4], 4); + unpack_q4_1_quants(qs, &x[i * 8 + 5], 5); + unpack_q4_1_quants(qs, &x[i * 8 + 6], 6); + unpack_q4_1_quants(qs, &x[i * 8 + 7], 7); + + bool partial = (nloe && i == nb-1); + + uint8_t * q = y_q + (i * qblk_size); + for (int j = 0; j < qk / 2; j++) { + q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000]; + } + } + + // Repack the scales and offsets + for (int i = 0; i < nb; i++) { + ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size); + for (int j = 0; j < 8; j++) { + d_m[j * 2 + 0] = x[i * 8 + j].d; + d_m[j * 2 + 1] = x[i * 8 + j].m; + } + } +} + +static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) { + static const int qk = QK_Q4_0x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + const int nloe = k % qk; // leftovers + + const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes + const int qblk_size = qk / 2; // int4 = 128 bytes + const int qrow_size = k / 2; // int4 (not padded to blocks) + + const uint8_t * y_q = y + 0; // quants first + const uint8_t * y_d = y + qrow_size; // then scales/offsets + + // Unpack the quants + for (int i = 0; i < nb; i++) { + uint8_t qs[QK_Q4_0x4x2]; + bool partial = (nloe && i == nb-1); + + const uint8_t * q = y_q + (i * qblk_size); + for (int j = 0; j < qk / 2; j++) { + if (partial) { + qs[j*2+0] = q[j] & 0x0F; + qs[j*2+1] = q[j] >> 4; + } else { + qs[j+000] = q[j] & 0x0F; + qs[j+128] = q[j] >> 4; + } + } + + pack_q4_1_quants(&x[i * 8 + 0], qs, 0); + pack_q4_1_quants(&x[i * 8 + 1], qs, 1); + pack_q4_1_quants(&x[i * 8 + 2], qs, 2); + pack_q4_1_quants(&x[i * 8 + 3], qs, 3); + pack_q4_1_quants(&x[i * 8 + 4], qs, 4); + pack_q4_1_quants(&x[i * 8 + 5], qs, 5); + pack_q4_1_quants(&x[i * 8 + 6], qs, 6); + pack_q4_1_quants(&x[i * 8 + 7], qs, 7); + } + + // Unpack the scales and offsets + for (int i = 0; i < nb; i++) { + const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size); + for (int j = 0; j < 8; j++) { + x[i * 8 + j].d = d_m[j * 2 + 0]; + x[i * 8 + j].m = d_m[j * 2 + 1]; + } + } +} + +static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) { + static const int qk = QK_Q4_0x4x2; + const int nb = (k + qk - 1) / qk; // number of blocks (padded) + + uint8_t qs[QK_Q4_0x4x2]; // unpacked quants + memset(qs, 0, sizeof(qs)); + + for (int i = 0; i < nb; i++) { + pack_q4_1_quants(&x[i * 8 + 0], qs, 0); + pack_q4_1_quants(&x[i * 8 + 1], qs, 1); + pack_q4_1_quants(&x[i * 8 + 2], qs, 2); + pack_q4_1_quants(&x[i * 8 + 3], qs, 3); + pack_q4_1_quants(&x[i * 8 + 4], qs, 4); + pack_q4_1_quants(&x[i * 8 + 5], qs, 5); + pack_q4_1_quants(&x[i * 8 + 6], qs, 6); + pack_q4_1_quants(&x[i * 8 + 7], qs, 7); + } + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < 8; j++) { + x[i * 8 + j].d = 0; + x[i * 8 + j].m = 0; + } + } +} + +static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) + + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); + + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + t->ne[0], nrows, row_size); + + init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]); + + for (int64_t i = 0; i < n_full_rows; i++) { + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + memcpy(buf_pd, src, row_size); + repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, row_size); + } + + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]); + memcpy(buf_pd, src, n_rem_bytes); + repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]); + memcpy(dst, buf_rp, n_rem_bytes); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + +static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) { + int64_t nrows = ggml_nrows(t); + + size_t row_size = ggml_row_size(t->type, t->ne[0]); + size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) + + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + + void * buf_pd = ggml_aligned_malloc(row_size_pd); + GGML_ASSERT(buf_pd != NULL); + + void * buf_rp = ggml_aligned_malloc(row_size_rp); + GGML_ASSERT(buf_rp != NULL); + + HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, + t->ne[0], nrows, row_size); + + memset(buf_rp, 0, row_size_rp); // clear-out padded buffer to make sure the tail is all zeros + + for (int64_t i = 0; i < n_full_rows; i++) { + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + memcpy(buf_rp, src, row_size); + unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]); + memcpy(dst, buf_pd, row_size); + } + + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + // We still need to read and unpack the entire source row because quantization is block-based. + memcpy(buf_rp, src, row_size); + unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]); + memcpy(dst, buf_pd, n_rem_bytes); + } + + ggml_aligned_free(buf_pd, row_size_pd); + ggml_aligned_free(buf_rp, row_size_rp); +} + // ======== Q8x4x2 ==================== static void dump_block_q8_0(const block_q8_0 * b, int i) { HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2], @@ -876,7 +1111,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) size_t row_size = ggml_row_size(t->type, t->ne[0]); size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. @@ -937,7 +1172,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) size_t row_size = ggml_row_size(t->type, t->ne[0]); size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales) // Ensure we don't try to copy more data than the tensor actually contains. const size_t total_tensor_size = (size_t)nrows * row_size; @@ -1238,7 +1473,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si size_t row_size = ggml_row_size(t->type, t->ne[0]); size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad - size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) // Ensure we don't try to read more data than is available in the source buffer 'data' // or write more than the tensor can hold. @@ -1299,7 +1534,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si size_t row_size = ggml_row_size(t->type, t->ne[0]); size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad - size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) // Ensure we don't try to copy more data than the tensor actually contains. const size_t total_tensor_size = (size_t)nrows * row_size; @@ -1365,6 +1600,12 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, repack_q4_0_q4x4x2(tensor, data, size); break; + case GGML_TYPE_Q4_1: + GGML_ASSERT(offset == 0); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); + repack_q4_1_q4x4x2(tensor, data, size); + break; + case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); @@ -1407,6 +1648,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, repack_q4x4x2_q4_0(data, tensor, size); break; + case GGML_TYPE_Q4_1: + GGML_ASSERT(offset == 0); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); + repack_q4x4x2_q4_1(data, tensor, size); + break; + case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); @@ -1536,12 +1783,10 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host, }; -// Backend session implementation - struct ggml_hexagon_opbatch { ggml_hexagon_session* sess; - std::vector ops; // pointers to original ops + std::vector ops; // htp_opnode of ops std::vector h_bufs; // htp buffer descriptors std::vector h_tens; // htp tensor descriptors @@ -1673,7 +1918,7 @@ struct ggml_hexagon_opbatch { return ti; } - bool fit_op(const struct ggml_tensor *t) const { + bool fit_op(const htp_opnode & node) const { if (n_ops >= n_ops_max ) return false; // check how much extras we will need @@ -1693,10 +1938,10 @@ struct ggml_hexagon_opbatch { } }; - for (unsigned int i=0; i < HTP_OP_MAX_INPUTS && t->src[i]; i++) { - fit_tensor(t->src[i]); + for (const auto * src : node.get_inputs()) { + fit_tensor(src); } - fit_tensor(t); + fit_tensor(node.dst()); if ((extra_bufs + n_bufs) > n_bufs_max) return false; if ((extra_tens + n_tens) > n_tens_max) return false; @@ -1706,29 +1951,30 @@ struct ggml_hexagon_opbatch { } // assumes that fit_op() was called first and returned true - void add_op(htp_op_code opcode, const struct ggml_tensor * t) { + void add_op(const htp_opnode & node) { // Add new op unsigned int n = n_ops++; GGML_ASSERT(n_ops <= n_ops_max); - ops[n] = t; + ops[n] = node; htp_op_desc &o = h_ops[n]; - memcpy(&o.params, &t->op_params, sizeof(t->op_params)); - o.opcode = opcode; + memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params)); + o.opcode = node.opcode; o.flags = 0; if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) { o.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags); + ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags); + auto inputs = node.get_inputs(); for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) { - o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff; + o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff; } - o.dst = add_tensor(t); + o.dst = add_tensor(node.dst()); } }; @@ -1737,7 +1983,7 @@ struct ggml_hexagon_opqueue { ggml_hexagon_shared_buffer *shm_buf; size_t shm_blk_size; - using opvec = std::vector; + using opvec = std::vector; std::queue done; // completed batch ids std::vector op_cache; // per batch op cache @@ -1886,7 +2132,8 @@ void ggml_hexagon_session::flush_pending(bool all) { uint32_t n_dbufs; // Read response packet from queue - int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, DSPQUEUE_TIMEOUT); + const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT; + int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo); if (err == AEE_EEXPIRED) { continue; } @@ -1935,11 +2182,11 @@ void ggml_hexagon_session::flush_batch() { } } -void ggml_hexagon_session::enqueue_op(htp_op_code opcode, const ggml_tensor *op) { - if (!op_batch->fit_op(op)) { +void ggml_hexagon_session::enqueue_op(const htp_opnode & node) { + if (!op_batch->fit_op(node)) { flush_batch(); } - op_batch->add_op(opcode, op); + op_batch->add_op(node); } // Flush HTP response queue i.e wait for all outstanding requests to complete @@ -2290,6 +2537,7 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses const int64_t H = v->ne[1]; const int64_t n_tokens = v->ne[2]; const int64_t n_seqs = v->ne[3]; + const int64_t K = state->ne[1]; if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) { return false; @@ -2302,10 +2550,10 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) { return false; } - if (ggml_nelements(state) != S_v * S_v * H * n_seqs) { + if (ggml_nelements(state) != S_v * S_v * H * n_seqs * K) { return false; } - if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) { + if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) { return false; } @@ -2327,6 +2575,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s switch (src0->type) { case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q8_0: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: @@ -2377,6 +2626,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session switch (src0->type) { case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q8_0: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: @@ -2661,7 +2911,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess int mode = op_params[2]; - if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) { + if (mode == GGML_ROPE_TYPE_VISION) { return false; } if (mode & 1) { @@ -2735,12 +2985,25 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * if (dst->ne[0] != d_inner || dst->ne[1] != n_t || dst->ne[2] != n_s) { return false; } + if (src0->nb[0] != sizeof(float) || src1->nb[0] != sizeof(float) || dst->nb[0] != sizeof(float)) { + return false; + } + if (src0->nb[1] != src0->ne[0] * sizeof(float) || src1->nb[1] != src1->ne[0] * sizeof(float)) { + return false; + } - // TODO: add support for non-contiguous tensors - if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { + return true; +} + +static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { return false; } + GGML_UNUSED(sess); return true; } @@ -2816,6 +3079,21 @@ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * return true; } +static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32) { return false; } + if (dst->type != GGML_TYPE_F32) { return false; } + if (!ggml_are_same_shape(src0, dst)) { return false; } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; } + + return true; + + GGML_UNUSED(sess); +} + static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { auto sess = static_cast(backend->context); return sess->c_name(); @@ -2843,8 +3121,10 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS; case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS; case GGML_OP_ARGSORT: return HTP_OP_ARGSORT; + case GGML_OP_NORM: return HTP_OP_NORM; case GGML_OP_L2_NORM: return HTP_OP_L2_NORM; case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM; + case GGML_OP_CONCAT: return HTP_OP_CONCAT; case GGML_OP_SCALE: return HTP_OP_SCALE; case GGML_OP_SQR: return HTP_OP_SQR; case GGML_OP_SQRT: return HTP_OP_SQRT; @@ -2857,6 +3137,9 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_FILL: return HTP_OP_FILL; case GGML_OP_DIAG: return HTP_OP_DIAG; case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; + case GGML_OP_TRI: return HTP_OP_TRI; + case GGML_OP_PAD: return HTP_OP_PAD; + case GGML_OP_UNARY: switch (ggml_get_unary_op(t)) { case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU; @@ -2896,10 +3179,43 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes); + std::vector nodes; + nodes.reserve(graph->n_nodes); + + // Fusion for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * n = graph->nodes[i]; - if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) { - sess->enqueue_op(op_remap_to_htp(n), n); + if (!op_is_compute(n)) { + continue; + } + + ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr; + + htp_opnode node = { + /*.node =*/ n, + /*.fused =*/ {}, + /*.opcode =*/ HTP_OP_INVALID + }; + + if (n->op == GGML_OP_RMS_NORM && next_node) { + if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + node.add_fused(next_node); + node.opcode = HTP_OP_RMS_NORM_MUL; + i++; // skip the fused MUL node + } + } + + if (node.opcode == HTP_OP_INVALID) { + node.opcode = op_remap_to_htp(n); + } + + nodes.push_back(std::move(node)); + } + + // Queue and execute + if (opt_opstage & HTP_OPSTAGE_QUEUE) { + for (const auto & node : nodes) { + sess->enqueue_op(node); } } @@ -2918,51 +3234,7 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) { sess->flush(); } -struct node_info { - ggml_tensor * node; - - std::vector fused; - - ggml_op op() const { - return node->op; - } - - const ggml_tensor * dst() const { - return fused.empty() ? node : fused.back(); - } - - const ggml_tensor * src0() const { - return node->src[0]; - } - - const ggml_tensor * src1() const { - return node->src[1]; - } - - bool is_empty() const { - return ggml_op_is_empty(node->op); - } - - void add_fused(ggml_tensor * t) { - fused.push_back(t); - } - - bool stackable() const { - switch (this->op()) { - case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: - return ggml_is_quantized(this->src0()->type); - default: - return false; - } - } - - bool same_input(const node_info& n) const { - return n.src1() == this->src1(); - } -}; - -static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { +static std::vector ggml_hexagon_graph_optimize_reorder(const std::vector & nodes) { const int n = nodes.size(); std::vector res; @@ -3016,14 +3288,14 @@ static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgr enum ggml_op ops[MAX_FUSE]; - std::vector nodes; + std::vector nodes; nodes.reserve(gf->n_nodes); // fuse nodes: // we don't want to make reorders that break fusing, so we first pack all fusable tensors // and perform the reorder over the fused nodes. after the reorder is done, we unfuse for (int i = 0; i < n; i++) { - node_info node = { + htp_opnode node = { /*.node =*/gf->nodes[i], /*.fused =*/{}, }; @@ -3254,6 +3526,25 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se return true; } +static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + int dim = ((const int32_t *) op->op_params)[0]; + if (dim < 0 || dim >= GGML_MAX_DIMS) { + return false; + } + + for (int i = 0; i < GGML_MAX_SRC; ++i) { + const struct ggml_tensor * src = op->src[i]; + if (!src) { + continue; + } + if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) { + return false; + } + } + + return true; +} + static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { const struct ggml_tensor * dst = op; @@ -3308,10 +3599,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_add_id(sess, op); break; + case GGML_OP_NORM: case GGML_OP_L2_NORM: - supp = ggml_hexagon_supported_unary(sess, op); - break; - case GGML_OP_RMS_NORM: case GGML_OP_SCALE: supp = ggml_hexagon_supported_unary(sess, op); @@ -3404,6 +3693,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_cumsum(sess, op); break; + case GGML_OP_CONCAT: + supp = ggml_hexagon_supported_concat(sess, op); + break; + case GGML_OP_FILL: supp = ggml_hexagon_supported_fill(sess, op); break; @@ -3416,6 +3709,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_solve_tri(sess, op); break; + case GGML_OP_TRI: + supp = ggml_hexagon_supported_tri(sess, op); + break; + + case GGML_OP_PAD: + supp = ggml_hexagon_supported_pad(sess, op); + break; + default: break; } @@ -3560,6 +3861,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { // Basic sanity checks to make sure definitions match static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0, "please update hexagon_type to match ggml_type"); + static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1, + "please update hexagon_type to match ggml_type"); static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0, "please update hexagon_type to match ggml_type"); static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4, @@ -3572,6 +3875,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE"); const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH"); const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE"); + const char * str_oppoll = getenv("GGML_HEXAGON_OPPOLL"); const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER"); const char * str_profile = getenv("GGML_HEXAGON_PROFILE"); const char * str_etm = getenv("GGML_HEXAGON_ETM"); @@ -3609,6 +3913,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage; opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; + opt_oppoll = str_oppoll ? strtoul(str_oppoll, NULL, 0) : opt_oppoll; opt_profile = str_profile ? atoi(str_profile) : 0; opt_etm = str_etm ? atoi(str_etm) : 0; opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; diff --git a/ggml/src/ggml-hexagon/htp-opnode.h b/ggml/src/ggml-hexagon/htp-opnode.h new file mode 100644 index 00000000000..14b232240b4 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-opnode.h @@ -0,0 +1,241 @@ +#ifndef HTP_OPNODE_H +#define HTP_OPNODE_H + +#define GGML_COMMON_IMPL_CPP +#include "ggml-backend-impl.h" +#include "ggml-common.h" + +#include +#include +#include +#include "htp-ops.h" + +struct htp_opnode { + ggml_tensor * node = nullptr; + + std::vector fused; + + htp_op_code opcode = HTP_OP_INVALID; + + ggml_op op() const { + return node->op; + } + + const ggml_tensor * dst() const { + return fused.empty() ? node : fused.back(); + } + + const ggml_tensor * src0() const { + return node->src[0]; + } + + const ggml_tensor * src1() const { + return node->src[1]; + } + + bool is_empty() const { + return ggml_op_is_empty(node->op); + } + + void add_fused(ggml_tensor * t) { + fused.push_back(t); + } + + bool stackable() const { + switch (this->op()) { + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + return ggml_is_quantized(this->src0()->type); + default: + return false; + } + } + + bool same_input(const htp_opnode& n) const { + return n.src1() == this->src1(); + } + + std::vector get_inputs() const { + std::vector inputs; + std::vector outputs; + outputs.push_back(node); + for (const auto * f : fused) { + outputs.push_back(f); + } + + auto contains = [&](const std::vector & vec, const ggml_tensor * t) { + for (const auto * x : vec) { + if (x == t) return true; + } + return false; + }; + + auto add_input = [&](const ggml_tensor * t) { + if (t && !contains(outputs, t) && !contains(inputs, t)) { + inputs.push_back(t); + } + }; + + for (int i = 0; i < GGML_MAX_SRC && node->src[i]; i++) { + add_input(node->src[i]); + } + for (const auto * f : fused) { + for (int i = 0; i < GGML_MAX_SRC && f->src[i]; i++) { + add_input(f->src[i]); + } + } + return inputs; + } + + std::string op_name() const { + if (fused.empty()) { + return ggml_op_desc(node); + } + std::string name = ggml_op_desc(node); + for (const auto * f : fused) { + name += "+"; + name += ggml_op_desc(f); + } + return name; + } +}; + +struct htp_opformat { + char strides[64 * GGML_MAX_SRC]; + char dims[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + int format_tensor_dims(char * str, const struct ggml_tensor * t) { + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); + } else { + return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + } + } + + void format_op_dims(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += format_tensor_dims(p, inputs[0]); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += format_tensor_dims(p, inputs[i]); + } + + p += sprintf(p, " -> "); + } + + char self[64]; + format_tensor_dims(self, node.dst()); + p += sprintf(p, "%s", self); + } + + int format_tensor_strides(char * str, const struct ggml_tensor * t) { + const char * c = ggml_is_contiguous(t) ? "" : "!"; + + if (t->ne[2] == 1 && t->ne[3] == 1) { + return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); + } else { + return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c); + } + } + + void format_op_strides(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += format_tensor_strides(p, inputs[0]); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += format_tensor_strides(p, inputs[i]); + } + + p += sprintf(p, " -> "); + } + + char self[64]; + format_tensor_strides(self, node.dst()); + p += sprintf(p, "%s", self); + } + + void format_op_types(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += sprintf(p, "%s", ggml_type_name(inputs[0]->type)); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", ggml_type_name(inputs[i]->type)); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", ggml_type_name(node.dst()->type)); + } + + const char * tensor_buff_name(const struct ggml_tensor * t) { + if (t->buffer) { + return ggml_backend_buffer_name(t->buffer); + } + return "NONE"; + } + + void format_op_buffs(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += sprintf(p, "%s", tensor_buff_name(inputs[0])); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", tensor_buff_name(inputs[i])); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", tensor_buff_name(node.dst())); + } + + void format_op_names(char * str, const htp_opnode & node) { + char * p = str; + auto inputs = node.get_inputs(); + + if (!inputs.empty()) { + p += sprintf(p, "%s", inputs[0]->name); + + for (size_t i = 1; i < inputs.size(); i++) { + p += sprintf(p, " x "); + p += sprintf(p, "%s", inputs[i]->name); + } + + p += sprintf(p, " -> "); + } + + p += sprintf(p, "%s", node.dst()->name); + } + + void format(const htp_opnode & node) { + format_op_dims(dims, node); + format_op_strides(strides, node); + format_op_types(types, node); + format_op_buffs(buffs, node); + format_op_names(names, node); + } + + htp_opformat() {} + htp_opformat(const htp_opnode & node) { format(node); } +}; + +#endif // HTP_OPNODE_H diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index bcadac11f95..ff3fc0804e3 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -35,9 +35,11 @@ add_library(${HTP_LIB} SHARED ssm-conv.c cumsum-ops.c fill-ops.c + concat-ops.c diag-ops.c solve-tri-ops.c gated-delta-net-ops.c + pad-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE @@ -56,15 +58,16 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx) if (_hmx_idx GREATER_EQUAL 0) target_sources(${HTP_LIB} PRIVATE - hmx-queue.c - hmx-matmul-ops.c hmx-flash-attn-ops.c + hmx-matmul-ops.c + hmx-queue.c ) # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h) set_source_files_properties( - hmx-matmul-ops.c hmx-flash-attn-ops.c + hmx-matmul-ops.c + hmx-queue.c PROPERTIES COMPILE_OPTIONS "-mhmx" ) diff --git a/ggml/src/ggml-hexagon/htp/concat-ops.c b/ggml/src/ggml-hexagon/htp/concat-ops.c new file mode 100644 index 00000000000..61580f2c08f --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/concat-ops.c @@ -0,0 +1,275 @@ +#include "htp-ctx.h" +#include "htp-ops.h" +#include "hexagon_types.h" +#include "hexagon_protos.h" +#include "hvx_hexagon_protos.h" +#include "hex-dma.h" +#include "vtcm-utils.h" +#include "hvx-utils.h" +#include "hex-fastdiv.h" +#include + +struct htp_concat_context { + struct htp_ops_context * octx; + uint32_t dim; + uint32_t nrows_per_thread; + struct fastdiv_values div_ne0; + struct fastdiv_values div_ne1; + struct fastdiv_values div_ne2; +}; + +static void concat_2d_f32_transposed(unsigned int nth, unsigned int ith, void * data) { + struct htp_concat_context * cctx = (struct htp_concat_context *) data; + struct htp_ops_context * octx = cctx->octx; + + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; + + const uint32_t src0_ne0 = src0->ne[0]; + const uint32_t src1_ne0 = src1->ne[0]; + const uint32_t ne1 = dst->ne[1]; + + const uint32_t start_i = ith * cctx->nrows_per_thread; + const uint32_t end_i = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1; + if (start_i >= end_i) return; + + dma_queue * q = octx->ctx->dma[ith]; + + uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; + uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread; + + const uint32_t block_i = 32; + const uint32_t spad1_stride = block_i * sizeof(float); + + int32_t offsets[32] __attribute__((aligned(128))); + for(int k=0; k<32; k++) { + offsets[k] = k * spad1_stride; + } + HVX_Vector vv = *(HVX_Vector*)offsets; + const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 32); + const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(float), VLEN); + uint32_t mu = src1_ne0_padded * spad1_stride; + + for (uint32_t i = start_i; i < end_i; i += block_i) { + uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i; + + uint32_t src1_width_bytes = current_block_i * sizeof(float); + uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1]; + dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0); + + uint32_t src0_row_bytes = src0_ne0 * sizeof(float); + uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1]; + dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i); + + dma_queue_pop(q); // src1 + + HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride); + + for (uint32_t j = 0; j < src1_ne0_padded; j += 32) { + #pragma unroll(4) + for (uint32_t ii = 0; ii < current_block_i; ii++) { + size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(float)); + Q6_vgather_ARMVw(&vtcm_tmp[ii], rt, mu, vv); + uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(float); + hvx_vmemu(dst_ptr) = vtcm_tmp[ii]; + } + } + + dma_queue_pop(q); // src0 + + uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1]; + dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(float), current_block_i); + + dma_queue_pop(q); + } +} + +static void concat_2d_f16_transposed(unsigned int nth, unsigned int ith, void * data) { + struct htp_concat_context * cctx = (struct htp_concat_context *) data; + struct htp_ops_context * octx = cctx->octx; + + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; + + const uint32_t src0_ne0 = src0->ne[0]; + const uint32_t src1_ne0 = src1->ne[0]; + const uint32_t ne1 = dst->ne[1]; + + const uint32_t start_i = ith * cctx->nrows_per_thread; + const uint32_t end_i = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1; + if (start_i >= end_i) return; + + dma_queue * q = octx->ctx->dma[ith]; + + uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; + uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread; + + const uint32_t block_i = 64; + const uint32_t spad1_stride = block_i * sizeof(__fp16); + + int16_t offsets[64] __attribute__((aligned(128))); + for(int k=0; k<64; k++) { + offsets[k] = k * spad1_stride; + } + HVX_Vector vv = *(HVX_Vector*)offsets; + const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 64); + const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(__fp16), VLEN); + uint32_t mu = src1_ne0_padded * spad1_stride; + + for (uint32_t i = start_i; i < end_i; i += block_i) { + uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i; + + uint32_t src1_width_bytes = current_block_i * sizeof(__fp16); + uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1]; + dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0); + + uint32_t src0_row_bytes = src0_ne0 * sizeof(__fp16); + uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1]; + dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i); + + dma_queue_pop(q); // src1 + + HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride); + + for (uint32_t j = 0; j < src1_ne0_padded; j += 64) { + #pragma unroll(4) + for (uint32_t ii = 0; ii < current_block_i; ii++) { + size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(__fp16)); + Q6_vgather_ARMVh(&vtcm_tmp[ii], rt, mu, vv); + uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(__fp16); + hvx_vmemu(dst_ptr) = vtcm_tmp[ii]; + } + } + + dma_queue_pop(q); // src0 + + uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1]; + dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(__fp16), current_block_i); + + dma_queue_pop(q); + } +} + +static void concat_generic(unsigned int nth, unsigned int ith, void * data) { + struct htp_concat_context * cctx = (struct htp_concat_context *) data; + struct htp_ops_context * octx = cctx->octx; + + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; + + const int dim = cctx->dim; + const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2; + + const uint32_t ne[4] = {dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]}; + const uint32_t total_elements = ne[0] * ne[1] * ne[2] * ne[3]; + const uint32_t chunk_size = (total_elements + nth - 1) / nth; + + const uint32_t start_idx = MIN(ith * chunk_size, total_elements); + const uint32_t end_idx = MIN(start_idx + chunk_size, total_elements); + + // Naive scalar element-wise copy + for (uint32_t idx = start_idx; idx < end_idx; idx++) { + uint32_t idx_div_ne0 = fastdiv(idx, &cctx->div_ne0); + uint32_t i0 = idx - idx_div_ne0 * ne[0]; + + uint32_t idx_div_ne01 = fastdiv(idx_div_ne0, &cctx->div_ne1); + uint32_t i1 = idx_div_ne0 - idx_div_ne01 * ne[1]; + + uint32_t idx_div_ne012 = fastdiv(idx_div_ne01, &cctx->div_ne2); + uint32_t i2 = idx_div_ne01 - idx_div_ne012 * ne[2]; + uint32_t i3 = idx_div_ne012; + + uint8_t * dst_ptr = (uint8_t *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2] + i1 * dst->nb[1] + i0 * dst->nb[0]; + + uint32_t idx_dim = 0; + if (dim == 0) idx_dim = i0; + else if (dim == 1) idx_dim = i1; + else if (dim == 2) idx_dim = i2; + else if (dim == 3) idx_dim = i3; + + const struct htp_tensor * src = (idx_dim < src0->ne[dim]) ? src0 : src1; + + uint32_t s0 = i0; + uint32_t s1 = i1; + uint32_t s2 = i2; + uint32_t s3 = i3; + + if (dim == 0 && src == src1) s0 -= src0->ne[0]; + if (dim == 1 && src == src1) s1 -= src0->ne[1]; + if (dim == 2 && src == src1) s2 -= src0->ne[2]; + if (dim == 3 && src == src1) s3 -= src0->ne[3]; + + uint8_t * src_ptr = (uint8_t *)src->data + s3 * src->nb[3] + s2 * src->nb[2] + s1 * src->nb[1] + s0 * src->nb[0]; + + if (type_size == 4) { + *(float*)dst_ptr = *(float*)src_ptr; + } else { + *(__fp16*)dst_ptr = *(__fp16*)src_ptr; + } + } +} + +int op_concat(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * src1 = octx->src[1]; + const struct htp_tensor * dst = octx->dst; + + int dim = octx->op_params[0]; + + bool is_2d = dst->ne[2] == 1 && dst->ne[3] == 1; + + const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2; + bool is_src1_transposed = (src1->nb[0] > src1->nb[1]); + bool is_src0_transposed = (src0->nb[0] > src0->nb[1]); + + uint32_t n_threads = octx->n_threads; + struct htp_concat_context cctx; + cctx.octx = octx; + cctx.dim = dim; + cctx.div_ne0 = init_fastdiv_values(dst->ne[0]); + cctx.div_ne1 = init_fastdiv_values(dst->ne[1]); + cctx.div_ne2 = init_fastdiv_values(dst->ne[2]); + + void (*worker_func)(unsigned int, unsigned int, void *) = concat_generic; + + if (dim == 0 && is_2d && is_src1_transposed && !is_src0_transposed) { + n_threads = MIN(dst->ne[1], n_threads); + if (n_threads < 1) { + n_threads = 1; + } + uint32_t block_i = (type_size == 4) ? 32 : 64; + + cctx.nrows_per_thread = hmx_ceil_div(dst->ne[1], n_threads); + + // Allocate VTCM + uint32_t spad1_stride = block_i * type_size; + + uint32_t src1_ne0_padded = hex_round_up(src1->ne[0], block_i); + uint32_t spad0_row_bytes = hex_round_up((src0->ne[0] + src1_ne0_padded) * type_size, VLEN); + + octx->src0_spad.size_per_thread = block_i * spad0_row_bytes; + octx->src1_spad.size_per_thread = src1_ne0_padded * spad1_stride + block_i * VLEN; + + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread; + + if (octx->src0_spad.size + octx->src1_spad.size > octx->ctx->vtcm_size) { + return HTP_STATUS_VTCM_TOO_SMALL; + } + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + + if (type_size == 4) { + worker_func = concat_2d_f32_transposed; + } else { + worker_func = concat_2d_f16_transposed; + } + } + + worker_pool_run_func(octx->ctx->worker_pool, worker_func, &cctx, n_threads); + return HTP_STATUS_OK; +} diff --git a/ggml/src/ggml-hexagon/htp/cpy-ops.c b/ggml/src/ggml-hexagon/htp/cpy-ops.c index 5c040a32224..ae507effa51 100644 --- a/ggml/src/ggml-hexagon/htp/cpy-ops.c +++ b/ggml/src/ggml-hexagon/htp/cpy-ops.c @@ -28,158 +28,170 @@ struct htp_copy_context { uint32_t dst_blocks_per_row; uint32_t src0_nrows_per_thread; - - void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith); }; #define cpy_preamble \ const struct htp_tensor *src0 = octx->src[0]; \ const struct htp_tensor *dst = octx->dst; \ \ - const uint32_t ne00 = src0->ne[0]; \ - const uint32_t ne01 = src0->ne[1]; \ - const uint32_t ne02 = src0->ne[2]; \ - const uint32_t ne03 = src0->ne[3]; \ - \ - const uint32_t nb00 = src0->nb[0]; \ - const uint32_t nb01 = src0->nb[1]; \ - const uint32_t nb02 = src0->nb[2]; \ - const uint32_t nb03 = src0->nb[3]; \ - \ - const uint32_t ne0 = dst->ne[0]; \ - const uint32_t ne1 = dst->ne[1]; \ - const uint32_t ne2 = dst->ne[2]; \ - const uint32_t ne3 = dst->ne[3]; \ - \ - const uint32_t nb0 = dst->nb[0]; \ - const uint32_t nb1 = dst->nb[1]; \ - const uint32_t nb2 = dst->nb[2]; \ - const uint32_t nb3 = dst->nb[3]; \ - \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; \ + \ const uint32_t nr = ne01; -static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { - cpy_preamble; - - // parallelize by src0 rows - const uint32_t dr = ct->src0_nrows_per_thread; - const uint32_t ir0 = dr * ith; - const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; - - // copy by rows - for (uint32_t i03 = 0; i03 < ne03; i03++) { - for (uint32_t i02 = 0; i02 < ne02; i02++) { - #pragma unroll(2) - for (uint32_t i01 = ir0; i01 < ir1; i01++) { - uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; - uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2); - hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size); - } - } - } +#define DEFINE_CPY_SAMESHAPE(NAME, ELEM_TYPE, ELEM_SIZE) \ +static void cpy_thread_##NAME##_sameshape(unsigned int nth, unsigned int ith, void * data) { \ + struct htp_copy_context * ct = (struct htp_copy_context *) data; \ + struct htp_ops_context * octx = ct->octx; \ + cpy_preamble; \ + const uint32_t dr = ct->src0_nrows_per_thread; \ + const uint32_t ir0 = dr * ith; \ + const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; \ + if (ir0 >= nr) return; \ + for (uint32_t i03 = 0; i03 < ne03; i03++) { \ + for (uint32_t i02 = 0; i02 < ne02; i02++) { \ + _Pragma("unroll(4)") \ + for (uint32_t i01 = ir0; i01 < ir1; i01++) { \ + uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; \ + uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; \ + hex_l2fetch(src0_ptr, ne00 * ELEM_SIZE, nb01, 2); \ + hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE); \ + } \ + } \ + } \ } -static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) { - cpy_preamble; - - // parallelize by src0 rows - const uint32_t dr = ct->src0_nrows_per_thread; - const uint32_t ir0 = dr * ith; - const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; - - // Fast path: when both src0 and dst are contiguous in memory - // Replace the element-by-element loop with a single bulk HVX copy per (i03, i02) slice. - const bool src0_contig = (nb00 == ct->src0_type_size) && - (nb01 == ne00 * nb00) && - (nb02 == ne01 * nb01) && - (nb03 == ne02 * nb02); - const bool dst_contig = (nb0 == ct->dst_type_size) && - (nb1 == ne0 * nb0) && - (nb2 == ne1 * nb1) && - (nb3 == ne2 * nb2); - - if (src0_contig && dst_contig) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01; - uint32_t flat = ((i03*ne02 + i02)*ne01 + ir0) * ne00; - uint8_t * dst_ptr = (uint8_t *) dst->data + flat * ct->src0_type_size; - hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ct->src0_type_size); - } - } - return; - } - - // dst counters - int64_t k10 = 0; - int64_t i11 = 0; - int64_t i12 = 0; - int64_t i13 = 0; - - // number of blocks in a row - const int64_t nk00 = ct->src0_blocks_per_row; - const int64_t nk0 = ct->dst_blocks_per_row; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - k10 += nk00 * ir0; - while (k10 >= nk0) { - k10 -= nk0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t k00 = 0; k00 < nk00; k00++) { - const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - memcpy(dst_ptr, src0_ptr, ct->dst_type_size); - - if (++k10 == nk0) { - k10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - k10 += nk00 * (ne01 - ir1); - while (k10 >= nk0) { - k10 -= nk0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } +DEFINE_CPY_SAMESHAPE(f32, float, 4) +DEFINE_CPY_SAMESHAPE(f16, __fp16, 2) + +#define DEFINE_CPY_RESHAPE(NAME, ELEM_TYPE, ELEM_SIZE) \ +static void cpy_thread_##NAME##_reshape(unsigned int nth, unsigned int ith, void * data) { \ + struct htp_copy_context * ct = (struct htp_copy_context *) data; \ + struct htp_ops_context * octx = ct->octx; \ + cpy_preamble; \ + const uint32_t dr = ct->src0_nrows_per_thread; \ + const uint32_t ir0 = dr * ith; \ + const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; \ + if (ir0 >= nr) return; \ + const bool src0_contig = (nb00 == ELEM_SIZE) && \ + (nb01 == ne00 * nb00) && \ + (nb02 == ne01 * nb01) && \ + (nb03 == ne02 * nb02); \ + const bool dst_contig = (nb0 == ELEM_SIZE) && \ + (nb1 == ne0 * nb0) && \ + (nb2 == ne1 * nb1) && \ + (nb3 == ne2 * nb2); \ + if (src0_contig && dst_contig) { \ + for (int64_t i03 = 0; i03 < ne03; i03++) { \ + for (int64_t i02 = 0; i02 < ne02; i02++) { \ + uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01; \ + uint32_t flat = ((i03*ne02 + i02)*ne01 + ir0) * ne00; \ + uint8_t * dst_ptr = (uint8_t *) dst->data + flat * ELEM_SIZE; \ + hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ELEM_SIZE); \ + } \ + } \ + return; \ + } \ + const bool reshape_flat_fast = (ne03 == 1 && ne2 == 1 && ne3 == 1) && \ + (ne0 == ne00 * ne01) && (ne1 == ne02) && \ + (nb00 == ELEM_SIZE) && (nb0 == ELEM_SIZE); \ + if (reshape_flat_fast) { \ + for (uint32_t i02 = 0; i02 < ne02; i02++) { \ + for (uint32_t i01 = ir0; i01 < ir1; i01++) { \ + uint8_t * src0_ptr = (uint8_t *) src0->data + i01 * nb01 + i02 * nb02; \ + uint8_t * dst_ptr = (uint8_t *) dst->data + i01 * ne00 * ELEM_SIZE + i02 * nb1; \ + hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE); \ + } \ + } \ + return; \ + } \ + int64_t k10 = 0; \ + int64_t i11 = 0; \ + int64_t i12 = 0; \ + int64_t i13 = 0; \ + const int64_t nk00 = ct->src0_blocks_per_row; \ + const int64_t nk0 = ct->dst_blocks_per_row; \ + for (int64_t i03 = 0; i03 < ne03; i03++) { \ + for (int64_t i02 = 0; i02 < ne02; i02++) { \ + k10 += nk00 * ir0; \ + while (k10 >= nk0) { \ + k10 -= nk0; \ + if (++i11 == ne1) { \ + i11 = 0; \ + if (++i12 == ne2) { \ + i12 = 0; \ + if (++i13 == ne3) { \ + i13 = 0; \ + } \ + } \ + } \ + } \ + for (int64_t i01 = ir0; i01 < ir1; i01++) { \ + for (int64_t k00 = 0; k00 < nk00; k00++) { \ + const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); \ + char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); \ + memcpy(dst_ptr, src0_ptr, ELEM_SIZE); \ + if (++k10 == nk0) { \ + k10 = 0; \ + if (++i11 == ne1) { \ + i11 = 0; \ + if (++i12 == ne2) { \ + i12 = 0; \ + if (++i13 == ne3) { \ + i13 = 0; \ + } \ + } \ + } \ + } \ + } \ + } \ + k10 += nk00 * (ne01 - ir1); \ + while (k10 >= nk0) { \ + k10 -= nk0; \ + if (++i11 == ne1) { \ + i11 = 0; \ + if (++i12 == ne2) { \ + i12 = 0; \ + if (++i13 == ne3) { \ + i13 = 0; \ + } \ + } \ + } \ + } \ + } \ + } \ } -static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { +DEFINE_CPY_RESHAPE(f32, float, 4) +DEFINE_CPY_RESHAPE(f16, __fp16, 2) + +static void cpy_thread_f16_f32_sameshape(unsigned int nth, unsigned int ith, void * data) { + struct htp_copy_context * ct = (struct htp_copy_context *) data; + struct htp_ops_context * octx = ct->octx; cpy_preamble; // parallelize by src0 rows const uint32_t dr = ct->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; + if (ir0 >= nr) return; // copy by rows for (uint32_t i03 = 0; i03 < ne03; i03++) { @@ -195,13 +207,16 @@ static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct ht } } -static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) { +static void cpy_thread_f32_f16_sameshape(unsigned int nth, unsigned int ith, void * data) { + struct htp_copy_context * ct = (struct htp_copy_context *) data; + struct htp_ops_context * octx = ct->octx; cpy_preamble; // parallelize by src0 rows const uint32_t dr = ct->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; + if (ir0 >= nr) return; // copy by rows for (uint32_t i03 = 0; i03 < ne03; i03++) { @@ -217,11 +232,6 @@ static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct ht } } -static void cpy_work_func(unsigned int n, unsigned int i, void *data) { - struct htp_copy_context *ct = (struct htp_copy_context *) data; - ct->copy(ct, ct->octx, n, i); -} - int op_cpy(struct htp_ops_context * octx) { cpy_preamble; @@ -254,22 +264,32 @@ int op_cpy(struct htp_ops_context * octx) { ct.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads; + worker_callback_t copy_fun; + if (sametype && sameshape) { - ct.copy = cpy_thread_sametype_sameshape; + if (src0->type == HTP_TYPE_F32) { + copy_fun = cpy_thread_f32_sameshape; + } else { + copy_fun = cpy_thread_f16_sameshape; + } } else if (sameshape) { /**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32) - ct.copy = cpy_thread_f16_f32_sameshape; + copy_fun = cpy_thread_f16_f32_sameshape; else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16) - ct.copy = cpy_thread_f32_f16_sameshape; + copy_fun = cpy_thread_f32_f16_sameshape; else return HTP_STATUS_NO_SUPPORT; } else if (sametype) { - ct.copy = cpy_thread_sametype_reshape; + if (src0->type == HTP_TYPE_F32) { + copy_fun = cpy_thread_f32_reshape; + } else { + copy_fun = cpy_thread_f16_reshape; + } } else { return HTP_STATUS_NO_SUPPORT; } - worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_threads); + worker_pool_run_func(octx->ctx->worker_pool, copy_fun, &ct, n_threads); return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c index d95df6ac9d5..1bd8c1407de 100644 --- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c @@ -22,6 +22,16 @@ // Must be multiple of 32 #define FLASH_ATTN_BLOCK_SIZE (32 * 2) +#if __HVX_ARCH__ < 79 +#define HVX_OP_ADD_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b)) +#define HVX_OP_SUB_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b)) +#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)) +#else +#define HVX_OP_ADD_F32(a, b) Q6_Vsf_vadd_VsfVsf(a, b) +#define HVX_OP_SUB_F32(a, b) Q6_Vsf_vsub_VsfVsf(a, b) +#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b) +#endif + // This is a bit of a hack because the compiler is strugling to properly inline // the default hvx_vec_f32_to_f16 with output into the local array. static __attribute__((noinline)) void hvx_vec_f32_to_f16_a(void *ptr, HVX_Vector v0, HVX_Vector v1) @@ -54,8 +64,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x_hf, y_hf); } - HVX_Vector rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p))); - rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum))); + HVX_Vector rsum = HVX_OP_ADD_F32(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p)); + rsum = HVX_OP_MUL_F32(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum)); hvx_vec_store_u(r, 4, rsum); } @@ -105,10 +115,10 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx4(const void * restrict y, rsum3_p = hvx_vec_mpyacc_f32_f16(rsum3_p, x3_hf, y_hf); } - HVX_Vector rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p))); - HVX_Vector rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p))); - HVX_Vector rsum2 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p))); - HVX_Vector rsum3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p))); + HVX_Vector rsum0 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p)); + HVX_Vector rsum1 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p)); + HVX_Vector rsum2 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p)); + HVX_Vector rsum3 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p)); HVX_Vector_x4 rsum0123 = { .v = { rsum0, rsum1, rsum2, rsum3 } }; return hvx_vec_reduce_sum_f32x4(rsum0123); @@ -123,7 +133,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y, const size_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors const size_t nloe = n % VLEN_FP16; // leftover elements - HVX_Vector sums; // initialize at j = 0 + HVX_Vector sums = Q6_V_vzero(); const size_t stride_x_4 = stride_x * 4; for (uint32_t j = 0; j < VLEN_FP32; j += 4) { HVX_Vector sums_x4 = hvx_dot_f16_f16_aa_rx4(y, x, stride_x, nvec, nloe); @@ -132,8 +142,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y, x += stride_x_4; } - sums = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), sums); - return Q6_Vsf_equals_Vqf32(sums); + return HVX_OP_MUL_F32(hvx_vec_splat_f32(s), sums); } // MAD: y (F32) += x (F16) * s (F16) @@ -268,11 +277,10 @@ static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t * uint32_t i = 0; #pragma unroll(4) for (; i < nvec; ++i) { - vdst[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs)); + vdst[i] = HVX_OP_MUL_F32(vsrc[i], vs); } if (nloe) { - HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs); - hvx_vec_store_a(&vdst[i], nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v)); + hvx_vec_store_a(&vdst[i], nloe * sizeof(float), HVX_OP_MUL_F32(vsrc[i], vs)); } } @@ -438,25 +446,44 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * // Process in sub-blocks of 32 (VLEN_FP32) HVX_Vector sb_scores[FLASH_ATTN_BLOCK_SIZE / VLEN_FP32]; HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY); - for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) { + for (uint32_t iv = 0; ic < current_block_size; ic += VLEN_FP32, ++iv) { // 1. Compute scores HVX_Vector scores = hvx_dot_f16_f16_aa_rx32(q_ptr_vtcm, k_base + ic * factx->size_k_row_padded, factx->size_k_row_padded, DK, factx->scale); // 2. Softcap if (factx->logit_softcap != 0.0f) { scores = hvx_vec_tanh_f32(scores); - scores = Q6_Vqf32_vmpy_VsfVsf(scores, logit_cap); - scores = Q6_Vsf_equals_Vqf32(scores); + scores = HVX_OP_MUL_F32(scores, logit_cap); } // 3. Mask if (mask) { const __fp16 * mp = m_base + ic; HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp; - HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec); - HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair); - scores = Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores); - scores = Q6_Vsf_equals_Vqf32(scores); + + // Multiplying -INFINITY (0xFC00) by a slope in VhfVhf instructions can incorrectly produce NaN on v79. + // Clamp -INFINITY to the max negative fp16 finite value (-65504.0f). + HVX_Vector vinf = Q6_Vh_vsplat_R(0xFC00); + HVX_Vector vmin = Q6_Vh_vsplat_R(0xFBFF); + HVX_VectorPred is_inf = Q6_Q_vcmp_eq_VhVh(m_vals_f16, vinf); + m_vals_f16 = Q6_V_vmux_QVV(is_inf, vmin, m_vals_f16); + + #if __HVX_ARCH__ >= 79 + HVX_VectorPair m_vals_f32_pair = Q6_Wsf_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec); + HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair); + scores = Q6_Vsf_vadd_VsfVsf(add_val, scores); + #else + HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec); + HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair); + scores = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores)); + #endif + } + + // Mask out invalid lanes for leftover handling + uint32_t valid_lanes = current_block_size - ic; + if (valid_lanes < VLEN_FP32) { + HVX_VectorPred valid_pred = Q6_Q_vsetq_R(valid_lanes * 4); // 4 bytes per fp32 lane + scores = Q6_V_vmux_QVV(valid_pred, scores, hvx_vec_splat_f32(-INFINITY)); } sb_scores[iv] = scores; @@ -466,78 +493,55 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * { // 4. Online Softmax Update HVX_Vector M_new_vec = Q6_Vsf_vmax_VsfVsf(v_max, M_vec); - HVX_Vector diff_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(M_vec, M_new_vec)); + HVX_Vector diff_vec = HVX_OP_SUB_F32(M_vec, M_new_vec); HVX_Vector ms_vec = hvx_vec_exp_f32(diff_vec); M_vec = M_new_vec; hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec); HVX_Vector p_sum_vec = hvx_vec_splat_f32(0.0f); - for (uint32_t ic2 = 0, iv = 0; ic2 + VLEN_FP32 <= current_block_size; ic2 += VLEN_FP32, ++iv) { + for (uint32_t ic2 = 0, iv = 0; ic2 < current_block_size; ic2 += VLEN_FP32, ++iv) { HVX_Vector scores = sb_scores[iv]; - HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_vec); - HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted)); + HVX_Vector scores_shifted = HVX_OP_SUB_F32(scores, M_vec); + HVX_Vector P = hvx_vec_exp_f32(scores_shifted); - p_sum_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(p_sum_vec, P)); + p_sum_vec = HVX_OP_ADD_F32(p_sum_vec, P); // 5. Accumulate V __fp16 __attribute__((aligned(VLEN))) p_arr[VLEN_FP16]; hvx_vec_f32_to_f16_a(p_arr, P, hvx_vec_splat_f32(0)); + float __attribute__((aligned(128))) P_arr[VLEN_FP32]; + hvx_vec_store_a(P_arr, 128, P); + for (uint32_t j = 0; j < VLEN_FP32; j += 2) { - const uint32_t cur_ic = ic2 + j; - const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded; + const uint32_t cur_ic = ic2 + j; + if (cur_ic >= current_block_size) { + break; + } + + if (cur_ic + 1 == current_block_size) { + // Odd leftover, process single row + if (P_arr[j] != 0.0f) { + const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded; + hvx_mad_f32_f16_aa(VKQ32, v_ptr, (p_arr + j), DV); + } + break; + } + + // Avoid NaN * 0.0 = NaN for uninitialized V cache rows. + // Check the f32 values to safely avoid strict aliasing violations. + if (P_arr[j] == 0.0f && P_arr[j + 1] == 0.0f) { + continue; + } + + const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded; hvx_mad_f32_f16_aa_rx2(VKQ32, v_ptr, v_ptr + factx->size_v_row_padded, (p_arr + j), (p_arr + j + 1), DV); } } p_sum_vec = hvx_vec_reduce_sum_f32(p_sum_vec); - S_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(S_vec, ms_vec)), p_sum_vec)); - } - - if (ic < current_block_size) { - // Sync scalars for leftover/next block if needed - float M = hvx_vec_get_f32(M_vec); - float S = hvx_vec_get_f32(S_vec); - - // Leftover - for (; ic < current_block_size; ++ic) { - float s_val; - const uint8_t * k_ptr = k_base + ic * factx->size_k_row_padded; - hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, factx->scale); - if (factx->logit_softcap != 0.0f) { - s_val = factx->logit_softcap * tanhf(s_val); - } - - if (mask) { - const float m_val = m_base[ic]; - s_val += slope * m_val; - } - - const float Mold = M; - __fp16 vs = 1.0f; - - if (s_val > M) { - M = s_val; - HVX_Vector diff_vec = hvx_vec_splat_f32(Mold - M); - HVX_Vector ms_vec = hvx_vec_exp_f32(diff_vec); - hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec); - - float ms = hvx_vec_get_f32(ms_vec); - S = S * ms + vs; - } else { - HVX_Vector diff_vec = hvx_vec_splat_f32(s_val - M); - vs = hvx_vec_get_f32(hvx_vec_exp_f32(diff_vec)); - S += vs; - } - - const uint8_t * v_ptr = v_base + ic * factx->size_v_row_padded; - - hvx_mad_f32_f16_aa(VKQ32, v_ptr, &vs, DV); - } - - M_vec = hvx_vec_splat_f32(M); - S_vec = hvx_vec_splat_f32(S); + S_vec = HVX_OP_ADD_F32(HVX_OP_MUL_F32(S_vec, ms_vec), p_sum_vec); } // Issue DMA for next+1 block (if exists) @@ -599,8 +603,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * const int i2 = iq2; const int i3 = iq3; - // dst is permuted - uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1; + // dst is permuted: [DV, n_heads, n_tokens, n_seq] + // head stride is nb[1], token stride is nb[2], batch stride is nb[3] + uint8_t * dst_ptr = (uint8_t *) dst->data + i2 * dst->nb[1] + i1 * dst->nb[2] + i3 * dst->nb[3]; if (dst->type == HTP_TYPE_F32) { hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV); @@ -623,8 +628,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { } #ifdef HTP_HAS_HMX - // HMX path: prefill (neq1 >= 32), head_dim multiple of 32, F16 KV - if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0 && q->ne[1] >= 32) { + // HMX path: head_dim multiple of 32, F16 KV + if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0) { int ret = hmx_flash_attn_ext(octx); if (ret == HTP_STATUS_OK) { return ret; diff --git a/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c b/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c index 2e84badc9b7..c4d08bb21c4 100644 --- a/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c +++ b/ggml/src/ggml-hexagon/htp/gated-delta-net-ops.c @@ -586,6 +586,7 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo const uint32_t H = v->ne[1]; const uint32_t n_tokens = v->ne[2]; const uint32_t n_seqs = v->ne[3]; + const uint32_t K = state->ne[1]; const uint32_t total_rows = H * n_seqs; if (ith >= total_rows) { @@ -606,6 +607,10 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128))); float local_sums[4] __attribute__((aligned(128))); + const uint64_t state_seq_stride = state->nb[2] / sizeof(float); + const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs; + const int64_t shift = (int64_t) n_tokens - (int64_t) K; + for (uint32_t ir = ith; ir < total_rows; ir += nth) { const uint32_t iv1 = ir % H; const uint32_t iv3 = ir / H; @@ -615,8 +620,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo const uint32_t iq3 = iv3 / rq3; const uint32_t ik3 = iv3 / rk3; - float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; - const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v; memcpy(s_out, s_in, gctx->state_bytes); float * s_work = s_out; @@ -689,6 +694,16 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo } } + if (K > 1) { + const int64_t target_slot = (int64_t) t - shift; + if (target_slot >= 0 && target_slot < (int64_t) K) { + float * curr_state_o = state_out_base + (uint64_t) target_slot * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + if (curr_state_o != s_work) { + memcpy(curr_state_o, s_work, gctx->state_bytes); + } + } + } + attn_data += (uint64_t) S_v * H; } } @@ -709,6 +724,7 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo const uint32_t S_v = v->ne[0]; const uint32_t H = v->ne[1]; const uint32_t n_seqs = v->ne[3]; + const uint32_t K = state->ne[1]; const uint32_t total_rows = H * n_seqs; if (ith >= total_rows) { @@ -736,6 +752,9 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith; } + const uint64_t state_seq_stride = state->nb[2] / sizeof(float); + const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs; + for (uint32_t ir = ith; ir < total_rows; ir += nth) { const uint32_t iv1 = ir % H; const uint32_t iv3 = ir / H; @@ -745,8 +764,8 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo const uint32_t iq3 = iv3 / rq3; const uint32_t ik3 = iv3 / rk3; - float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; - const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v; + const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v; float * s_work; if (spad) { @@ -901,6 +920,7 @@ int op_gated_delta_net(struct htp_ops_context * octx) { const uint32_t H = v->ne[1]; const uint32_t n_tokens = v->ne[2]; const uint32_t n_seqs = v->ne[3]; + const uint32_t K = state->ne[1]; if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) { return HTP_STATUS_NO_SUPPORT; @@ -913,10 +933,10 @@ int op_gated_delta_net(struct htp_ops_context * octx) { (n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) { return HTP_STATUS_NO_SUPPORT; } - if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) { + if (state->ne[0] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) { return HTP_STATUS_NO_SUPPORT; } - if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) { + if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) { return HTP_STATUS_NO_SUPPORT; } diff --git a/ggml/src/ggml-hexagon/htp/get-rows-ops.c b/ggml/src/ggml-hexagon/htp/get-rows-ops.c index 5a1dc933860..bf7063e9880 100644 --- a/ggml/src/ggml-hexagon/htp/get-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/get-rows-ops.c @@ -17,9 +17,13 @@ struct get_rows_context { struct htp_ops_context * octx; - uint32_t src1_nrows_per_thread; + uint32_t tasks_per_thread; + uint32_t total_tasks; + uint32_t chunks_per_row; + uint32_t chunk_size; struct fastdiv_values get_rows_div_ne10; struct fastdiv_values get_rows_div_ne10_ne11; + struct fastdiv_values get_rows_div_chunks_per_row; }; #define get_rows_preamble \ @@ -52,20 +56,23 @@ struct get_rows_context { \ const uint32_t nr = ne10 * ne11 * ne12; -static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) { +static void get_rows_thread_f32_f32_dma(unsigned int nth, unsigned int ith, void *data) { struct get_rows_context * grctx = (struct get_rows_context *)data; struct htp_ops_context * octx = grctx->octx; get_rows_preamble; uint64_t qt = HAP_perf_get_qtimer_count(); - // parallelize by src1 elements (which correspond to dst rows) - const uint32_t dr = grctx->src1_nrows_per_thread; + const uint32_t dr = grctx->tasks_per_thread; const uint32_t ir0 = dr * ith; - const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr; + if (ir0 >= grctx->total_tasks) { + return; + } + const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks); const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32); + dma_queue * dma_queue = octx->ctx->dma[ith]; for (uint32_t i = ir0; i < ir1; ++i) { const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11); const uint32_t rem = i - i12 * ne11 * ne10; @@ -73,28 +80,76 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da const uint32_t i10 = rem - i11 * ne10; const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12; - uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr; if (i01 >= ne01) { - // invalid index, skip for now to avoid crash continue; } const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03; const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3; - hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); + + while (!dma_queue_push(dma_queue, dma_make_ptr((void *)dst_ptr, (const void *)src0_ptr), nb1, nb01, ne00 * sizeof(float), 1)) { + dma_queue_pop(dma_queue); + } } + dma_queue_flush(dma_queue); qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt); - FARF(HIGH, "get-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + FARF(HIGH, "get-rows-f32-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt); } -int op_get_rows(struct htp_ops_context * octx) { +static void get_rows_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) { + struct get_rows_context * grctx = (struct get_rows_context *)data; + struct htp_ops_context * octx = grctx->octx; get_rows_preamble; - const uint32_t n_threads = MIN(nr, octx->n_threads); + uint64_t qt = HAP_perf_get_qtimer_count(); + + const uint32_t dr = grctx->tasks_per_thread; + const uint32_t ir0 = dr * ith; + if (ir0 >= grctx->total_tasks) { + return; + } + const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks); + + const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32); + + const uint32_t chunks_per_row = grctx->chunks_per_row; + const uint32_t chunk_size = grctx->chunk_size; + for (uint32_t i = ir0; i < ir1; ++i) { + const uint32_t row_idx = fastdiv(i, &grctx->get_rows_div_chunks_per_row); + const uint32_t chunk_idx = i - row_idx * chunks_per_row; + + const uint32_t i12 = fastdiv(row_idx, &grctx->get_rows_div_ne10_ne11); + const uint32_t rem = row_idx - i12 * ne11 * ne10; + const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10); + const uint32_t i10 = rem - i11 * ne10; + + const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12; + uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr; + + if (i01 >= ne01) { + continue; + } + + const uint32_t offset = chunk_idx * chunk_size; + if (offset < ne00) { + const uint32_t copy_size = MIN(chunk_size, ne00 - offset); + const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03 + offset * sizeof(float); + const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3 + offset * sizeof(float); + hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, copy_size); + } + } + + qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt); + FARF(HIGH, "get-rows-f32-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt); +} + +int op_get_rows(struct htp_ops_context * octx) { + get_rows_preamble; if (octx->src[0]->type != HTP_TYPE_F32) { return HTP_STATUS_NO_SUPPORT; @@ -112,13 +167,52 @@ int op_get_rows(struct htp_ops_context * octx) { return HTP_STATUS_OK; } + const uint32_t nb00 = octx->src[0]->nb[0]; + const uint32_t nb0 = octx->dst->nb[0]; + + const bool can_use_dma = (nb00 == sizeof(float)) && (nb0 == sizeof(float)); + const bool use_dma = can_use_dma && (ne00 >= 2048); + struct get_rows_context grctx; grctx.octx = octx; grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src[1]->ne[0]); grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src[1]->ne[0] * octx->src[1]->ne[1]); - grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads; + if (use_dma) { + grctx.chunks_per_row = 1; + grctx.chunk_size = ne00; + grctx.total_tasks = nr; + grctx.get_rows_div_chunks_per_row = init_fastdiv_values(1); + + const uint32_t n_threads = MIN(nr, octx->n_threads); + grctx.tasks_per_thread = (nr + n_threads - 1) / n_threads; + + worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_dma, &grctx, n_threads); + } else { + uint32_t chunks_per_row = 1; + uint32_t chunk_size = ne00; + uint32_t total_tasks = nr; + + if (nr < octx->n_threads) { + const uint32_t min_chunk_size = 1024; + uint32_t max_chunks = ne00 / min_chunk_size; + if (max_chunks == 0) { + max_chunks = 1; + } + chunks_per_row = MIN((octx->n_threads + nr - 1) / nr, max_chunks); + chunk_size = (ne00 + chunks_per_row - 1) / chunks_per_row; + total_tasks = nr * chunks_per_row; + } + + grctx.chunks_per_row = chunks_per_row; + grctx.chunk_size = chunk_size; + grctx.total_tasks = total_tasks; + grctx.get_rows_div_chunks_per_row = init_fastdiv_values(chunks_per_row); - worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_threads); + const uint32_t n_threads = MIN(total_tasks, octx->n_threads); + grctx.tasks_per_thread = (total_tasks + n_threads - 1) / n_threads; + + worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_hvx, &grctx, n_threads); + } return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c index 4a4ff0b331d..f132c08500d 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c @@ -50,8 +50,8 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS); const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK] const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong - const size_t k_dma_size = hex_align_up(Bc * DK * sizeof(__fp16), 4096); // K DMA: [Bc, DK] x2 double-buf - const size_t v_dma_size = hex_align_up(Bc * DV * sizeof(__fp16), 4096); // V DMA: [Bc, DV] x2 double-buf + const size_t k_dma_size = hex_align_up(Bc * hex_round_up(DK * sizeof(__fp16), 128), 4096); // K DMA: [Bc, DK] x2 double-buf + const size_t v_dma_size = hex_align_up(Bc * hex_round_up(DV * sizeof(__fp16), 128), 4096); // V DMA: [Bc, DV] x2 double-buf const size_t k_tile_size = hex_align_up(Bc * DK * sizeof(__fp16), 4096); // K tiles: [Bc, DK] interleaved const size_t v_tile_size = hex_align_up(Bc * DV * sizeof(__fp16), 4096); // V tiles: [Bc, DV] interleaved const size_t s_tile_size = hex_align_up(g_br * Bc * sizeof(__fp16), 4096); // S/P:[g_br, Bc] @@ -852,9 +852,10 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) { v_s_rowmax1 = hvx_vec_reduce_max_f16(v_s_rowmax1); // Splat m_prev[r], m_prev[r+1] from the per-row accumulator. - // vror brings the target lane to lane 0, then extract + re-splat. - HVX_Vector v_m_prev0 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2))); - HVX_Vector v_m_prev1 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2))); + // vror brings the target lane to lane 0, then vdelta replicates it + // across all lanes โ€” stays in the vector domain (no store/reload). + HVX_Vector v_m_prev0 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2)); + HVX_Vector v_m_prev1 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2)); // HVX max โ€” both operands are splats, so result is splat of m_new. HVX_Vector v_dup_m0 = Q6_Vhf_vmax_VhfVhf(v_m_prev0, v_s_rowmax0); @@ -1247,9 +1248,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { if (DK % 32 != 0 || DV % 32 != 0) { return HTP_STATUS_NO_SUPPORT; } - if (neq1 < 32) { - return HTP_STATUS_NO_SUPPORT; - } // GQA factor const uint32_t n_kv_heads = k->ne[2]; @@ -1277,7 +1275,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { struct hmx_fa_context factx; memset(&factx, 0, sizeof(factx)); factx.octx = octx; - factx.n_threads = octx->ctx->n_threads; + factx.n_threads = n_threads; factx.DK = DK; factx.DV = DV; factx.n_kv = nek1; @@ -1327,10 +1325,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { factx.m1 = powf(2.0f, -(max_bias / 2.0f) / factx.n_head_log2); // ======== VTCM allocation (GQA-aware) ======== + const size_t size_k_row = DK * sizeof(__fp16); + const size_t size_v_row = DV * sizeof(__fp16); + const size_t size_k_row_padded = hex_round_up(size_k_row, 128); + const size_t size_v_row_padded = hex_round_up(size_v_row, 128); + const size_t q_tile_bytes = hex_align_up(g_br * DK * sizeof(__fp16), 4096); const size_t o_tile_bytes = hex_align_up(g_br * DV * sizeof(__fp16), 4096); - const size_t k_dma_bytes = hex_align_up(Bc * DK * sizeof(__fp16), 4096); - const size_t v_dma_bytes = hex_align_up(Bc * DV * sizeof(__fp16), 4096); + const size_t k_dma_bytes = hex_align_up(Bc * size_k_row_padded, 4096); + const size_t v_dma_bytes = hex_align_up(Bc * size_v_row_padded, 4096); const size_t k_tile_bytes = hex_align_up(Bc * DK * sizeof(__fp16), 4096); const size_t v_tile_bytes = hex_align_up(Bc * DV * sizeof(__fp16), 4096); const size_t s_tile_bytes = hex_align_up(g_br * Bc * sizeof(__fp16), 4096); @@ -1400,11 +1403,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { // ======== DMA setup ======== dma_queue * const dma = ctx->dma[0]; - // Padded row sizes for DMA - const size_t size_k_row = nek0 * sizeof(__fp16); - const size_t size_v_row = nev0 * sizeof(__fp16); - const size_t size_k_row_padded = hex_round_up(nek0 * sizeof(__fp16), 128); - const size_t size_v_row_padded = hex_round_up(nev0 * sizeof(__fp16), 128); + // Padded row sizes for DMA (defined in outer scope) const size_t n_row_tiles_g_br = g_br / HMX_FP16_TILE_N_ROWS; const size_t n_tiles_per_bc = Bc / HMX_FP16_TILE_N_COLS; diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c index e05ccfd5fc7..083d125882d 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -16,6 +16,7 @@ #include "ggml-common.h" #include "hex-dma.h" +#include "hex-fastdiv.h" #include "worker-pool.h" #include "hvx-utils.h" @@ -34,6 +35,10 @@ static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { -8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, }; +static const __fp16 q4_1_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { + 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, +}; + // MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value // kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6 static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { @@ -62,6 +67,8 @@ static inline size_t get_x4x2_row_stride(int weight_type, int k) { case HTP_TYPE_Q4_0: case HTP_TYPE_IQ4_NL: return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb + case HTP_TYPE_Q4_1: + return (size_t) nb * (QK_Q4_0x4x2 / 2 + 32); // 160 * nb case HTP_TYPE_Q8_0: return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb case HTP_TYPE_MXFP4: @@ -181,57 +188,143 @@ static int hmx_compute_chunks(size_t vtcm_total, // In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles // of the same 32 packed bytes. static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) { + (void)vlut_cvt; + HVX_Vector vq = hvx_vmemu(packed_32); + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector i8 = Q6_Vb_vsplat_R(8); + HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale)); + + HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); + v_quants = Q6_V_vand_VV(v_quants, mask_h4); + + HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8); + HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_int8)); + HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0); + + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); +} + +// Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using +// full HVX vector width. +// Output: vector_x2 each hold 32 FP16 values in the first 64 bytes. +static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx( + const uint8_t *packed_128, bool upper_nibbles, + const __fp16 *scales_4, const HVX_Vector vlut_cvt) { + (void)vlut_cvt; + HVX_Vector vq = hvx_vmemu(packed_128); + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector i8 = Q6_Vb_vsplat_R(8); + HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); + v_quants = Q6_V_vand_VV(v_quants, mask_h4); + + HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8); + + HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_int8); + HVX_Vector v_lo = Q6_V_lo_W(vp_int16); + HVX_Vector v_hi = Q6_V_hi_W(vp_int16); + + v_lo = Q6_Vhf_equals_Vh(v_lo); + v_hi = Q6_Vhf_equals_Vh(v_hi); + + HVX_Vector vscale = hvx_vmemu(scales_4); + HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale); + HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4)); + + v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01)); + v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); + + HVX_Vector_x2 r = { v_lo, v_hi }; + return r; +} + +static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale_offset, const HVX_Vector vlut_cvt) { + (void)vlut_cvt; + HVX_Vector vq = hvx_vmemu(packed_32); + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector v_dm = hvx_vmemu(scale_offset); + HVX_Vector v_scales = hvx_vec_repl_f16(v_dm); + HVX_Vector v_offsets = hvx_vec_repl_f16(Q6_V_vror_VR(v_dm, 2)); + + HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); + v_quants = Q6_V_vand_VV(v_quants, mask_h4); + + HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_quants)); + HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0); + + return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales), v_offsets)); +} + +static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx( + const uint8_t *packed_128, bool upper_nibbles, + const __fp16 *scales_offsets_4, const HVX_Vector vlut_cvt) { + (void)vlut_cvt; + HVX_Vector vq = hvx_vmemu(packed_128); + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); + v_quants = Q6_V_vand_VV(v_quants, mask_h4); + + HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_quants); + HVX_Vector v_lo = Q6_V_lo_W(vp_int16); + HVX_Vector v_hi = Q6_V_hi_W(vp_int16); + + v_lo = Q6_Vhf_equals_Vh(v_lo); + v_hi = Q6_Vhf_equals_Vh(v_hi); + + HVX_Vector vscale_offset = hvx_vmemu(scales_offsets_4); + HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2); + HVX_Vector vd = Q6_V_lo_W(dm_deal); + HVX_Vector vm = Q6_V_hi_W(dm_deal); + + HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vd); + HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vd, 4)); + + HVX_Vector v_os01 = hvx_vec_repl_2x_f16(vm); + HVX_Vector v_os23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vm, 4)); + + v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01), v_os01)); + v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23), v_os23)); + + HVX_Vector_x2 r = { v_lo, v_hi }; + return r; +} + +// LUT-based dequantizers for non-linear IQ4_NL format. +static inline HVX_Vector dequantize_x4x2_iq4_nl_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) { HVX_Vector vq = hvx_vmemu(packed_32); const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale)); - // q4x4x2 stores two int4 values per byte. Keep only the selected nibble. HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); v_quants = Q6_V_vand_VV(v_quants, mask_h4); - // Shuffle before LUT v_quants = Q6_Vb_vshuff_Vb(v_quants); - // Use standard vlut16 (not _nomatch) to avoid stale-register NaN. - // _nomatch retains the previous destination-register value for colliding - // indices, but the C intrinsic doesn't model the implicit read so the - // compiler may allocate a register containing garbage/NaN. HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); HVX_Vector v_hf = Q6_V_lo_W(vp); return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); } -// Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using -// full HVX vector width. One vmemu + one vlut16 replaces 4 separate calls. -// Output: out[0..3] each hold 32 FP16 values in the first 64 bytes. -static inline void dequantize_x4x2_q4_0_x4groups_hvx( +static inline HVX_Vector_x2 dequantize_x4x2_iq4_nl_x4groups_hvx( const uint8_t *packed_128, bool upper_nibbles, - const __fp16 *scales_4, const HVX_Vector vlut_cvt, - HVX_Vector out[4]) { - // Load all 128 packed bytes (4 contiguous 32-byte groups) + const __fp16 *scales_4, const HVX_Vector vlut_cvt) { HVX_Vector vq = hvx_vmemu(packed_128); const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); v_quants = Q6_V_vand_VV(v_quants, mask_h4); - // Shuffle before LUT v_quants = Q6_Vb_vshuff_Vb(v_quants); - // Full-width vlut16: 128 byte lookups -> 128 fp16 results in a VectorPair HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); - HVX_Vector v_lo = Q6_V_lo_W(vp); // [group0: 32 fp16 | group1: 32 fp16] - HVX_Vector v_hi = Q6_V_hi_W(vp); // [group2: 32 fp16 | group3: 32 fp16] - - // Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b - volatile HVX_Vector vscale = hvx_vmemu(scales_4); + HVX_Vector v_lo = Q6_V_lo_W(vp); + HVX_Vector v_hi = Q6_V_hi_W(vp); + HVX_Vector vscale = hvx_vmemu(scales_4); HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale); HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4)); v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01)); v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); - // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter - out[0] = v_lo; // group0 already in [0:63] - out[1] = v_hi; // group2 already in [0:63] + HVX_Vector_x2 r = { v_lo, v_hi }; + return r; } // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes. @@ -292,12 +385,11 @@ static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t * packed } // Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes). -static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_128, +static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_128, bool upper_nibbles, int sub_blk_base, const HVX_Vector vlut_cvt, - mxfp4_scales_t scales, - HVX_Vector out[4]) { + mxfp4_scales_t scales) { HVX_Vector vq = hvx_vmemu(packed_128); const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq; @@ -318,118 +410,192 @@ static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_12 v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01)); v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); - out[0] = v_lo; - out[1] = Q6_V_vror_VR(v_lo, 64); - out[2] = v_hi; - out[3] = Q6_V_vror_VR(v_hi, 64); + HVX_Vector_x4 r = { v_lo, Q6_V_vror_VR(v_lo, 64), v_hi, Q6_V_vror_VR(v_hi, 64) }; + return r; } +typedef struct { + __fp16 *dst; + const uint8_t *src; + int n_cols; + int k_block; + size_t row_stride; + int weight_type; + int n_tot_tiles; + int n_tiles_per_task; + int n_tasks; + int n_k_tiles; + struct fastdiv_values n_k_tiles_div; +} x4x2_dequantize_state_t; + // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16. // Input: vtcm_src has n_cols rows of x4x2 data, each row_stride bytes. // Output: vtcm_dst in tile-major FP16 layout. -static void dequantize_x4x2_weight_to_fp16_tiles_task( - __fp16 *restrict vtcm_dst, - const uint8_t *restrict vtcm_src, - int n_cols, int k_block, - size_t row_stride, int weight_type, - int start_tile, int end_tile) { - - const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS; - const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL); - const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block; - const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) : - (weight_type == HTP_TYPE_MXFP4) ? hvx_vmem(mxfp4_to_fp16_lut) : - hvx_vmem(q4_0_to_fp16_lut); - - // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions. - // Each int32 element holds a K-row-pair (2 adjacent fp16 values). word[i] at offset i*128 - // maps to K-rows 2i and 2i+1. Column offset (n*4) added per row. - const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); - const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); // 4 bytes = 1 column step - const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); // first 16 words (64 bytes) - - unsigned ct = (unsigned)start_tile / n_k_tiles; // column tile index - unsigned kt = (unsigned)start_tile % n_k_tiles; // K tile index - for (unsigned t = start_tile; t < end_tile; ) { - if (kt >= n_k_tiles) { kt = 0; ct++; } - - // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row --- - if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) { - unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2; - unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4 - bool upper = (sub_blk_base >= 4); - unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes - unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE - + sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales - - __fp16 *tile_bases[4]; - for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; } +#define DEFINE_DEQUANTIZE_Q4_TASK(suffix, lut_name, helper_prefix, dblk_size, scale_step) \ +static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix( \ + const x4x2_dequantize_state_t *state, \ + int start_tile, int end_tile) { \ + \ + const int n_k_tiles = state->n_k_tiles; \ + const int qrow_size = (unsigned)state->k_block / 2; \ + const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; \ + const HVX_Vector vlut_cvt = hvx_vmem(lut_name); \ + \ + const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); \ + const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); \ + const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); \ + \ + unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); \ + unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); \ + \ + for (unsigned t = start_tile; t < (unsigned)end_tile; ) { \ + if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } \ + \ + if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) { \ + unsigned blk_idx = ((kt * 32) / QK_Q4_0x4x2); \ + unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; \ + bool upper = (sub_blk_base >= 4); \ + unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); \ + unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk_base * (scale_step); \ + \ + __fp16 *tile_bases[4]; \ + for (unsigned g = 0; g < 4; g++) { \ + tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS; \ + } \ + \ + HVX_Vector v_off = v_scat_base; \ + unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride; \ + \ + for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { \ + const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride; \ + const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride; \ + \ + HVX_Vector_x2 dv0 = dequantize_x4x2_##helper_prefix##_x4groups_hvx( \ + r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); \ + Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]); \ + Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]); \ + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ + \ + HVX_Vector_x2 dv1 = dequantize_x4x2_##helper_prefix##_x4groups_hvx( \ + r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt); \ + Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]); \ + Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]); \ + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ + } \ + \ + for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); } \ + t += 4; kt += 4; \ + continue; \ + } \ + \ + __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; \ + { \ + unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2; \ + unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; \ + bool upper = (sub_blk >= 4); \ + unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; \ + unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk * (scale_step); \ + \ + HVX_Vector v_off = v_scat_base; \ + unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride; \ + unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1; \ + \ + for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) { \ + const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride; \ + const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride; \ + \ + HVX_Vector v0 = dequantize_x4x2_##helper_prefix##_group_hvx( \ + r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); \ + HVX_Vector v1 = (row1 < (unsigned)state->n_cols) \ + ? dequantize_x4x2_##helper_prefix##_group_hvx( \ + r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt) \ + : Q6_V_vzero(); \ + \ + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); \ + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); \ + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ + } \ + (void) *(volatile HVX_Vector *)(tile_base); \ + } \ + ++t; ++kt; \ + } \ + \ + if (start_tile < end_tile) { \ + (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); \ + } \ +} \ + \ +static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) { \ + x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; \ + for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { \ + int start = task_id * state->n_tiles_per_task; \ + int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); \ + dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end); \ + } \ +} - HVX_Vector v_off = v_scat_base; +DEFINE_DEQUANTIZE_Q4_TASK(q4_0, q4_0_to_fp16_lut, q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16)) +DEFINE_DEQUANTIZE_Q4_TASK(q4_1, q4_1_to_fp16_lut, q4_1, 32, 4) +DEFINE_DEQUANTIZE_Q4_TASK(iq4_nl, iq4_nl_to_fp16_lut, iq4_nl, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16)) - unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride; - unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1; +static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4( + const x4x2_dequantize_state_t *state, + int start_tile, int end_tile) { - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) { - HVX_Vector v0[2]; - const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride; - dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0); - Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]); - Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + const int n_k_tiles = state->n_k_tiles; + const int qrow_size = state->k_block; + const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; + const HVX_Vector vlut_cvt = hvx_vmem(mxfp4_to_fp16_lut); + const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); + const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); + const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); - r0 = vtcm_src + row_offset; row_offset += row_stride; - dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0); - Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]); - Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - } + unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); + unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); - for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); } - t += 4; kt += 4; - continue; - } + for (unsigned t = start_tile; t < (unsigned)end_tile; ) { + if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } - // --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales --- - if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) { + // Batch-4 fast path for MXFP4 + if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) { int blk_idx = (kt * 32) / QK_MXFP4x4x2; - int sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32; // 0 or 4 + int sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32; bool upper = (sub_blk_base >= 4); - int packed_off = blk_idx * (QK_MXFP4x4x2 / 2); // 128 contiguous packed bytes - int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE; // all 8 E8M0 scales + int packed_off = blk_idx * (QK_MXFP4x4x2 / 2); + int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE; __fp16 * tile_bases[4]; for (int g = 0; g < 4; g++) { - tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; + tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS; } HVX_Vector v_off = v_scat_base; for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { int row0 = ct * HMX_FP16_TILE_N_COLS + r; int row1 = row0 + 1; - const uint8_t * r0 = vtcm_src + row0 * row_stride; - const uint8_t * r1 = vtcm_src + row1 * row_stride; + const uint8_t * r0 = state->src + row0 * state->row_stride; + const uint8_t * r1 = state->src + row1 * state->row_stride; - // Batch-convert all 8 E8M0 scales once per row (stays in HVX register) mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off); - HVX_Vector v0[4], v1[4]; - dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8, v0); - if (row1 < n_cols) { + HVX_Vector_x4 dv0, dv1; + dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8); + if (row1 < state->n_cols) { mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off); - dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8, v1); + dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8); } else { - v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero(); + dv1.v[0] = dv1.v[1] = dv1.v[2] = dv1.v[3] = Q6_V_vzero(); } for (int g = 0; g < 4; g++) { - Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); + Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[g]); } v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); for (int g = 0; g < 4; g++) { - Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); + Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[g]); } v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); } @@ -438,41 +604,13 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( (void) *(volatile HVX_Vector *) (tile_bases[g]); } - t += 4; + t += 4; kt += 4; continue; } - // --- Single-tile fallback --- - __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS; - - if (is_q4) { - unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2; - unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; - bool upper = (sub_blk >= 4); - unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; - unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16); - - HVX_Vector v_off = v_scat_base; // reset to column 0 - unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride; - unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1; - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) { - const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride; - const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride; - - HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx( - r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); - HVX_Vector v1 = (row1 < n_cols) - ? dequantize_x4x2_q4_0_group_hvx( - r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt) - : Q6_V_vzero(); - - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - } - (void) *(volatile HVX_Vector *)(tile_base); - } else if (weight_type == HTP_TYPE_MXFP4) { + // Single-tile fallback + __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; + { int blk_idx = (kt * 32) / QK_MXFP4x4x2; int sub_blk = ((kt * 32) % QK_MXFP4x4x2) / 32; bool upper = (sub_blk >= 4); @@ -484,15 +622,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( int row0 = ct * HMX_FP16_TILE_N_COLS + r; int row1 = row0 + 1; - const uint8_t * r0 = vtcm_src + row0 * row_stride; - const uint8_t * r1 = vtcm_src + row1 * row_stride; + const uint8_t * r0 = state->src + row0 * state->row_stride; + const uint8_t * r1 = state->src + row1 * state->row_stride; - // Batch-convert all 8 E8M0 scales once per row (stays in HVX register) mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off); HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8); HVX_Vector v1; - if (row1 < n_cols) { + if (row1 < state->n_cols) { mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off); v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8); } else { @@ -505,23 +642,59 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); } (void) *(volatile HVX_Vector *) (tile_base); - } else { - // Q8_0 + } + ++t; ++kt; + } + + if (start_tile < end_tile) { + (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); + } +} + +static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) { + x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; + for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { + int start = task_id * state->n_tiles_per_task; + int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); + dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end); + } +} + +static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0( + const x4x2_dequantize_state_t *state, + int start_tile, int end_tile) { + + const int n_k_tiles = state->n_k_tiles; + const int qrow_size = state->k_block; + const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; + + const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); + const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); + const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); + + unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); + unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); + + for (unsigned t = start_tile; t < (unsigned)end_tile; ) { + if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } + + __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; + { int blk_idx = (kt * 32) / QK_Q8_0x4x2; int sub_blk = ((kt * 32) % QK_Q8_0x4x2) / 32; int byte_off = blk_idx * QK_Q8_0x4x2 + sub_blk * 32; int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16); - HVX_Vector v_off = v_scat_base; // reset to column 0 + HVX_Vector v_off = v_scat_base; for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { int row0 = ct * HMX_FP16_TILE_N_COLS + r; int row1 = row0 + 1; - const uint8_t *r0 = vtcm_src + row0 * row_stride; - const uint8_t *r1 = vtcm_src + row1 * row_stride; + const uint8_t *r0 = state->src + row0 * state->row_stride; + const uint8_t *r1 = state->src + row1 * state->row_stride; HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off)); - HVX_Vector v1 = (row1 < n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero(); + HVX_Vector v1 = (row1 < state->n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero(); Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); @@ -533,50 +706,31 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( ++t; ++kt; } - // Drain HVX scatter write buffer: a vmem load on the same HW thread retires - // all pending scatter entries to VTCM. Without this, the main thread's HMX - // reads may see stale data because atomic_fetch_sub (release) only orders - // regular stores, not the HVX scatter buffer. if (start_tile < end_tile) { - (void) *(volatile HVX_Vector *)(vtcm_dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); + (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); } } -typedef struct { - __fp16 *dst; - const uint8_t *src; - int n_cols; - int k_block; - size_t row_stride; - int weight_type; - int n_tot_tiles; - int n_tiles_per_task; - int n_tasks; -} x4x2_dequantize_state_t; - -static void dequantize_x4x2_worker_loop(unsigned int n, unsigned int i, void *data) { +static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) { x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; - for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { int start = task_id * state->n_tiles_per_task; int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); - - dequantize_x4x2_weight_to_fp16_tiles_task( - state->dst, state->src, state->n_cols, state->k_block, - state->row_stride, state->weight_type, start, end); + dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end); } } static void dequantize_x4x2_weight_chunk_to_fp16_tiles( struct htp_context *ctx, __fp16 *vtcm_dst, const void *vtcm_src, int n_cols, int k_block, - size_t row_stride, int weight_type) { + size_t row_stride, int weight_type, + int n_k_tiles, struct fastdiv_values n_k_tiles_div, + worker_callback_t dequant_worker_fn) { assert(n_cols % HMX_FP16_TILE_N_COLS == 0); assert(k_block % HMX_FP16_TILE_N_COLS == 0); size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS; - size_t n_k_tiles = k_block / HMX_FP16_TILE_N_COLS; size_t n_tot_tiles = n_col_tiles * n_k_tiles; size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads); @@ -591,12 +745,16 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles( state.k_block = k_block; state.row_stride = row_stride; state.weight_type = weight_type; + state.n_k_tiles = n_k_tiles; + state.n_k_tiles_div = n_k_tiles_div; - worker_pool_run_func(ctx->worker_pool, dequantize_x4x2_worker_loop, &state, ctx->n_threads); + worker_pool_run_func(ctx->worker_pool, dequant_worker_fn, &state, ctx->n_threads); } // --- End x4x2 dequantizers --- +#pragma clang diagnostic ignored "-Wbackend-plugin" // spurios warning for hmx intrinsics + // requires external HMX lock static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales, int n_row_tiles, int n_col_tiles, int n_dot_tiles) { @@ -612,11 +770,13 @@ static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS; const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS; - for (int k = 0; k < n_dot_tiles; ++k) { - Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047); - Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047); - row_tiles += HMX_FP16_TILE_N_ELMS; - col_tiles += HMX_FP16_TILE_N_ELMS; + for (int k = 0, k_block; k < n_dot_tiles; k += k_block) { + k_block = hex_smin(n_dot_tiles - k, 32); + const uint32_t range = 2048u * (uint32_t)k_block - 1; + Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range); + Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range); + row_tiles += k_block * HMX_FP16_TILE_N_ELMS; + col_tiles += k_block * HMX_FP16_TILE_N_ELMS; } __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS; @@ -832,10 +992,6 @@ static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 * worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, ctx->n_threads); } -// - -#define FALLBACK_TO_STANDARD 1 - // C += AB static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile, @@ -861,314 +1017,94 @@ static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, co Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047); } - for (int k = 0; k < n_dot_tiles; ++k) { - Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047); - Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047); - row_tiles += HMX_FP16_TILE_N_ELMS; - col_tiles += HMX_FP16_TILE_N_ELMS; - } - Q6_mxmem_AR_after_hf(accum_tile, 0); - } - } -} - -static __attribute__((noinline)) int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, - float *restrict out, const float *restrict x, const uint8_t *restrict w, - int m, int k, int n, int weight_type) { - // assume k % 32 == 0 && n % 32 == 0 - const size_t row_stride = get_x4x2_row_stride(weight_type, k); - if (row_stride == 0) { - return -1; - } - - const size_t vtcm_budget = ctx->vtcm_size; - - const size_t K_BLOCK_SIZE = 1024; - - // Fallback: if k doesn't need K-blocking, out-stationary has no advantage - const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE; - if (k_iters_check <= 1) { - FARF(HIGH, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k); - return FALLBACK_TO_STANDARD; - } - - // Dynamic M,N search via hmx_compute_chunks - const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE); - const size_t per_m = K_BLOCK_SIZE * sizeof(float) // scratch1: Mร—Kร—4 (act DMA staging F32) - + K_BLOCK_SIZE * sizeof(__fp16); // activation: Mร—Kร—2 (F16 tiles) - const size_t per_n = sub_row_stride_alloc // scratch0: Nร—sub_row(K) (packed quant) - + K_BLOCK_SIZE * sizeof(__fp16); // weight: Nร—Kร—2 (F16 tiles) - const size_t per_mn = sizeof(__fp16); // output: Mร—Nร—2 (out-stationary) - - // Alignment margin: hex_align_up can add up to 2047 bytes per buffer; - // scratch1 (mcร—6144) is naturally 2048-aligned, remaining 4 buffers need margin - const size_t align_margin = 4 * HMX_FP16_TILE_SIZE; - const size_t overhead = HMX_FP16_TILE_SIZE + 256 + align_margin; // eye_tile + scales + alignment - - size_t M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used; - // Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost. - // From profiling: wt_dequant per element โ‰ˆ 1.5ร— activation load per element. - // m_block_cost = n*3: each extra M-block re-dequants all Nร—K weight (expensive). - // n_block_cost = m*2: each extra N-block re-loads all Mร—K activation (cheaper). - const size_t m_block_cost = (size_t) n * 3; - const size_t n_block_cost = (size_t) m * 2; - if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, - hex_align_up(m, HMX_FP16_TILE_N_ROWS), n, - m_block_cost, n_block_cost, &M_BLOCK_SIZE, - &N_BLOCK_SIZE, &vtcm_used) != 0) { - FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget); - return -1; - } - - // Compute precise buffer sizes from searched M,N and fixed K - const size_t weight_size = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); - const size_t act_size = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); - const size_t out_size = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); - const size_t scratch0_sz = hex_align_up(N_BLOCK_SIZE * sub_row_stride_alloc, HMX_FP16_TILE_SIZE); - const size_t scratch1_sz = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(float), HMX_FP16_TILE_SIZE); - - const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256; - if (total_vtcm > vtcm_budget) { - FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm, - vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE); - return -1; - } - - uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; - __fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_size); - __fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_size); - __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, out_size); - uint8_t *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_sz); - uint8_t *vtcm_scratch1 = vtcm_seq_alloc(&vtcm_ptr, scratch1_sz); - __fp16 *vtcm_eye_tile = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, HMX_FP16_TILE_SIZE); - __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); - assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget); - - FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type, - M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget); - - // initialize eye tile (32x32 identity matrix) - { - HVX_Vector v; - v = Q6_V_vzero(); - v = Q6_Vw_vinsert_VwR(v, 0x3c000000); - v = Q6_V_vror_VR(v, VLEN - 4); - v = Q6_Vw_vinsert_VwR(v, 0x00003c00); - for (int i = 0; i < 16; ++i) { - ((HVX_Vector *) vtcm_eye_tile)[i] = v; - v = Q6_V_vror_VR(v, VLEN - 8); - } - } - hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16 - - TIMER_DEFINE(fetch); - TIMER_DEFINE(act_load); - TIMER_DEFINE(wt_dequant); - TIMER_DEFINE(core); - - HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - - for (size_t mr = 0; mr < m; mr += M_BLOCK_SIZE) { - size_t m_blk_sz = hex_smin(m - mr, M_BLOCK_SIZE); - for (size_t nc = 0; nc < n; nc += N_BLOCK_SIZE) { - size_t n_blk_sz = hex_smin(n - nc, N_BLOCK_SIZE); - - const int n_row_tiles = hmx_ceil_div(m_blk_sz, HMX_FP16_TILE_N_ROWS); - const int n_col_tiles = hmx_ceil_div(n_blk_sz, HMX_FP16_TILE_N_COLS); - - for (size_t kk = 0; kk < k; kk += K_BLOCK_SIZE) { - const size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE); - - TIMER_START(fetch); - // fetch activation block into VTCM - { - const float *activation_block = x + mr * k + kk; - - dma_queue_push(ctx->dma[0], - dma_make_ptr(vtcm_scratch1, activation_block), - k_blk_sz * sizeof(float), - k * sizeof(float), - k_blk_sz * sizeof(float), - m_blk_sz); - } - - // fetch weight block into VTCM (x4x2 sub-block: quants + scales) - const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz); - { - const int blk_start = kk / QK_Q4_0x4x2; - const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2; - const int full_qrow = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2); - const int scale_blk_size = (weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE; - uint8_t *dst = vtcm_scratch0; - const uint8_t *src = w + nc * row_stride; - const size_t n_rows = n_blk_sz; - const size_t src_stride = row_stride; - const size_t dst_stride = sub_row_stride; - const size_t quant_off = (weight_type == HTP_TYPE_Q8_0) ? (blk_start * QK_Q8_0x4x2) : (blk_start * (QK_Q4_0x4x2 / 2)); - const size_t quant_width = (weight_type == HTP_TYPE_Q8_0) ? (nb_sub * QK_Q8_0x4x2) : (nb_sub * (QK_Q4_0x4x2 / 2)); - const size_t scale_off = full_qrow + blk_start * scale_blk_size; - const size_t scale_width = nb_sub * scale_blk_size; - - // 2D DMA: quants sub-range - dma_queue_push(ctx->dma[0], dma_make_ptr(dst, src + quant_off), dst_stride, src_stride, quant_width, n_rows); - // 2D DMA: scales sub-range - dma_queue_push(ctx->dma[0], dma_make_ptr(dst + quant_width, src + scale_off), dst_stride, src_stride, scale_width, n_rows); - } - TIMER_STOP(fetch); - - TIMER_START(act_load); - // load activation block - { - dma_queue_pop(ctx->dma[0]); // wait for act DNA - transfer_activation_chunk_threaded(ctx, vtcm_activation, (float *) vtcm_scratch1, m_blk_sz, k_blk_sz, k_blk_sz); - } - TIMER_STOP(act_load); - - TIMER_START(wt_dequant); - // dequantize weight block - { - dma_queue_pop(ctx->dma[0]); - dma_queue_pop(ctx->dma[0]); - // vtcm_scratch0 is used to store the qweight chunk - // worker_pool_run_func already returned, so fetch is done - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, vtcm_scratch0, - n_blk_sz, k_blk_sz, sub_row_stride, weight_type); - } - TIMER_STOP(wt_dequant); - - // core mma - TIMER_START(core); - { - core_mma_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, vtcm_eye_tile, n_row_tiles, - n_col_tiles, k_blk_sz / HMX_FP16_TILE_N_COLS, kk == 0); - } - TIMER_STOP(core); + for (int k = 0, k_block; k < n_dot_tiles; k += k_block) { + k_block = hex_smin(n_dot_tiles - k, 32); + const uint32_t range = 2048u * (uint32_t)k_block - 1; + Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range); + Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range); + row_tiles += k_block * HMX_FP16_TILE_N_ELMS; + col_tiles += k_block * HMX_FP16_TILE_N_ELMS; } - // store output block - { - float *output_block = out + (mr * n + nc); - transfer_output_chunk_threaded(ctx, output_block, vtcm_output, m_blk_sz, n_blk_sz, n); - } + Q6_mxmem_AR_after_hf(accum_tile, 0); } } - - HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - -#if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "fetch: %lld us, act_load: %lld us, wt_dequant: %lld us, core: %lld us", - TIMER_US(fetch), TIMER_US(act_load), TIMER_US(wt_dequant), TIMER_US(core)); -#endif - return 0; } -int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation, +int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation, const uint8_t *restrict permuted_weight, int m, int k, int n, int weight_type) { - if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; } if (k % 32 != 0 || n % 32 != 0) { return -1; } if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) { return -1; } - // for large m, k (e.g. prefill FFN Down), use out-stationary version - if (m >= 128 && k > n && n > 1024) { - int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type); - if (rc != FALLBACK_TO_STANDARD) { - return rc; // 0 success, -1 error - } - FARF(HIGH, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n); - // fall through to standard path - } - size_t row_stride = get_x4x2_row_stride(weight_type, k); if (row_stride == 0) { return -1; } - FARF(HIGH, "hmx_matmul_qk: STANDARD path m=%d k=%d n=%d type=%d", m, k, n, weight_type); + worker_callback_t dequant_worker_fn = NULL; + switch (weight_type) { + case HTP_TYPE_Q4_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_0; break; + case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_x4x2_worker_loop_iq4_nl; break; + case HTP_TYPE_Q4_1: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_1; break; + case HTP_TYPE_MXFP4: dequant_worker_fn = dequantize_x4x2_worker_loop_mxfp4; break; + case HTP_TYPE_Q8_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q8_0; break; + default: + return -1; + } + + const int n_k_tiles = k / HMX_FP16_TILE_N_COLS; + const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles); // --- Dynamic VTCM layout --- - const size_t vtcm_budget = ctx->vtcm_size; - const size_t vec_dot_size = k * sizeof(__fp16); + const size_t vec_dot_size = k * sizeof(__fp16); + const size_t vtcm_budget = ctx->vtcm_size; + size_t vtcm_used = 0; // Pipeline = 4-stage DMAโ†’dequantโ†’HMXโ†’store with HMX worker overlap. - // Only pays off when the chunker yields >=2 n-chunks, so the main loop can - // overlap HMX (C) with HVX (B/D); with a single n-chunk the extra VTCM for - // double-buffered output and the worker-dispatch overhead are pure loss. - // Try pipeline costs first; fall back to sequential if the layout collapses - // to one n-chunk. m >= 128 floor keeps HMX utilization reasonable. - const size_t pipe_per_n = row_stride + 2 * vec_dot_size; // Q + S0 + S1 (dequant bufs) - const size_t pipe_per_mn = 2 * sizeof(__fp16); // O x 2 (output double buffer) - const size_t seq_per_n = vec_dot_size + 2 * row_stride; // W + S0 + S1 (x4x2 DMA bufs) - const size_t seq_per_mn = sizeof(__fp16); // O x 1 - - size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0; - bool use_pipeline = false; - - if (m >= 128) { - size_t mc = 0, nc = 0, used = 0; - if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, pipe_per_n, /*per_m=*/vec_dot_size, pipe_per_mn, - hex_align_up(m, HMX_FP16_TILE_N_ROWS), n, - /*m_block_cost=*/(size_t) n * 3, - /*n_block_cost=*/(size_t) m * 2, &mc, &nc, &used) == 0 && - hmx_ceil_div((size_t) n, nc) >= 2) { - m_chunk_n_rows = mc; - n_chunk_n_cols = nc; - vtcm_used = used; - use_pipeline = true; - } - } + const size_t size_per_n = row_stride + 2 * vec_dot_size; // Q + S0 + S1 (dequant bufs) + const size_t size_per_mn = 2 * sizeof(__fp16); // O x 2 (output double buffer) - if (!use_pipeline) { - if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, seq_per_n, /*per_m=*/vec_dot_size, seq_per_mn, - hex_align_up(m, HMX_FP16_TILE_N_ROWS), n, - /*m_block_cost=*/(size_t) n * 3, - /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { - FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget); - return -1; - } + size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0; + if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, size_per_n, /*per_m=*/vec_dot_size, size_per_mn, + hex_align_up(m, HMX_FP16_TILE_N_ROWS), n, + /*m_block_cost=*/(size_t) n * 3, + /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used)) { + FARF(HIGH, "hmx-mm-q: VTCM too small : m %d k %d n %d budget %zu", m, k, n, vtcm_budget); + return -1; } - // Compute precise buffer sizes per execution path - const size_t weight_area_size = hex_align_up( - n_chunk_n_cols * (use_pipeline ? row_stride : vec_dot_size), HMX_FP16_TILE_SIZE); - const size_t activation_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE); - const size_t output_area_size = hex_align_up( - m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE); + const size_t weight_area_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE); + const size_t act_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE); + const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE); size_t scratch0_size, scratch1_size, scratch2_size; - if (use_pipeline) { - scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); // dequant buf 0 - scratch1_size = scratch0_size; // dequant buf 1 - scratch2_size = output_area_size; // output buf 1 - } else { - scratch0_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE); // x4x2 DMA buf 0 - scratch1_size = scratch0_size; // x4x2 DMA buf 1 - scratch2_size = 0; // unused - } + scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); // dequant buf 0 + scratch1_size = scratch0_size; // dequant buf 1 + scratch2_size = output_area_size; // output buf 1 uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; __fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); - __fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size); + __fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size); __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size); void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_size); void *vtcm_scratch1 = vtcm_seq_alloc(&vtcm_ptr, scratch1_size); void *vtcm_scratch2 = scratch2_size ? vtcm_seq_alloc(&vtcm_ptr, scratch2_size) : NULL; __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); - if ((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) > vtcm_budget) { - FARF(ERROR, "%s: vtcm overflow: used=%zu limit=%zu", __func__, - (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget); + + vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base; + if (vtcm_used > vtcm_budget) { + FARF(ERROR, "hmx-mm-q: VTCM overflow: used %zu budget %zu", vtcm_used, vtcm_budget); return -1; } hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16 - FARF(HIGH, "%s: m=%d k=%d n=%d wtype=%d pipe=%d mc=%zu nc=%zu vtcm=%zu/%zu", - __func__, m, k, n, weight_type, use_pipeline, - m_chunk_n_rows, n_chunk_n_cols, - (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget); + FARF(HIGH, "hmx-mm-q: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu", + m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget); TIMER_DEFINE(activation_load); TIMER_DEFINE(weight_load); @@ -1178,184 +1114,115 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds TIMER_DEFINE(total); TIMER_START(total); - FARF(HIGH, "hmx_matmul_qk: %s mc=%zu nc=%zu vtcm=%zu/%zu", - use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols, - (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget); - - if (!use_pipeline) { - HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { - // transfer activation matrix chunk into VTCM - const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); - const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS); - - TIMER_START(activation_load); - { - const float *activation_chunk = activation + mr * k; - transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k); - } - TIMER_STOP(activation_load); - - void *buf_curr = vtcm_scratch0; - void *buf_next = vtcm_scratch1; - - { - const size_t n_cols_first = hex_smin(n, n_chunk_n_cols); - dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, permuted_weight), row_stride, row_stride, row_stride, n_cols_first); - } - - for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) { - const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols); - const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS); - - TIMER_START(weight_load); - { - dma_queue_pop(ctx->dma[0]); // wait until current weight chunk become ready - - const size_t nc_next = nc + n_chunk_n_cols; - if (nc_next < n) { - const size_t n_cols_next = hex_smin(n - nc_next, n_chunk_n_cols); + // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D) + // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D). - const uint8_t *next_weight_chunk = permuted_weight + nc_next * row_stride; + // A --> B: vtcm_qweight, 1 buffer + // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers + // C --> D: vtcm_output0/vtcm_output1, 2 buffers - dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), row_stride, row_stride, row_stride, n_cols_next); - } + // Async timeline (C overlaps B+D): + // main+HVX: [A0][Act][B0][A1][sub C0][B1โ€–C0][A2][wait,sub C1][D0+B2โ€–C1][wait,sub C2][D1โ€–C2][wait][D2] + // HMX queue: [โ–ˆโ–ˆโ–ˆโ–ˆ C0 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ][โ–ˆโ–ˆโ–ˆโ–ˆ C1 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ][โ–ˆโ–ˆโ–ˆโ–ˆ C2 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ] - // Dequant + vscatter writes directly to [K, N] transposed tiles. - // HMX computes C = A x B, where A=[M,K] activation, B=[K,N] weight. - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, buf_curr, n_cols, k, row_stride, weight_type); + int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols); + hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors - hex_swap_ptr(&buf_curr, &buf_next); - } - TIMER_STOP(weight_load); + for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { + const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); - TIMER_START(hmx_core); - { - core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32); - } - TIMER_STOP(hmx_core); + void *vtcm_qweight = vtcm_weight; + void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 }; + void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 }; - TIMER_START(output_store); - { - float *output = dst + (mr * n + nc); - transfer_output_chunk_threaded(ctx, output, vtcm_output, n_rows, n_cols, n); - } - TIMER_STOP(output_store); - } + // prologue: A0 + const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols); + { + const uint8_t *qweight_chunk_A0 = permuted_weight; + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0); } - HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - } else { - // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D) - // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D). - // A --> B: vtcm_qweight, 1 buffer - // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers - // C --> D: vtcm_output0/vtcm_output1, 2 buffers - - // Async timeline (C overlaps B+D): - // main+HVX: [A0][Act][B0][A1][sub C0][B1โ€–C0][A2][wait,sub C1][D0+B2โ€–C1][wait,sub C2][D1โ€–C2][wait][D2] - // HMX queue: [โ–ˆโ–ˆโ–ˆโ–ˆ C0 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ][โ–ˆโ–ˆโ–ˆโ–ˆ C1 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ][โ–ˆโ–ˆโ–ˆโ–ˆ C2 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ] - - int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols); - hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors - - for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { - const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); - - void *vtcm_qweight = vtcm_weight; - void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 }; - void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 }; + { + const float *activation_chunk = activation + mr * k; + transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k); + } - // prologue: A0 - const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols); - { - // Use 2D DMA (n_cols rows x row_stride) to avoid 16-bit roiwidth overflow. - const uint8_t *qweight_chunk_A0 = permuted_weight; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, row_stride, row_stride, n_cols_A0); + // prologue: B0, A1, submit C0 (async), B1 (overlaps C0) + { + // B0: wait for DMA, dequant weight chunk 0 + dma_queue_pop(ctx->dma[0]); + dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn); + + // A1: issue DMA for weight chunk 1 + const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols); + if (1 < n_chunk_cnt) { + const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride; + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1); } - { - const float *activation_chunk = activation + mr * k; - transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k); - } + // submit C0 (non-blocking โ€” HMX worker executes in parallel) + hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, + (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, + hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), + hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); + hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0])); - // prologue: B0, A1, submit C0 (async), B1 (overlaps C0) - { - // B0: wait for DMA, dequant weight chunk 0 + // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker) + if (1 < n_chunk_cnt) { dma_queue_pop(ctx->dma[0]); - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type); - - // A1: issue DMA for weight chunk 1 - const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols); - if (1 < n_chunk_cnt) { - const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1); - } - - // submit C0 (non-blocking โ€” HMX worker executes in parallel) - hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, - (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, - hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), - hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); - hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0])); - - // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker) - if (1 < n_chunk_cnt) { - dma_queue_pop(ctx->dma[0]); - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type); - } + dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn); } + } - // main loop: wait C_i โ†’ submit C_{i+1} โ†’ D_i + B_{i+2} (parallel with C_{i+1}) - for (int i = 0; i < n_chunk_cnt; ++i) { - const size_t nc = i * n_chunk_n_cols; - const size_t nc_p1 = nc + 1 * n_chunk_n_cols; - const size_t nc_p2 = nc + 2 * n_chunk_n_cols; + // main loop: wait C_i โ†’ submit C_{i+1} โ†’ D_i + B_{i+2} (parallel with C_{i+1}) + for (int i = 0; i < n_chunk_cnt; ++i) { + const size_t nc = i * n_chunk_n_cols; + const size_t nc_p1 = nc + 1 * n_chunk_n_cols; + const size_t nc_p2 = nc + 2 * n_chunk_n_cols; - const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols); - const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols); - const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols); + const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols); + const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols); + const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols); - // issue A_{i+2}: DMA push (non-blocking) - if (i + 2 < n_chunk_cnt) { - const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2); - } + // issue A_{i+2}: DMA push (non-blocking) + if (i + 2 < n_chunk_cnt) { + const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride; + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2); + } - // wait C_i: block until prologue/previous C completes - hmx_queue_pop(ctx->hmx_queue); - - // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below) - // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's - // counterpart โ€” and (i+1)%2 was last used by C_{i-1} which completed - // before C_i was submitted. - if (i + 1 < n_chunk_cnt) { - hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2], - (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], - vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), - hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); - hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2])); - } + // wait C_i: block until prologue/previous C completes + hmx_queue_pop(ctx->hmx_queue); + + // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below) + // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's + // counterpart โ€” and (i+1)%2 was last used by C_{i-1} which completed + // before C_i was submitted. + if (i + 1 < n_chunk_cnt) { + hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2], + (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], + vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), + hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); + hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2])); + } - // D_i: store output (multi-thread HVX, parallel with C_{i+1}) - float *output_chunk = dst + (mr * n + nc); - transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n); + // D_i: store output (multi-thread HVX, parallel with C_{i+1}) + float *output_chunk = dst + (mr * n + nc); + transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n); - // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1}) - if (i + 2 < n_chunk_cnt) { - dma_queue_pop(ctx->dma[0]); - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type); - } + // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1}) + if (i + 2 < n_chunk_cnt) { + dma_queue_pop(ctx->dma[0]); + dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn); } } - - hmx_queue_suspend(ctx->hmx_queue); } + hmx_queue_suspend(ctx->hmx_queue); + TIMER_STOP(total); #if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d pipeline=%d", __func__, TIMER_US(total), m, k, n, use_pipeline); + FARF(HIGH, "hex-mm-q: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n); if (!use_pipeline) { FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us", TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store)); @@ -1370,15 +1237,15 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds // -static inline int hmx_matmul_batch_r2(const hmx_matmul_w16a32_batched_params_t *params) { +static inline int hmx_matmul_batch_r2(const hmx_matmul_f16_f32_batched_params_t *params) { return params->ne02 > 0 ? params->ne12 / params->ne02 : 1; } -static inline int hmx_matmul_batch_r3(const hmx_matmul_w16a32_batched_params_t *params) { +static inline int hmx_matmul_batch_r3(const hmx_matmul_f16_f32_batched_params_t *params) { return params->ne03 > 0 ? params->ne13 / params->ne03 : 1; } -static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params, +static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params, int dst_b2, int dst_b3) { const int r2 = hmx_matmul_batch_r2(params); const int r3 = hmx_matmul_batch_r3(params); @@ -1387,37 +1254,36 @@ static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_w16a32_ (size_t) (dst_b3 / r3) * params->src0_nb3); } -static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params, +static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params, int dst_b2, int dst_b3) { return (const float *) ((const uint8_t *) params->activation + (size_t) dst_b2 * params->src1_nb2 + (size_t) dst_b3 * params->src1_nb3); } -static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_w16a32_batched_params_t *params, +static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params, int dst_b2, int dst_b3) { return (float *) ((uint8_t *) params->dst + (size_t) dst_b2 * params->dst_nb2 + (size_t) dst_b3 * params->dst_nb3); } -static int hmx_mat_mul_permuted_w16a32_batched_legacy(struct htp_context *ctx, - const hmx_matmul_w16a32_batched_params_t *params) { +static int hmx_matmul_f16_f32_batched_legacy(struct htp_context *ctx, + const hmx_matmul_f16_f32_batched_params_t *params) { int ret = 0; for (int b3 = 0; b3 < params->ne13 && ret == 0; ++b3) { for (int b2 = 0; b2 < params->ne12 && ret == 0; ++b2) { - ret = hmx_mat_mul_permuted_w16a32(ctx, - hmx_matmul_dst_batch_ptr(params, b2, b3), - hmx_matmul_activation_batch_ptr(params, b2, b3), - hmx_matmul_weight_batch_ptr(params, b2, b3), - params->m, params->k, params->n, - params->act_stride, params->weight_stride); + ret = hmx_matmul_f16_f32(ctx, hmx_matmul_dst_batch_ptr(params, b2, b3), + hmx_matmul_activation_batch_ptr(params, b2, b3), + hmx_matmul_weight_batch_ptr(params, b2, b3), + params->m, params->k, params->n, + params->act_stride, params->weight_stride); } } return ret; } -int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmul_w16a32_batched_params_t *params) { +int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params) { if (!ctx || !params || !params->dst || !params->activation || !params->permuted_weight) { return -1; } if (!params->m || !params->k || !params->n) { return -1; } if (params->act_stride < params->k || params->weight_stride < params->k || params->dst_stride < params->n) { return -1; } @@ -1435,7 +1301,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu if (group_size <= 1) { FARF(HIGH, "%s: no dim2 GQA reuse (group=%d), using legacy batched loop", __func__, group_size); - return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params); + return hmx_matmul_f16_f32_batched_legacy(ctx, params); } // Grouped path: reuse interleaved weight across all q_heads sharing a @@ -1464,7 +1330,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu /*m_block_cost=*/(size_t) params->n, /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__); - return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params); + return hmx_matmul_f16_f32_batched_legacy(ctx, params); } const size_t act_head_stride = m_chunk_n_rows * (size_t) params->k; // fp16 elements between heads @@ -1486,7 +1352,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) { FARF(HIGH, "%s: grouped layout overflowed VTCM, falling back to legacy batched loop", __func__); - return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params); + return hmx_matmul_f16_f32_batched_legacy(ctx, params); } hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16 @@ -1614,7 +1480,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu // -int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation, +int hmx_matmul_f16_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation, const __fp16 *restrict permuted_weight, int m, int k, int n, int act_stride, int weight_stride) { if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; } diff --git a/ggml/src/ggml-hexagon/htp/hmx-ops.h b/ggml/src/ggml-hexagon/htp/hmx-ops.h index 1c78ffadd1c..f114edb822f 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-ops.h +++ b/ggml/src/ggml-hexagon/htp/hmx-ops.h @@ -33,14 +33,14 @@ typedef struct { size_t src1_nb3; size_t dst_nb2; size_t dst_nb3; -} hmx_matmul_w16a32_batched_params_t; +} hmx_matmul_f16_f32_batched_params_t; // HMX matrix multiplication โ€” tile-permuted FP16 weights, FP32 activation/output // act_stride: activation row stride in elements (= k for contiguous, or // nb[1]/sizeof(float) for permuted tensors like attention Q). // weight_stride: weight row stride in elements (= k for compact weights, or // nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK). -int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, +int hmx_matmul_f16_f32(struct htp_context *ctx, float *restrict dst, const float *activation, const __fp16 *permuted_weight, @@ -48,13 +48,12 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, int act_stride, int weight_stride); -// Batched F16 wrapper over hmx_mat_mul_permuted_w16a32. +// Batched F16 wrapper over hmx_mat_mul_f16_f32. // Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3. -int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, - const hmx_matmul_w16a32_batched_params_t *params); +int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params); -// HMX matrix multiplication โ€” tile-permuted quantised weights (Q4_0/Q8_0/IQ4_NL) -int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, +// HMX matrix multiplication โ€” quantised weights (Q4_0/Q8_0/IQ4_NL/MXFP4) +int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *activation, const uint8_t *permuted_weight, diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 92f02eac6e3..51f9243ce0a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -104,8 +104,11 @@ int op_argsort(struct htp_ops_context * octx); int op_ssm_conv(struct htp_ops_context * octx); int op_cumsum(struct htp_ops_context * octx); int op_fill(struct htp_ops_context * octx); +int op_concat(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); int op_solve_tri(struct htp_ops_context * octx); int op_gated_delta_net(struct htp_ops_context * octx); +int op_tri(struct htp_ops_context * octx); +int op_pad(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 98db864dd42..fa85bf4ca0c 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -20,6 +20,7 @@ enum htp_data_type { HTP_TYPE_F32 = 0, HTP_TYPE_F16 = 1, HTP_TYPE_Q4_0 = 2, + HTP_TYPE_Q4_1 = 3, HTP_TYPE_Q8_0 = 8, HTP_TYPE_IQ4_NL = 20, HTP_TYPE_I32 = 26, @@ -28,6 +29,7 @@ enum htp_data_type { // types used internally for repack, dyn.quant, etc HTP_TYPE_Q4_0x4x2 = 200, + HTP_TYPE_Q4_1x4x2, HTP_TYPE_Q8_0x4x2, HTP_TYPE_MXFP4x4x2, @@ -56,6 +58,7 @@ enum htp_op_code { HTP_OP_MUL_MAT, HTP_OP_MUL_MAT_ID, HTP_OP_RMS_NORM, + HTP_OP_RMS_NORM_MUL, HTP_OP_UNARY_SILU, HTP_OP_UNARY_GELU, HTP_OP_UNARY_SIGMOID, @@ -86,6 +89,10 @@ enum htp_op_code { HTP_OP_SOLVE_TRI, HTP_OP_L2_NORM, HTP_OP_GATED_DELTA_NET, + HTP_OP_TRI, + HTP_OP_PAD, + HTP_OP_NORM, + HTP_OP_CONCAT, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h b/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h new file mode 100644 index 00000000000..c5b9a5d47c1 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-sin-cos.h @@ -0,0 +1,90 @@ +#ifndef HVX_SIN_COS_H +#define HVX_SIN_COS_H + +#include "hvx-base.h" +#include "hvx-floor.h" + +static inline HVX_Vector hvx_vec_cos_f32(HVX_Vector x) { + HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f); + HVX_Vector const_half = hvx_vec_splat_f32(0.5f); + HVX_Vector const_pi = hvx_vec_splat_f32(3.141592653589793f); + HVX_Vector const_one = hvx_vec_splat_f32(1.0f); + HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f); + + // n = floor(x * (1/pi) + 0.5) + HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half)); + + // y = x - n * pi + HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi)); + + // Sign determination: if n is odd, sign is -1.0f, else 1.0f + // half_n = n * 0.5f + HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half); + // floor_half_n = floor(half_n) + HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n); + // is_odd = half_n > floor_half_n + HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n); + // sign = vmux(is_odd, -1.0f, 1.0f) + HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one); + + // z = y^2 + HVX_Vector z = hvx_vec_mul_f32_f32(y, y); + + // Chebyshev approximation for cos(y) + HVX_Vector c4 = hvx_vec_splat_f32(2.3557242013849433e-05f); + HVX_Vector c3 = hvx_vec_splat_f32(-0.0013871428263450528f); + HVX_Vector c2 = hvx_vec_splat_f32(0.041665895266688284f); + HVX_Vector c1 = hvx_vec_splat_f32(-0.4999999360426369f); + HVX_Vector c0 = hvx_vec_splat_f32(0.9999999999071725f); + + HVX_Vector cos_y = hvx_vec_add_f32_f32(c3, hvx_vec_mul_f32_f32(z, c4)); + cos_y = hvx_vec_add_f32_f32(c2, hvx_vec_mul_f32_f32(z, cos_y)); + cos_y = hvx_vec_add_f32_f32(c1, hvx_vec_mul_f32_f32(z, cos_y)); + cos_y = hvx_vec_add_f32_f32(c0, hvx_vec_mul_f32_f32(z, cos_y)); + + return hvx_vec_mul_f32_f32(cos_y, sign); +} + +static inline HVX_Vector hvx_vec_sin_f32(HVX_Vector x) { + HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f); + HVX_Vector const_half = hvx_vec_splat_f32(0.5f); + HVX_Vector const_pi = hvx_vec_splat_f32(3.141592653589793f); + HVX_Vector const_one = hvx_vec_splat_f32(1.0f); + HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f); + + // n = floor(x * (1/pi) + 0.5) + HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half)); + + // y = x - n * pi + HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi)); + + // Sign determination: if n is odd, sign is -1.0f, else 1.0f + // half_n = n * 0.5f + HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half); + // floor_half_n = floor(half_n) + HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n); + // is_odd = half_n > floor_half_n + HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n); + // sign = vmux(is_odd, -1.0f, 1.0f) + HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one); + + // z = y^2 + HVX_Vector z = hvx_vec_mul_f32_f32(y, y); + + // Chebyshev approximation for sin(y) + HVX_Vector s4 = hvx_vec_splat_f32(2.642186986152672e-06f); + HVX_Vector s3 = hvx_vec_splat_f32(-0.00019825318964070864f); + HVX_Vector s2 = hvx_vec_splat_f32(0.00833326283319605f); + HVX_Vector s1 = hvx_vec_splat_f32(-0.16666666082087775f); + HVX_Vector s0 = hvx_vec_splat_f32(0.999999999915155f); + + HVX_Vector sin_y = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(z, s4)); + sin_y = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(z, sin_y)); + sin_y = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(z, sin_y)); + sin_y = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(z, sin_y)); + sin_y = hvx_vec_mul_f32_f32(y, sin_y); + + return hvx_vec_mul_f32_f32(sin_y, sign); +} + +#endif /* HVX_SIN_COS_H */ diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index e0452811ec3..0a760cd344c 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -14,6 +14,8 @@ #include "hvx-sqrt.h" #include "hvx-arith.h" #include "hvx-div.h" +#include "hvx-floor.h" +#include "hvx-sin-cos.h" #include "hvx-base.h" #endif /* HVX_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 883a31d6163..623008be4e2 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -87,35 +87,37 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { } } +#if __HVX_ARCH__ >= 75 { - // Power on HMX + // Power on HMX and set HMX clock HAP_power_request_t request; memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_HMX; - request.hmx.power_up = TRUE; - FARF(ALWAYS, "Powering HMX on\n"); - err = HAP_power_set((void *) &ctx, &request); + request.type = HAP_power_set_HMX_v2; + request.hmx_v2.set_power = TRUE; + request.hmx_v2.power_up = TRUE; + request.hmx_v2.set_clock = TRUE; + request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX; + request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX; + request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX; + request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH; + FARF(ALWAYS, "Setting HMX clock\n"); + err = HAP_power_set((void *) ctx, &request); if (err != AEE_SUCCESS) { - FARF(ERROR, "Error powering on HMX."); + FARF(ERROR, "Error setting HMX clock."); return err; } } - -#if __HVX_ARCH__ >= 75 +#else { - // Set HMX clock + // Power on HMX HAP_power_request_t request; memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_HMX_v2; - request.hmx_v2.set_clock = TRUE; - request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX; - request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX; - request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX; - request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH; - FARF(ALWAYS, "Setting HMX clock\n"); - err = HAP_power_set((void *) &ctx, &request); + request.type = HAP_power_set_HMX; + request.hmx.power_up = TRUE; + FARF(ALWAYS, "Powering HMX on\n"); + err = HAP_power_set((void *) ctx, &request); if (err != AEE_SUCCESS) { - FARF(ERROR, "Error setting HMX clock."); + FARF(ERROR, "Error powering on HMX."); return err; } } @@ -418,8 +420,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->n_threads = n_hvx; for (int i = 0; i < ctx->n_threads; i++) { - // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541 - ctx->dma[i] = dma_queue_create(128); + ctx->dma[i] = dma_queue_create(256); // queue depth } // init worker pool @@ -534,7 +535,9 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_ADD_ID: return op_binary(octx); + case HTP_OP_NORM: case HTP_OP_RMS_NORM: + case HTP_OP_RMS_NORM_MUL: case HTP_OP_SCALE: case HTP_OP_SQR: case HTP_OP_SQRT: @@ -595,9 +598,18 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_SOLVE_TRI: return op_solve_tri(octx); + case HTP_OP_PAD: + return op_pad(octx); + + case HTP_OP_CONCAT: + return op_concat(octx); + case HTP_OP_GATED_DELTA_NET: return op_gated_delta_net(octx); + case HTP_OP_TRI: + return op_tri(octx); + case HTP_OP_INVALID: break; @@ -842,6 +854,11 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { for (uint32_t i=0; i < n_ops; i++) { struct profile_data prof; + if (i == (n_ops-1)) { + // wake up the host before starting the last op + dspqueue_write_early_wakeup_noblock(queue, 0, 0); + } + profile_start(ctx->profiler, &prof); proc_op_req(octx, tens, i, &ops[i]); @@ -858,8 +875,6 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { } } - // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0); - struct htp_opbatch_rsp rsp; rsp.id = req.id; rsp.status = HTP_STATUS_OK; diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 2461ae617fa..7036c491bc4 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -40,6 +40,11 @@ struct htp_matmul_context { const void * restrict vx0, const void * restrict vx1, const void * restrict vy0, const void * restrict vy1); + void (*vec_dot_4x1)(const int n, float * restrict s0, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vx2, const void * restrict vx3, + const void * restrict vy0); + // Precomputed values uint32_t src0_nrows_per_thread; uint32_t src1_nrows_per_thread; @@ -155,6 +160,13 @@ static inline size_t q8x4x2_row_size(uint32_t ne) { return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128); } +static inline size_t q8_1x4x2_row_size(uint32_t ne) { + // ensures perfect alignment of quants and full row + const uint32_t qk = QK_Q8_0x4x2; + const uint32_t nb = (ne + qk - 1) / qk; + return hex_round_up(ne + nb * 8 * 2 * sizeof(__fp16), 128); +} + static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) { const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; @@ -223,6 +235,62 @@ static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, u return r; } +static inline HVX_Vector_x8 hvx_vec_load_q4_1x4x8_full(const uint8_t * restrict ptr) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) + HVX_Vector v2_3 = vptr[1]; // ... + HVX_Vector v4_5 = vptr[2]; // ... + HVX_Vector v6_7 = vptr[3]; // ... + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + + HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements + HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ... + HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 + HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F + HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 + HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F + HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4 + + HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; + return r; +} + +static HVX_Vector_x8 hvx_vec_load_q4_1x4x8_partial(const uint8_t * restrict ptr, uint32_t n) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; + + const uint32_t qk = QK_Q4_0x4x2; // 256 + const uint32_t nb = n / qk; + const uint32_t nloe = n % qk; + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + + HVX_Vector_x8 r; + uint32_t i = 0; + + #pragma unroll(2) + for (i=0; i < nb; i++) { + HVX_Vector v = vptr[i]; // 256 elements (128 bytes) + HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements + r.v[i*2+0] = v0; + r.v[i*2+1] = v1; + } + + if (nloe) { + HVX_Vector v = vptr[i]; // 256 elements (128 bytes) + HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements + HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements + HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:... + r.v[i*2+0] = Q6_V_lo_W(v0_1_p); + r.v[i*2+1] = Q6_V_hi_W(v0_1_p); + } + + return r; +} + static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) { const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; @@ -401,82 +469,96 @@ static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8 return hvx_vec_rmpy_x8_partial(x, y, 512); } -static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { +static void vec_dot_q4_1x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx0 % 128 == 0); assert((unsigned long) vy0 % 128 == 0); const uint32_t qk = QK_Q4_0x4x2 * 4; - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes const uint32_t x_qblk_size = qk / 2; // int4 const uint32_t x_qrow_size = n / 2; // int4 (not padded) - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes const uint32_t y_qblk_size = qk; // int8 const uint32_t y_qrow_size = n; // int8 (not padded) const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales/offsets const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales/sums // Row sum (sf) HVX_Vector r0_sum = Q6_V_vzero(); - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). - const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elemements uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); + HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); + + HVX_Vector dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(dm, dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(dm_deal)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); } // Process leftovers if (nloe) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); + HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); + + HVX_Vector dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(dm, dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(dm_deal)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ms = Q6_V_vand_QV(bmask, r0_ms); r0_ia = Q6_V_vand_QV(bmask, r0_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); } r0_sum = hvx_vec_reduce_sum_f32(r0_sum); - hvx_vec_store_u(s0, 4, r0_sum); } -static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, +static void vec_dot_q4_1x4x2_q8x4x2_2x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0) { assert(n % 32 == 0); // min sub-block size @@ -486,11 +568,11 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, const uint32_t qk = QK_Q4_0x4x2 * 4; - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes const uint32_t x_qblk_size = qk / 2; // int4 const uint32_t x_qrow_size = n / 2; // int4 (not padded) - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes const uint32_t y_qblk_size = qk; // int8 const uint32_t y_qrow_size = n; // int8 (not padded) @@ -500,77 +582,306 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales/sums // Row sum (sf) HVX_Vector r0_sum = Q6_V_vzero(); HVX_Vector r1_sum = Q6_V_vzero(); - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). - const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elemements uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); + HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); + + HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); + + HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); + HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); } // Process leftovers if (nloe) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); + HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); + + HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); + + HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); + HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ms = Q6_V_vand_QV(bmask, r0_ms); r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r1_ms = Q6_V_vand_QV(bmask, r1_ms); r0_ia = Q6_V_vand_QV(bmask, r0_ia); r1_ia = Q6_V_vand_QV(bmask, r1_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); } HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); hvx_vec_store_u(s0, 8, rsum); } -static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, +static void vec_dot_q4_1x4x2_q8x4x2_4x1(const int n, float * restrict s0, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vx2, const void * restrict vx3, + const void * restrict vy0) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vx1 % 128 == 0); + assert((unsigned long) vx2 % 128 == 0); + assert((unsigned long) vx3 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales + const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales + const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first + const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales + const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first + const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales/sums + + // Row sum (sf) + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); + HVX_Vector r2_sum = Q6_V_vzero(); + HVX_Vector r3_sum = Q6_V_vzero(); + + const uint32_t nb = n / qk; // num full blocks + const uint32_t nloe = n % qk; // num leftover elements + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r2_q = hvx_vec_load_q4_1x4x8_full(r2_x_q + i * x_qblk_size); + HVX_Vector_x8 r3_q = hvx_vec_load_q4_1x4x8_full(r3_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); + + HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); + HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); + + HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); + + HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); + HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); + + HVX_Vector r2_dm = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); + HVX_VectorPair r2_dm_deal = Q6_W_vdeal_VVR(r2_dm, r2_dm, -2); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r2_dm_deal)); + HVX_Vector r2_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r2_dm_deal)); + + HVX_Vector r3_dm = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); + HVX_VectorPair r3_dm_deal = Q6_W_vdeal_VVR(r3_dm, r3_dm, -2); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r3_dm_deal)); + HVX_Vector r3_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r3_dm_deal)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); + + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); + + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r2_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_m, vy_s))); + + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); + HVX_Vector r3_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_m, vy_s))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); + + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); + + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r2_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_ms); + + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); + HVX_Vector r3_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_ms); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa_total, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa_total, r3_sum)); + } + + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r2_q = hvx_vec_load_q4_1x4x8_partial(r2_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r3_q = hvx_vec_load_q4_1x4x8_partial(r3_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); + + HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); + HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); + + HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); + + HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); + HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); + + HVX_Vector r2_dm = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); + HVX_VectorPair r2_dm_deal = Q6_W_vdeal_VVR(r2_dm, r2_dm, -2); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r2_dm_deal)); + HVX_Vector r2_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r2_dm_deal)); + + HVX_Vector r3_dm = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); + HVX_VectorPair r3_dm_deal = Q6_W_vdeal_VVR(r3_dm, r3_dm, -2); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r3_dm_deal)); + HVX_Vector r3_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r3_dm_deal)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); + + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); + + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r2_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_m, vy_s))); + + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); + HVX_Vector r3_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_m, vy_s))); + + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ms = Q6_V_vand_QV(bmask, r0_ms); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r1_ms = Q6_V_vand_QV(bmask, r1_ms); + r2_dd = Q6_V_vand_QV(bmask, r2_dd); + r2_ms = Q6_V_vand_QV(bmask, r2_ms); + r3_dd = Q6_V_vand_QV(bmask, r3_dd); + r3_ms = Q6_V_vand_QV(bmask, r3_ms); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); + r2_ia = Q6_V_vand_QV(bmask, r2_ia); + r3_ia = Q6_V_vand_QV(bmask, r3_ia); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); + + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); + + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r2_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_ms); + + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); + HVX_Vector r3_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_ms); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa_total, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa_total, r3_sum)); + } + + HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; + HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); + hvx_vec_store_u(s0, 16, rsum); +} + + +static void vec_dot_q4_1x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0, const void * restrict vy1) { assert(n % 32 == 0); @@ -581,11 +892,11 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * const uint32_t qk = QK_Q4_0x4x2 * 4; - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes const uint32_t x_qblk_size = qk / 2; // int4 const uint32_t x_qrow_size = n / 2; // int4 (not padded) - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes const uint32_t y_qblk_size = qk; // int8 const uint32_t y_qrow_size = n; // int8 (not padded) @@ -595,9 +906,9 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales + const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales/sums const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first - const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales + const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales/sums // Row sums (sf) - 4 accumulators for 2ร—2 tile HVX_Vector r0_c0_sum = Q6_V_vzero(); @@ -610,13 +921,13 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * uint32_t i = 0; for (; i < nb; i++) { - // Load src1 columns (reused across both src0 rows) + // Load src1 columns HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); - // Load src0 rows (reused across both src1 columns) - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); + // Load src0 rows + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size); // Compute 4 dot products: r0ร—c0, r0ร—c1, r1ร—c0, r1ร—c1 HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); @@ -625,16 +936,38 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); // Load scales - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector ds0 = *(const HVX_UVector *) (y0_d + i * y_dblk_size); + HVX_VectorPair ds0_deal = Q6_W_vdeal_VVR(ds0, ds0, -2); + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds0_deal)); + HVX_Vector vy0_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds0_deal)); + + HVX_Vector ds1 = *(const HVX_UVector *) (y1_d + i * y_dblk_size); + HVX_VectorPair ds1_deal = Q6_W_vdeal_VVR(ds1, ds1, -2); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds1_deal)); + HVX_Vector vy1_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds1_deal)); + + HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); + + HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); + HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); // Compute combined scales HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); + HVX_Vector r0_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy0_s))); + HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); + HVX_Vector r0_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy1_s))); + HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); + HVX_Vector r1_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy0_s))); + HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); + HVX_Vector r1_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy1_s))); // Apply scales and accumulate HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); @@ -642,40 +975,72 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); + HVX_Vector r0_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_ms); + HVX_Vector r0_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_ms); + HVX_Vector r1_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_ms); + HVX_Vector r1_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_ms); + + r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa_total, r0_c0_sum)); + r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa_total, r0_c1_sum)); + r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa_total, r1_c0_sum)); + r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa_total, r1_c1_sum)); } // Process leftovers if (nloe) { HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe); HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector ds0 = *(const HVX_UVector *) (y0_d + i * y_dblk_size); + HVX_VectorPair ds0_deal = Q6_W_vdeal_VVR(ds0, ds0, -2); + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds0_deal)); + HVX_Vector vy0_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds0_deal)); + + HVX_Vector ds1 = *(const HVX_UVector *) (y1_d + i * y_dblk_size); + HVX_VectorPair ds1_deal = Q6_W_vdeal_VVR(ds1, ds1, -2); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds1_deal)); + HVX_Vector vy1_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds1_deal)); + + HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); + HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); + + HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); + HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); + HVX_Vector r0_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy0_s))); + HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); + HVX_Vector r0_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy1_s))); + HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); + HVX_Vector r1_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy0_s))); + HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); + HVX_Vector r1_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy1_s))); - // Zero out unused scales + // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); + r0_c0_ms = Q6_V_vand_QV(bmask, r0_c0_ms); r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd); + r0_c1_ms = Q6_V_vand_QV(bmask, r0_c1_ms); r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd); + r1_c0_ms = Q6_V_vand_QV(bmask, r1_c0_ms); r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd); + r1_c1_ms = Q6_V_vand_QV(bmask, r1_c1_ms); + r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia); r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia); r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia); @@ -686,10 +1051,15 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); + HVX_Vector r0_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_ms); + HVX_Vector r0_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_ms); + HVX_Vector r1_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_ms); + HVX_Vector r1_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_ms); + + r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa_total, r0_c0_sum)); + r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa_total, r0_c1_sum)); + r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa_total, r1_c0_sum)); + r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa_total, r1_c1_sum)); } // Reduce and store results @@ -700,26 +1070,26 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * hvx_vec_store_u(s1, 8, r0_r1_c1_sum); // row0,col1 row1,col1 } -static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { +static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx0 % 128 == 0); assert((unsigned long) vy0 % 128 == 0); const uint32_t qk = QK_Q4_0x4x2 * 4; - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk; // int8 - const uint32_t x_qrow_size = n; // int8 (not padded) + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales // Row sum (sf) HVX_Vector r0_sum = Q6_V_vzero(); @@ -729,12 +1099,12 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo // Apply scale to acc and accumulate into the row sum (qf32). const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) + const uint32_t nloe = n % qk; // num leftover elemements uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); @@ -751,7 +1121,433 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo // Process leftovers if (nloe) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + + // Zero out unused elements + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + } + + r0_sum = hvx_vec_reduce_sum_f32(r0_sum); + + hvx_vec_store_u(s0, 4, r0_sum); +} + +static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vy0) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vx1 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales + const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + + // Row sum (sf) + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). + + const uint32_t nb = n / qk; // num full blocks + const uint32_t nloe = n % qk; // num leftover elemements + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + } + + // Process leftovers + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + // Zero out unused elements + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + } + + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); + hvx_vec_store_u(s0, 8, rsum); +} + +static void vec_dot_q4x4x2_q8x4x2_4x1(const int n, float * restrict s0, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vx2, const void * restrict vx3, + const void * restrict vy0) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vx1 % 128 == 0); + assert((unsigned long) vx2 % 128 == 0); + assert((unsigned long) vx3 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; + const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; + const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; + const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; + const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; + const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; + const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; + + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); + + // Row sum (sf) + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); + HVX_Vector r2_sum = Q6_V_vzero(); + HVX_Vector r3_sum = Q6_V_vzero(); + + const uint32_t nb = n / qk; // num full blocks + const uint32_t nloe = n % qk; // num leftover elements + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r2_q = hvx_vec_load_q4x4x8_full(r2_x_q + i * x_qblk_size); + HVX_Vector_x8 r3_q = hvx_vec_load_q4x4x8_full(r3_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); + } + + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r2_q = hvx_vec_load_q4x4x8_partial(r2_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r3_q = hvx_vec_load_q4x4x8_partial(r3_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); + + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r2_dd = Q6_V_vand_QV(bmask, r2_dd); + r3_dd = Q6_V_vand_QV(bmask, r3_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); + r2_ia = Q6_V_vand_QV(bmask, r2_ia); + r3_ia = Q6_V_vand_QV(bmask, r3_ia); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); + } + + HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; + HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); + hvx_vec_store_u(s0, 16, rsum); +} + + +static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vy0, const void * restrict vy1) { + assert(n % 32 == 0); + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vx1 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + assert((unsigned long) vy1 % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales + const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales + + const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first + const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales + const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first + const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales + + // Row sums (sf) - 4 accumulators for 2ร—2 tile + HVX_Vector r0_c0_sum = Q6_V_vzero(); + HVX_Vector r0_c1_sum = Q6_V_vzero(); + HVX_Vector r1_c0_sum = Q6_V_vzero(); + HVX_Vector r1_c1_sum = Q6_V_vzero(); + + const uint32_t nb = n / qk; // num full blocks + const uint32_t nloe = n % qk; // num leftover elements + + uint32_t i = 0; + for (; i < nb; i++) { + // Load src1 columns (reused across both src0 rows) + HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); + + // Load src0 rows (reused across both src1 columns) + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); + + // Compute 4 dot products: r0ร—c0, r0ร—c1, r1ร—c0, r1ร—c1 + HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); + HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q)); + HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q)); + HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); + + // Load scales + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + // Compute combined scales + HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); + HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); + HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); + HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); + + // Apply scales and accumulate + HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); + HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); + HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); + HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); + + r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); + r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); + r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); + r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); + } + + // Process leftovers + if (nloe) { + HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); + HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); + HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); + HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); + HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); + + HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); + HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); + HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); + HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); + HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); + + // Zero out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); + r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd); + r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd); + r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd); + r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia); + r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia); + r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia); + r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia); + + HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); + HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); + HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); + HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); + + r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); + r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); + r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); + r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); + } + + // Reduce and store results + HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum); + HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum); + + hvx_vec_store_u(s0, 8, r0_r1_c0_sum); // row0,col0 row1,col0 + hvx_vec_store_u(s1, 8, r0_r1_c1_sum); // row0,col1 row1,col1 +} + +static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk; // int8 + const uint32_t x_qrow_size = n; // int8 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + + // Row sum (sf) + HVX_Vector r0_sum = Q6_V_vzero(); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). + + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + } + + // Process leftovers + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); @@ -804,10 +1600,109 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0, // Row sum (qf32) HVX_Vector r0_sum = Q6_V_vzero(); HVX_Vector r1_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). + + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + } + + // Process leftovers + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + + // Zero out unused elements + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + } + + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); + hvx_vec_store_u(s0, 8, rsum); +} + +static void vec_dot_q8x4x2_q8x4x2_4x1(const int n, float * restrict s0, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vx2, const void * restrict vx3, + const void * restrict vy0) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vx1 % 128 == 0); + assert((unsigned long) vx2 % 128 == 0); + assert((unsigned long) vx3 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk; // int8 + const uint32_t x_qrow_size = n; // int8 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales + const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales + const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first + const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales + const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first + const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + + // Row sum (qf32) + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); + HVX_Vector r2_sum = Q6_V_vzero(); + HVX_Vector r3_sum = Q6_V_vzero(); const uint32_t nb = n / qk; // num full blocks int32_t nloe = n % qk; // num leftover elemements (must be signed) @@ -817,58 +1712,86 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0, HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r2_q = hvx_vec_load_q8x4x8_full(r2_x_q + i * x_qblk_size); + HVX_Vector_x8 r3_q = hvx_vec_load_q8x4x8_full(r3_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); } - // Process leftovers if (nloe) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r2_q = hvx_vec_load_q8x4x8_partial(r2_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r3_q = hvx_vec_load_q8x4x8_partial(r3_x_q + i * x_qblk_size, nloe); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - // Zero out unused elements HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r2_dd = Q6_V_vand_QV(bmask, r2_dd); + r3_dd = Q6_V_vand_QV(bmask, r3_dd); r0_ia = Q6_V_vand_QV(bmask, r0_ia); r1_ia = Q6_V_vand_QV(bmask, r1_ia); + r2_ia = Q6_V_vand_QV(bmask, r2_ia); + r3_ia = Q6_V_vand_QV(bmask, r3_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); } - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); - hvx_vec_store_u(s0, 8, rsum); + HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; + HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); + hvx_vec_store_u(s0, 16, rsum); } + static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0, const void * restrict vy1) { @@ -1163,6 +2086,135 @@ static void vec_dot_iq4nlx4x2_q8x4x2_2x1(const int n, hvx_vec_store_u(s0, 8, rsum); } +static void vec_dot_iq4nlx4x2_q8x4x2_4x1(const int n, + float * restrict s0, + const void * restrict vx0, + const void * restrict vx1, + const void * restrict vx2, + const void * restrict vx3, + const void * restrict vy0) { + assert(n % 32 == 0); + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vx1 % 128 == 0); + assert((unsigned long) vx2 % 128 == 0); + assert((unsigned long) vx3 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + + const uint32_t qk = QK_Q4_0x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_qblk_size = qk / 2; // int4 + const uint32_t x_qrow_size = n / 2; // int4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales + const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales + const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first + const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales + const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first + const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + + HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); + HVX_Vector r2_sum = Q6_V_vzero(); + HVX_Vector r3_sum = Q6_V_vzero(); + + const uint32_t nb = n / qk; + const uint32_t nloe = n % qk; + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r2_q = hvx_vec_load_iq4nlx4x8_full(r2_x_q + i * x_qblk_size); + HVX_Vector_x8 r3_q = hvx_vec_load_iq4nlx4x8_full(r3_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); + } + + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r2_q = hvx_vec_load_iq4nlx4x8_partial(r2_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r3_q = hvx_vec_load_iq4nlx4x8_partial(r3_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); + + HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); + HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); + HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); + HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); + + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r2_dd = Q6_V_vand_QV(bmask, r2_dd); + r3_dd = Q6_V_vand_QV(bmask, r3_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); + r2_ia = Q6_V_vand_QV(bmask, r2_ia); + r3_ia = Q6_V_vand_QV(bmask, r3_ia); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); + } + + HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; + HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); + hvx_vec_store_u(s0, 16, rsum); +} + + static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, @@ -1286,33 +2338,144 @@ static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n, hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); } -static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { +static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { + assert(n % 32 == 0); // min sub-block size + assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vy0 % 128 == 0); + + const uint32_t qk = QK_MXFP4x4x2 * 4; + + const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 + const uint32_t x_qblk_size = qk / 2; // fp4 + const uint32_t x_qrow_size = n / 2; // fp4 (not padded) + + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) + + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales + + const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + + // Row sum (sf) + HVX_Vector r0_sum = Q6_V_vzero(); + + // Multiply and accumulate into int32. + // Compute combined scale (fp32). + // Apply scale to acc and accumulate into the row sum (qf32). + + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + uint32_t i = 0; + for (; i < nb; i++) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + + // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving + HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); + + // Convert rX_d scales from e8m0 to fp32 + // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... + // Left shift with zero fill to create FP32 + // FIXME: might need to handle zero as a special case (see ggml-cpu code) + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + } + + // Process leftovers + if (nloe) { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); + HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + + // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving + HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); + + // Convert rX_d scales from e8m0 to fp32 + // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... + // Left shift with zero fill to create FP32 + // FIXME: might need to handle zero as a special case (see ggml-cpu code) + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + + // Zero-out unused scales + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); + r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + } + + r0_sum = hvx_vec_reduce_sum_f32(r0_sum); + + hvx_vec_store_u(s0, 4, r0_sum); +} + +static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vy0) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx0 % 128 == 0); + assert((unsigned long) vx1 % 128 == 0); assert((unsigned long) vy0 % 128 == 0); const uint32_t qk = QK_MXFP4x4x2 * 4; - const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 - const uint32_t x_qblk_size = qk / 2; // fp4 - const uint32_t x_qrow_size = n / 2; // fp4 (not padded) + const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 + const uint32_t x_qblk_size = qk / 2; // fp4 + const uint32_t x_qrow_size = n / 2; // fp4 (not padded) - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) + const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_qblk_size = qk; // int8 + const uint32_t y_qrow_size = n; // int8 (not padded) - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales + const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first + const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales + const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first + const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales + const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0; // quants first + const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales // Row sum (sf) HVX_Vector r0_sum = Q6_V_vzero(); + HVX_Vector r1_sum = Q6_V_vzero(); // Multiply and accumulate into int32. // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). + // Apply scale to acc and accumulate into the row sum (f32). const uint32_t nb = n / qk; // num full blocks int32_t nloe = n % qk; // num leftover elemements (must be signed) @@ -1321,11 +2484,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 @@ -1341,23 +2507,32 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const r0_d = Q6_V_vdelta_VV(r0_d, expand); r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + r1_d = Q6_V_vdelta_VV(r1_d, expand); + r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); + r1_d = Q6_Vw_vasl_VwR(r1_d, 23); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } // Process leftovers if (nloe) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); + HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 @@ -1373,30 +2548,40 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const r0_d = Q6_V_vdelta_VV(r0_d, expand); r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + r1_d = Q6_V_vdelta_VV(r1_d, expand); + r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); + r1_d = Q6_Vw_vasl_VwR(r1_d, 23); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); - // Zero-out unused scales + // Zero-out unused values HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r1_dd = Q6_V_vand_QV(bmask, r1_dd); r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } - r0_sum = hvx_vec_reduce_sum_f32(r0_sum); - - hvx_vec_store_u(s0, 4, r0_sum); + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); + hvx_vec_store_u(s0, 8, rsum); } -static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, +static void vec_dot_mxfp4x4x2_q8x4x2_4x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vx1, + const void * restrict vx2, const void * restrict vx3, const void * restrict vy0) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx0 % 128 == 0); assert((unsigned long) vx1 % 128 == 0); + assert((unsigned long) vx2 % 128 == 0); + assert((unsigned long) vx3 % 128 == 0); assert((unsigned long) vy0 % 128 == 0); const uint32_t qk = QK_MXFP4x4x2 * 4; @@ -1413,17 +2598,19 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales + const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first + const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales + const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first + const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales + const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales // Row sum (sf) HVX_Vector r0_sum = Q6_V_vzero(); HVX_Vector r1_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (f32). + HVX_Vector r2_sum = Q6_V_vzero(); + HVX_Vector r3_sum = Q6_V_vzero(); const uint32_t nb = n / qk; // num full blocks int32_t nloe = n % qk; // num leftover elemements (must be signed) @@ -1433,13 +2620,19 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size); + HVX_Vector_x8 r2_q = hvx_vec_load_mxfp4x4x8_full(r2_x_q + i * x_qblk_size); + HVX_Vector_x8 r3_q = hvx_vec_load_mxfp4x4x8_full(r3_x_q + i * x_qblk_size); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_Vector r2_d = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); + HVX_Vector r3_d = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 @@ -1447,9 +2640,6 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, vy_d = Q6_Vsf_equals_Vqf32(vy_d); // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); r0_d = Q6_V_vdelta_VV(r0_d, expand); @@ -1458,29 +2648,46 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, r1_d = Q6_V_vdelta_VV(r1_d, expand); r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); r1_d = Q6_Vw_vasl_VwR(r1_d, 23); + r2_d = Q6_V_vdelta_VV(r2_d, expand); + r2_d = Q6_V_vand_VV(r2_d, e8m0_mask); + r2_d = Q6_Vw_vasl_VwR(r2_d, 23); + r3_d = Q6_V_vdelta_VV(r3_d, expand); + r3_d = Q6_V_vand_VV(r3_d, e8m0_mask); + r3_d = Q6_Vw_vasl_VwR(r3_d, 23); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r2_d, vy_d)); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r3_d, vy_d)); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); } - // Process leftovers if (nloe) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r2_q = hvx_vec_load_mxfp4x4x8_partial(r2_x_q + i * x_qblk_size, nloe); + HVX_Vector_x8 r3_q = hvx_vec_load_mxfp4x4x8_partial(r3_x_q + i * x_qblk_size, nloe); HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); + HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + HVX_Vector r2_d = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); + HVX_Vector r3_d = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 @@ -1488,9 +2695,6 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, vy_d = Q6_Vsf_equals_Vqf32(vy_d); // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); r0_d = Q6_V_vdelta_VV(r0_d, expand); @@ -1499,28 +2703,46 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, r1_d = Q6_V_vdelta_VV(r1_d, expand); r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); r1_d = Q6_Vw_vasl_VwR(r1_d, 23); + r2_d = Q6_V_vdelta_VV(r2_d, expand); + r2_d = Q6_V_vand_VV(r2_d, e8m0_mask); + r2_d = Q6_Vw_vasl_VwR(r2_d, 23); + r3_d = Q6_V_vdelta_VV(r3_d, expand); + r3_d = Q6_V_vand_VV(r3_d, e8m0_mask); + r3_d = Q6_Vw_vasl_VwR(r3_d, 23); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); + HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r2_d, vy_d)); + HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r3_d, vy_d)); // Zero-out unused values HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r2_dd = Q6_V_vand_QV(bmask, r2_dd); + r3_dd = Q6_V_vand_QV(bmask, r3_dd); r0_ia = Q6_V_vand_QV(bmask, r0_ia); r1_ia = Q6_V_vand_QV(bmask, r1_ia); + r2_ia = Q6_V_vand_QV(bmask, r2_ia); + r3_ia = Q6_V_vand_QV(bmask, r3_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); + HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); + r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); + r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); } - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); - hvx_vec_store_u(s0, 8, rsum); + HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; + HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); + hvx_vec_store_u(s0, 16, rsum); } + static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0, const void * restrict vy1) { @@ -2138,7 +3360,6 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); - const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); // no work for this thread if (src0_start_row >= src0_end_row) { @@ -2168,39 +3389,89 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { const uint8_t * restrict src1_col = (const uint8_t *) src1_data; float * restrict dst_col = (float *) dst->data; - // Prefill spad with 2x src0 rows - #pragma unroll(2) - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const uint32_t is0 = (ir0 - src0_start_row); - if (is0 >= MM_SPAD_SRC0_NROWS) { - break; + if (mmctx->vec_dot_4x1 != NULL) { + const uint32_t src0_end_row_x4 = src0_start_row + ((src0_end_row - src0_start_row) & ~3U); + + // Prefill spad with 4x src0 rows + #pragma unroll(4) + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) { + const uint32_t is0 = (ir0 - src0_start_row); + if (is0 >= MM_SPAD_SRC0_NROWS) { + break; + } + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, 4); } - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 2); - } - // Process src0 rows - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); + // Process src0 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) { + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + mmctx->vec_dot_4x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, ss0 + 2 * src0_stride, ss0 + 3 * src0_stride, src1_col); - // Prefetch next (n + spad_nrows) row - const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS); - const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - if (pr0 < src0_end_row_x2) { - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size), + // Prefetch next (n + spad_nrows) row + const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS); + const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; + if (pr0 < src0_end_row_x4) { + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size), + src0_stride, src0_row_size, 4); + } + } + + // Process leftovers + uint32_t ir0 = src0_end_row_x4; + if (ir0 + 2 <= src0_end_row) { + const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), src0_stride, src0_row_size, 2); + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); + ir0 += 2; } - } + if (ir0 < src0_end_row) { + const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, 1); + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); + ir0 += 1; + } + } else { + const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); - // Process the last row (if any) - if (src0_end_row != src0_end_row_x2) { - const uint32_t ir0 = src0_end_row_x2; - const uint32_t is0 = (ir0 - src0_start_row); - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 1); - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); + // Prefill spad with 2x src0 rows + #pragma unroll(2) + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint32_t is0 = (ir0 - src0_start_row); + if (is0 >= MM_SPAD_SRC0_NROWS) { + break; + } + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, 2); + } + + // Process src0 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); + + // Prefetch next (n + spad_nrows) row + const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS); + const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; + if (pr0 < src0_end_row_x2) { + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size), + src0_stride, src0_row_size, 2); + } + } + + // Process the last row (if any) + if (src0_end_row != src0_end_row_x2) { + const uint32_t ir0 = src0_end_row_x2; + const uint32_t is0 = (ir0 - src0_start_row); + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, 1); + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); + } } hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); @@ -2432,6 +3703,94 @@ static void matvec_id(unsigned int nth, unsigned int ith, void * data) { // *** dynamic quant +static inline void quantize_block_f32_q8_1x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { + assert((unsigned long) x % 128 == 0); + assert((unsigned long) y_q % 128 == 0); + + HVX_Vector * vx = (HVX_Vector *) x; + HVX_Vector zero = Q6_V_vzero(); + + // Use reduce max fp32 to find max(abs(e)) first + HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); + HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); + HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); + HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); + + // Load and convert into QF32 + HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements + HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements + HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements + HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements + + // Convert to QF32 + HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); + HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); + HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); + HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); + + // Combine and convert to fp16 + HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf))); + HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf))); + + // Convert into fp16 + HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); + HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + + HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); + HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16); + + // Divide input by the scale + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); + vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); + vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); + + // Convert to int8 + HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); + HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); + HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + + *(HVX_Vector *) y_q = vx_i8; + + // --- Sum calculation --- + const HVX_Vector ones = Q6_Vb_vsplat_R(1); + HVX_Vector v_sums = Q6_Vw_vrmpy_VbVb(vx_i8, ones); // sum every 4 consecutive elements + // Sum 8 elements: + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 4)); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 8)); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 16)); + + // Copy to stack to extract sums and vmaxes + float vmax0[32] __attribute__((aligned(128))); + float vmax1[32] __attribute__((aligned(128))); + float vmax2[32] __attribute__((aligned(128))); + float vmax3[32] __attribute__((aligned(128))); + int32_t sums[32] __attribute__((aligned(128))); + + hvx_vec_store_u(vmax0, 128, vmax0_sf); + hvx_vec_store_u(vmax1, 128, vmax1_sf); + hvx_vec_store_u(vmax2, 128, vmax2_sf); + hvx_vec_store_u(vmax3, 128, vmax3_sf); + hvx_vec_store_u(sums, 128, v_sums); + + float d0 = vmax0[0] / 127.0f; + float d1 = vmax1[0] / 127.0f; + float d2 = vmax2[0] / 127.0f; + float d3 = vmax3[0] / 127.0f; + + __fp16 * y_d_half = (__fp16 *) y_d; + y_d_half[0] = d0; + y_d_half[1] = (float) sums[0] * d0; + y_d_half[2] = d1; + y_d_half[3] = (float) sums[8] * d1; + y_d_half[4] = d2; + y_d_half[5] = (float) sums[16] * d2; + y_d_half[6] = d3; + y_d_half[7] = (float) sums[24] * d3; +} + static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { assert((unsigned long) x % 128 == 0); assert((unsigned long) y_q % 128 == 0); @@ -2656,6 +4015,77 @@ static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } +static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { + assert(k % 32 == 0); + const uint32_t qk = QK_Q8_0x4x2; + const uint32_t nb = (k + qk - 1) / qk; + + const uint32_t qrow_size = k; // int8 + + const uint32_t dblk_size = 8 * 4; // 8x (d, s) __fp16 = 32 bytes + const uint32_t qblk_size = QK_Q8_0x4x2; // int8 + + uint8_t * restrict y_q = (y + 0); // quants first + uint8_t * restrict y_d = (y + qrow_size); // then scales/sums + + // Temp scales override input since we're working off of the aligned temp buffer in VTCM + uint8_t * restrict t_d = (uint8_t *) x; + + for (uint32_t i = 0; i < nb; i++) { + quantize_block_f32_q8_1x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); + quantize_block_f32_q8_1x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + } + + // now copy the scales/sums into final location + hvx_copy_f16_ua(y_d, t_d, nb * 16); +} + +static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * data) { + struct htp_matmul_context * mmctx = data; + struct htp_ops_context * octx = mmctx->octx; + + const struct htp_tensor * src = octx->src[1]; + uint8_t * restrict dst = octx->src1_spad.data; + struct htp_spad * spad = &octx->src0_spad; + uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; + + uint64_t t1 = HAP_perf_get_qtimer_count(); + + const uint32_t ne0 = src->ne[0]; + const uint32_t ne1 = src->ne[1]; + const uint32_t ne2 = src->ne[2]; + const uint32_t ne3 = src->ne[3]; + + const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows + + const uint32_t ir_first = nrows_per_thread * ith; // first row + const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row + + const size_t src_row_size = src->nb[1]; + const size_t dst_row_size = q8_1x4x2_row_size(ne0); + + uint8_t * restrict src_data = (uint8_t *) src->data + (src_row_size * ir_first); + uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first); + uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith); + + const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); + memset(tmp_data, 0, src_row_size_padded); // zero-out temp row data for padding + + for (uint32_t i = ir_first; i < ir_last; ++i) { + hex_l2fetch(src_data, src_row_size, src_row_size, 2); + hvx_copy_f32_aa(tmp_data, src_data, ne0); + + quantize_row_f32_q8_1x4x2((float *) tmp_data, dst_data, ne0); + dst_data += dst_row_size; + src_data += src_row_size; + } + + uint64_t t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "quantize-f32-q8_1x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, + ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) { struct htp_matmul_context * mmctx = data; struct htp_ops_context * octx = mmctx->octx; @@ -2751,24 +4181,35 @@ static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_t mmctx->vec_dot_1x1 = vec_dot_q4x4x2_q8x4x2_1x1; mmctx->vec_dot_2x1 = vec_dot_q4x4x2_q8x4x2_2x1; mmctx->vec_dot_2x2 = vec_dot_q4x4x2_q8x4x2_2x2; + mmctx->vec_dot_4x1 = vec_dot_q4x4x2_q8x4x2_4x1; + return 0; + case HTP_TYPE_Q4_1: + mmctx->type = "q4_1x4x2-f32"; + mmctx->vec_dot_1x1 = vec_dot_q4_1x4x2_q8x4x2_1x1; + mmctx->vec_dot_2x1 = vec_dot_q4_1x4x2_q8x4x2_2x1; + mmctx->vec_dot_2x2 = vec_dot_q4_1x4x2_q8x4x2_2x2; + mmctx->vec_dot_4x1 = vec_dot_q4_1x4x2_q8x4x2_4x1; return 0; case HTP_TYPE_Q8_0: mmctx->type = "q8x4x2-f32"; mmctx->vec_dot_1x1 = vec_dot_q8x4x2_q8x4x2_1x1; mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1; mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2; + mmctx->vec_dot_4x1 = vec_dot_q8x4x2_q8x4x2_4x1; return 0; case HTP_TYPE_IQ4_NL: mmctx->type = "iq4nlx4x2-f32"; mmctx->vec_dot_1x1 = vec_dot_iq4nlx4x2_q8x4x2_1x1; mmctx->vec_dot_2x1 = vec_dot_iq4nlx4x2_q8x4x2_2x1; mmctx->vec_dot_2x2 = vec_dot_iq4nlx4x2_q8x4x2_2x2; + mmctx->vec_dot_4x1 = vec_dot_iq4nlx4x2_q8x4x2_4x1; return 0; case HTP_TYPE_MXFP4: mmctx->type = "mxfp4x4x2-f32"; mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1; mmctx->vec_dot_2x1 = vec_dot_mxfp4x4x2_q8x4x2_2x1; mmctx->vec_dot_2x2 = vec_dot_mxfp4x4x2_q8x4x2_2x2; + mmctx->vec_dot_4x1 = vec_dot_mxfp4x4x2_q8x4x2_4x1; return 0; default: return -1; @@ -2894,8 +4335,13 @@ static int op_matmul_hvx(struct htp_ops_context * octx) { return HTP_STATUS_NO_SUPPORT; } - quant_job_func = quantize_f32_q8x4x2; - src1_row_size = q8x4x2_row_size(ne10); + if (src0->type == HTP_TYPE_Q4_1) { + quant_job_func = quantize_f32_q8_1x4x2; + src1_row_size = q8_1x4x2_row_size(ne10); + } else { + quant_job_func = quantize_f32_q8x4x2; + src1_row_size = q8x4x2_row_size(ne10); + } htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, 0); } @@ -2962,7 +4408,7 @@ int op_matmul(struct htp_ops_context * octx) { // HMX supports F16, Q4_0, Q8_0, IQ4_NL, MXFP4 weights. // Other types fall back to HVX. uint32_t wtype = src0->type; - if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) { + if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q4_1 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) { return op_matmul_hvx(octx); } @@ -2995,7 +4441,6 @@ int op_matmul(struct htp_ops_context * octx) { // is handled by HMX itself; when M < 32 fall back to HVX. const int m_total = (int) src1->ne[1]; const int m_hmx = m_total & ~31; // 0 when M < 32 - if (m_hmx == 0) { return op_matmul_hvx(octx); } @@ -3020,7 +4465,7 @@ int op_matmul(struct htp_ops_context * octx) { if (src0->type == HTP_TYPE_F16) { if (is_batched) { - hmx_matmul_w16a32_batched_params_t batch_params = { + hmx_matmul_f16_f32_batched_params_t batch_params = { .dst = (float *) dst->data, .activation = (float *) src1->data, .permuted_weight = (const __fp16 *) src0->data, @@ -3041,15 +4486,14 @@ int op_matmul(struct htp_ops_context * octx) { .dst_nb2 = dst->nb[2], .dst_nb3 = dst->nb[3], }; - ret = hmx_mat_mul_permuted_w16a32_batched(octx->ctx, &batch_params); + ret = hmx_matmul_f16_f32_batched(octx->ctx, &batch_params); } else { - ret = hmx_mat_mul_permuted_w16a32(octx->ctx, + ret = hmx_matmul_f16_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const __fp16 *) src0->data, m_total, k, n, act_stride, wgt_stride); } } else { - ret = hmx_mat_mul_permuted_qk_0_d16a32(octx->ctx, - (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data, + ret = hmx_matmul_q_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data, m_total, k, n, (int) src0->type); } @@ -3100,8 +4544,13 @@ int op_matmul_id(struct htp_ops_context * octx) { return HTP_STATUS_NO_SUPPORT; } - quant_job_func = quantize_f32_q8x4x2; - src1_row_size = q8x4x2_row_size(ne10); + if (src0->type == HTP_TYPE_Q4_1) { + quant_job_func = quantize_f32_q8_1x4x2; + src1_row_size = q8_1x4x2_row_size(ne10); + } else { + quant_job_func = quantize_f32_q8x4x2; + src1_row_size = q8x4x2_row_size(ne10); + } const size_t src2_spad_size_per_thread = hex_round_up(matrix_row_counts_size + matrix_row_map_size, 256); htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, src2_spad_size_per_thread); diff --git a/ggml/src/ggml-hexagon/htp/pad-ops.c b/ggml/src/ggml-hexagon/htp/pad-ops.c new file mode 100644 index 00000000000..7d926338c9c --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/pad-ops.c @@ -0,0 +1,544 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" + +#include +#include + +#include + +#include "hex-dma.h" +#include "hvx-utils.h" + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-ops.h" + +/* Circular wrap: maps any integer x into [0, n) */ +static inline uint32_t wrap_around(int32_t x, uint32_t n) { + return (uint32_t)(((x % (int32_t)n) + (int32_t)n) % (int32_t)n); +} + +/* Decompose a flat dst row index into (i1, i2, i3) */ +static inline void pad_decompose_row(uint32_t ir, uint32_t ne1, uint32_t ne2, + uint32_t *i1, uint32_t *i2, uint32_t *i3) { + *i1 = ir % ne1; + *i2 = (ir / ne1) % ne2; + *i3 = ir / (ne1 * ne2); +} + +/* Return non-zero if row (i1,i2,i3) falls in the non-padded interior */ +static inline int pad_is_interior(uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t rp1, uint32_t ne1, + int32_t lp2, int32_t rp2, uint32_t ne2, + int32_t lp3, int32_t rp3, uint32_t ne3) { + return ((int32_t)i1 >= lp1 && (int32_t)i1 < (int32_t)ne1 - rp1) && + ((int32_t)i2 >= lp2 && (int32_t)i2 < (int32_t)ne2 - rp2) && + ((int32_t)i3 >= lp3 && (int32_t)i3 < (int32_t)ne3 - rp3); +} + +/* Compute the DDR src row pointer for a zero-pad interior row */ +static inline const uint8_t * pad_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + (i1 - (uint32_t)lp1) * src->nb[1] + + (i2 - (uint32_t)lp2) * src->nb[2] + + (i3 - (uint32_t)lp3) * src->nb[3]; +} + +/* Compute the DDR src row pointer for a circular row (wrap-around indexing) */ +static inline const uint8_t * pad_circ_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + wrap_around((int32_t)i1 - lp1, src->ne[1]) * src->nb[1] + + wrap_around((int32_t)i2 - lp2, src->ne[2]) * src->nb[2] + + wrap_around((int32_t)i3 - lp3, src->ne[3]) * src->nb[3]; +} + +struct htp_pad_context { + struct htp_ops_context * octx; + + int32_t lp0, rp0; + int32_t lp1, rp1; + int32_t lp2, rp2; + int32_t lp3, rp3; + + uint32_t nrows_per_thread; + uint32_t total_dst_rows; + + size_t type_size; + + // Row sizes for DMA kernel (populated when VTCM is available) + size_t src_row_size; + size_t src_row_size_aligned; + size_t dst_row_size; + size_t dst_row_size_aligned; +}; + +#define htp_pad_preamble \ + const struct htp_tensor * src = octx->src[0]; \ + const struct htp_tensor * dst = octx->dst; \ + \ + const uint32_t ne00 = src->ne[0]; \ + const uint32_t nb00 = src->nb[0]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; \ + \ + const int32_t lp0 = pctx->lp0, rp0 = pctx->rp0; \ + const int32_t lp1 = pctx->lp1, rp1 = pctx->rp1; \ + const int32_t lp2 = pctx->lp2, rp2 = pctx->rp2; \ + const int32_t lp3 = pctx->lp3, rp3 = pctx->rp3; \ + \ + const size_t type_size = pctx->type_size; \ + \ + const uint32_t row_start = pctx->nrows_per_thread * ith; \ + const uint32_t row_end = MIN(row_start + pctx->nrows_per_thread, pctx->total_dst_rows); + + +#define htp_pad_dma_preamble \ + const size_t src_row_size = pctx->src_row_size; \ + const size_t src_row_size_aligned = pctx->src_row_size_aligned; \ + const size_t dst_row_size = pctx->dst_row_size; \ + const size_t dst_row_size_aligned = pctx->dst_row_size_aligned; \ + \ + uint8_t * src_spad_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; \ + uint8_t * dst_spad_base = octx->dst_spad.data + ith * octx->dst_spad.size_per_thread; \ + \ + dma_queue * dma = octx->ctx->dma[ith]; + +// --------------------------------------------------------------------------- +// HVX vectorized PAD kernel +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_u(dst_ptr, 0.0f, ne0); + } else { + const uint8_t * src_ptr = pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (lp0 > 0) { + hvx_splat_f32_u(dst_ptr, 0.0f, (uint32_t)lp0); + } + + uint8_t * dst_row_start = dst_ptr + (size_t)lp0 * type_size; + if (nb00 == type_size) { + hvx_copy_f32_uu(dst_row_start, src_ptr, ne00); + } else { + for (uint32_t i = 0; i < ne00; i++) { + memcpy(dst_row_start + i * type_size, + src_ptr + (size_t)i * nb00, + type_size); + } + } + + if (rp0 > 0) { + hvx_splat_f32_u(dst_ptr + ((size_t)lp0 + ne00) * type_size, 0.0f, (uint32_t)rp0); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA PAD kernel โ€” aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // ----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline before the main loop begins. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + const uint8_t * src_ptr = interior + ? pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3) : NULL; + + // Interior row: real DMA (1 row) from DDR to VTCM. + // Border row: null DMA (nrows=0) + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + src_ptr ? src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, src_ptr ? 1 : 0); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, compute in VTCM with aligned HVX ops, + // push dst DMA and prefetch src for the next+1 row. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + } else { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + + uint8_t * dst_interior = dst_spad_cur + (size_t)lp0 * type_size; + + if ((uintptr_t)dst_interior % VLEN == 0) { + hvx_copy_f32_aa(dst_interior, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_interior, src_spad_cur, ne00); + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t ni1, ni2, ni3; + pad_decompose_row(next_row, ne1, ne2, &ni1, &ni2, &ni3); + const int next_interior = pad_is_interior(ni1, ni2, ni3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + const uint8_t * next_src_ptr = next_interior + ? pad_src_row_ptr(src, ni1, ni2, ni3, lp1, lp2, lp3) : NULL; + + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + next_src_ptr ? next_src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, next_src_ptr ? 1 : 0); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX circular PAD kernel +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + const uint8_t * src_row = pad_circ_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (nb00 == type_size) { + + if (lp0 > 0) { + if ((uint32_t)lp0 < 32) { + memcpy(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (uint32_t)lp0); + } + } + hvx_copy_f32_uu(dst_ptr + (size_t)lp0 * type_size, src_row, ne00); + if (rp0 > 0) { + if ((uint32_t)rp0 < 32) { + memcpy(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (size_t)rp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (uint32_t)rp0); + } + } + } else { + for (uint32_t i = 0; i < (uint32_t)lp0; i++) { + *(float *)(dst_ptr + i * type_size) = + *(const float *)(src_row + (size_t)(ne00 - (uint32_t)lp0 + i) * nb00); + } + for (uint32_t i = 0; i < ne00; i++) { + *(float *)(dst_ptr + ((size_t)lp0 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + for (uint32_t i = 0; i < (uint32_t)rp0; i++) { + *(float *)(dst_ptr + ((size_t)lp0 + ne00 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA circular PAD kernel โ€” aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // ----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline. Every row is a real src DMA (no null DMAs). + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t pi1, pi2, pi3; + pad_decompose_row(ir, ne1, ne2, &pi1, &pi2, &pi3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, pad_circ_src_row_ptr(src, pi1, pi2, pi3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, assemble circular row in VTCM with + // aligned HVX ops, push dst DMA and prefetch src for the next+1 row. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + + if (lp0 > 0) { + uint8_t * dst_left = dst_spad_cur; + const uint8_t * src_left = src_spad_cur + (size_t)(ne00 - (uint32_t)lp0) * type_size; + if ((uint32_t)lp0 < 32) { + memcpy(dst_left, src_left, (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_left, src_left, (uint32_t)lp0); + } + } + + { + uint8_t * dst_mid = dst_spad_cur + (size_t)lp0 * type_size; + if ((uintptr_t)dst_mid % VLEN == 0) { + hvx_copy_f32_aa(dst_mid, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_mid, src_spad_cur, ne00); + } + } + + if (rp0 > 0) { + uint8_t * dst_right = dst_spad_cur + ((size_t)lp0 + ne00) * type_size; + if ((uint32_t)rp0 < 32) { + memcpy(dst_right, src_spad_cur, (size_t)rp0 * type_size); + } else { + if ((uintptr_t)dst_right % VLEN == 0) { + hvx_copy_f32_aa(dst_right, src_spad_cur, (uint32_t)rp0); + } else { + hvx_copy_f32_ua(dst_right, src_spad_cur, (uint32_t)rp0); + } + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t nri1, nri2, nri3; + pad_decompose_row(next_row, ne1, ne2, &nri1, &nri2, &nri3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + pad_circ_src_row_ptr(src, nri1, nri2, nri3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +int op_pad(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; + + // Only F32 supported + size_t type_size; + switch (src0->type) { + case HTP_TYPE_F32: type_size = 4; break; + default: + FARF(ERROR, "pad-hvx: unsupported type %u\n", src0->type); + return HTP_STATUS_NO_SUPPORT; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + const int32_t lp0 = octx->op_params[0]; + const int32_t rp0 = octx->op_params[1]; + const int32_t lp1 = octx->op_params[2]; + const int32_t rp1 = octx->op_params[3]; + const int32_t lp2 = octx->op_params[4]; + const int32_t rp2 = octx->op_params[5]; + const int32_t lp3 = octx->op_params[6]; + const int32_t rp3 = octx->op_params[7]; + const int32_t circular = octx->op_params[8]; + + const uint32_t ne0 = dst->ne[0]; + const uint32_t ne00 = src0->ne[0]; + + const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * dst->ne[3]; + const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows > 0 ? total_dst_rows : 1); + + const size_t src_row_size = (size_t)ne00 * type_size; + const size_t dst_row_size = (size_t)ne0 * type_size; + const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + + // Total VTCM needed: 2 buffers (ping+pong) for src and dst, per thread + const size_t vtcm_needed = (size_t)n_threads * 2 * (src_row_size_aligned + dst_row_size_aligned); + + const int use_dma = (src0->nb[0] == (uint32_t)type_size) && + (ne00 >= 512) && + (octx->ctx->vtcm_base != NULL) && + (octx->ctx->vtcm_size >= vtcm_needed); + + if (use_dma) { + octx->src0_spad.size_per_thread = 2 * src_row_size_aligned; + octx->dst_spad.size_per_thread = 2 * dst_row_size_aligned; + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + } + + struct htp_pad_context pctx = { + .octx = octx, + .lp0 = lp0, .rp0 = rp0, + .lp1 = lp1, .rp1 = rp1, + .lp2 = lp2, .rp2 = rp2, + .lp3 = lp3, .rp3 = rp3, + .nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads, + .total_dst_rows = total_dst_rows, + .type_size = type_size, + .src_row_size = src_row_size, + .src_row_size_aligned = src_row_size_aligned, + .dst_row_size = dst_row_size, + .dst_row_size_aligned = dst_row_size_aligned, + }; + + FARF(HIGH, "pad-hvx%s%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) pads=(%d,%d,%d,%d,%d,%d,%d,%d)\n", + circular ? "-circ" : "", + use_dma ? "-dma" : "", + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + + if (circular && use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular_dma, &pctx, n_threads); } + else if (circular) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular, &pctx, n_threads); } + else if (use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_dma, &pctx, n_threads); } + else { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx, &pctx, n_threads); } + + return HTP_STATUS_OK; +} diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 1d8b0796bc9..c839044b84f 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -7,6 +7,7 @@ #include #include +#include #include "hex-dma.h" #include "hvx-utils.h" @@ -18,9 +19,11 @@ #include "htp-ops.h" #include "htp-ops.h" -// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h +// Redefined the rope type constants as we can't include ggml.h #define HTP_ROPE_TYPE_NORMAL 0 #define HTP_ROPE_TYPE_NEOX 2 +#define HTP_ROPE_TYPE_MROPE 8 +#define HTP_ROPE_TYPE_IMROPE 40 #define HTP_ROPE_SPAD_NROWS 16 #define HTP_ROPE_SPAD_BLOCK (HTP_ROPE_SPAD_NROWS/2) @@ -73,6 +76,9 @@ struct htp_rope_context { size_t theta_cache_offset; uint32_t src0_nrows; + struct fastdiv_values div_ne2_ne1; + struct fastdiv_values div_ne1; + uint64_t t_start; }; @@ -82,7 +88,30 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { return (1 - MIN(1, MAX(0, y))); } -static void rope_cache_init(const float theta_base, +// Compute one (cos, sin) pair into cache[i0], cache[i0+1] applying YaRN scaling. +static inline void rope_yarn_one(float theta, float freq_scale, float * corr_dims, + uint32_t i0, float ext_factor, float mscale, + float * cache) { + float theta_extrap = theta; + + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta_final = theta_interp; + float mscale_final = mscale; + + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + + cache[i0 + 0] = cosf(theta_final) * mscale_final; + cache[i0 + 1] = sinf(theta_final) * mscale_final; +} + +static __attribute__((noinline)) void rope_cache_init(const float theta_base, const float freq_scale, const float * freq_factors, float * corr_dims, @@ -92,30 +121,137 @@ static void rope_cache_init(const float theta_base, float * cache, const float theta_scale) { // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py - float theta = theta_base; +#if __HVX_ARCH__ >= 79 + const bool is_v79_or_newer = true; +#else + const bool is_v79_or_newer = false; +#endif + + if (is_v79_or_newer && ext_factor == 0.0f) { + // Fast path: fully vectorized + // We process 32 pairs (64 elements) per iteration. + const uint32_t n_blocks = ne0 / 64; + + // Initialize theta scale powers: [1.0f, theta_scale, theta_scale^2, ..., theta_scale^31] + float __attribute__((aligned(128))) theta_powers[32]; + theta_powers[0] = 1.0f; + for (int j = 1; j < 32; j++) { + theta_powers[j] = theta_powers[j - 1] * theta_scale; + } + HVX_Vector v_theta_powers = hvx_vmem(theta_powers); - for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { - const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + HVX_Vector v_freq_scale = hvx_vec_splat_f32(freq_scale); + HVX_Vector v_mscale = hvx_vec_splat_f32(mscale); + + // Base theta starts at theta_base + float theta_block = theta_base; + // The scale factor for the next block is theta_scale^32 + float theta_scale_32 = 1.0f; + for (int j = 0; j < 32; j++) { + theta_scale_32 *= theta_scale; + } + + for (uint32_t b = 0; b < n_blocks; b++) { + uint32_t i0 = b * 64; + HVX_Vector v_theta_base = hvx_vec_splat_f32(theta_block); + HVX_Vector v_theta = hvx_vec_mul_f32_f32(v_theta_base, v_theta_powers); + + if (freq_factors) { + // Load 32 elements of freq_factors + HVX_Vector v_ff = hvx_vmemu(freq_factors + i0 / 2); + HVX_Vector v_inv_ff = hvx_vec_inverse_f32(v_ff); + v_theta = hvx_vec_mul_f32_f32(v_theta, v_inv_ff); + } - float theta_extrap = theta / ff; + HVX_Vector v_theta_final = hvx_vec_mul_f32_f32(v_theta, v_freq_scale); - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta_final = theta_interp; - float mscale_final = mscale; + HVX_Vector vcos = hvx_vec_cos_f32(v_theta_final); + HVX_Vector vsin = hvx_vec_sin_f32(v_theta_final); - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + vcos = hvx_vec_mul_f32_f32(vcos, v_mscale); + vsin = hvx_vec_mul_f32_f32(vsin, v_mscale); - // Get n-d magnitude scaling corrected for interpolation - mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); + HVX_VectorPair vstore = Q6_W_vshuff_VVR(vsin, vcos, -4); + + if (((uintptr_t)cache) % 128 == 0) { + hvx_vmem(cache + i0 + 0) = Q6_V_lo_W(vstore); + hvx_vmem(cache + i0 + 32) = Q6_V_hi_W(vstore); + } else { + hvx_vec_store_u(cache + i0 + 0, 32 * sizeof(float), Q6_V_lo_W(vstore)); + hvx_vec_store_u(cache + i0 + 32, 32 * sizeof(float), Q6_V_hi_W(vstore)); + } + + theta_block *= theta_scale_32; + } + + // Leftovers + float theta = theta_block; + for (uint32_t i0 = n_blocks * 64; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache); + theta *= theta_scale; + } + } else { + // Fallback to original scalar loop + float theta = theta_base; + for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache); + theta *= theta_scale; } + } +} - cache[i0 + 0] = cosf(theta_final) * mscale_final; - cache[i0 + 1] = sinf(theta_final) * mscale_final; +// pos_t/h/w/e: the four position ids for this sequence step (t=time, h=height, w=width, e=extra). +// sections[4]: number of head dims assigned to each position component. +static __attribute__((noinline)) void mrope_cache_init(const float pos_t, + const float pos_h, + const float pos_w, + const float pos_e, + const int32_t sections[4], + const bool is_imrope, + const float freq_scale, + const float * freq_factors, + float * corr_dims, + const uint32_t ne0, + const float ext_factor, + const float mscale, + float * cache, + const float theta_scale) { + const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3]; + const int sec_w = sections[0] + sections[1]; + const int sec_e = sec_w + sections[2]; + + float theta_t = pos_t; + float theta_h = pos_h; + float theta_w = pos_w; + float theta_e = pos_e; - theta *= theta_scale; + for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + const int sector = (i0 / 2) % sect_dims; + + float theta; + if (is_imrope) { + // Interleaved: sector mod 3 selects component + if (sector % 3 == 0 && sector < 3 * sections[0]) { theta = theta_t; } + else if (sector % 3 == 1 && sector < 3 * sections[1]) { theta = theta_h; } + else if (sector % 3 == 2 && sector < 3 * sections[2]) { theta = theta_w; } + else { theta = theta_e; } + } else { + // Contiguous sections + if (sector < sections[0]) { theta = theta_t; } + else if (sector < sec_w) { theta = theta_h; } + else if (sector < sec_e) { theta = theta_w; } + else { theta = theta_e; } + } + + rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache); + + theta_t *= theta_scale; + theta_h *= theta_scale; + theta_w *= theta_scale; + theta_e *= theta_scale; } } @@ -134,24 +270,18 @@ static void rope_corr_dims(int n_dims, } static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { - const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0; - const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache; - HVX_Vector * restrict vdst = (HVX_Vector *) dst; + const uint32_t he = ne / 2; + const uint32_t nvec = he / 32; + const uint32_t nloe = he % 32; - uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2 + for (uint32_t i = 0; i < nvec; i++) { + HVX_Vector v0 = ((const HVX_Vector *) src0)[i]; + HVX_Vector v1 = hvx_vmemu(src0 + he + i * 32); - uint32_t he = ne / 2; // half_dims offset in elements - uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors + HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0]; + HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1]; - #pragma unroll(2) - for (uint32_t i = 0; i < nvec; i += 2) { - HVX_Vector v0 = vsrc[i/2+0]; - HVX_Vector v1 = vsrc[i/2+hv]; - - HVX_Vector v2 = vtheta[i+0]; - HVX_Vector v3 = vtheta[i+1]; - - HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta + HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin)); HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin)); @@ -161,37 +291,45 @@ static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * rest HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); - vdst[i/2+0] = Q6_Vsf_equals_Vqf32(v4); - vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5); + ((HVX_Vector *) dst)[i] = Q6_Vsf_equals_Vqf32(v4); + hvx_vmemu(dst + he + i * 32) = Q6_Vsf_equals_Vqf32(v5); } - for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) { - const float cos_theta = theta_cache[i+0]; - const float sin_theta = theta_cache[i+1]; - float x0 = src0[i/2]; - float x1 = src0[i/2 + he]; - dst[i/2] = x0 * cos_theta - x1 * sin_theta; - dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta; + if (nloe > 0) { + HVX_Vector v0 = hvx_vmemu(src0 + nvec * 32); + HVX_Vector v1 = hvx_vmemu(src0 + he + nvec * 32); + + HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 0]; + HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 1]; + + HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); + + HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin)); + HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin)); + HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin)); + HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin)); + + HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); + HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); + + hvx_vec_store_u(dst + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v4)); + hvx_vec_store_u(dst + he + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v5)); } } static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { - const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0; - const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache; - HVX_Vector * restrict vdst = (HVX_Vector *) dst; + const uint32_t nvec = ne / 64; + const uint32_t nloe = ne % 64; - uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two + for (uint32_t i = 0; i < nvec; i++) { + HVX_Vector v0 = ((const HVX_Vector *) src0)[i * 2 + 0]; + HVX_Vector v1 = ((const HVX_Vector *) src0)[i * 2 + 1]; - #pragma unroll(2) - for (uint32_t i = 0; i < nvec; i+=2) { - HVX_Vector v0 = vsrc[i+0]; - HVX_Vector v1 = vsrc[i+1]; + HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0]; + HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1]; - HVX_Vector v2 = vtheta[i+0]; - HVX_Vector v3 = vtheta[i+1]; - - HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1 - HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta + HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); + HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin)); HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin)); @@ -203,17 +341,52 @@ static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4); - vdst[i+0] = Q6_V_lo_W(vstore); - vdst[i+1] = Q6_V_hi_W(vstore); + ((HVX_Vector *) dst)[i * 2 + 0] = Q6_V_lo_W(vstore); + ((HVX_Vector *) dst)[i * 2 + 1] = Q6_V_hi_W(vstore); } - for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) { - const float cos_theta = theta_cache[i+0]; - const float sin_theta = theta_cache[i+1]; - float x0 = src0[i+0]; - float x1 = src0[i+1]; - dst[i+0] = x0 * cos_theta - x1 * sin_theta; - dst[i+1] = x0 * sin_theta + x1 * cos_theta; + if (nloe > 0) { + if (nloe <= 32) { + HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64); + HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64); + + HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(Q6_V_vzero(), v0, -4); + HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(Q6_V_vzero(), v2, -4); + + HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin)); + HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin)); + HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin)); + HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin)); + + HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); + HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); + + HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4); + + hvx_vec_store_u(dst + nvec * 64, nloe * sizeof(float), Q6_V_lo_W(vstore)); + } else { + HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64); + HVX_Vector v1 = hvx_vmemu(src0 + nvec * 64 + 32); + + HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64); + HVX_Vector v3 = hvx_vmemu(theta_cache + nvec * 64 + 32); + + HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); + HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); + + HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin)); + HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin)); + HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin)); + HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin)); + + HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); + HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); + + HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4); + + ((HVX_Vector *) dst)[nvec * 2 + 0] = Q6_V_lo_W(vstore); + hvx_vec_store_u(dst + nvec * 64 + 32, (nloe - 32) * sizeof(float), Q6_V_hi_W(vstore)); + } } } @@ -274,7 +447,8 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { uint64_t tt = HAP_perf_get_qtimer_count(); const int32_t mode = rctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + // MROPE and IMROPE use NEOX-style pairing for the rotation + const bool is_neox = (mode & HTP_ROPE_TYPE_NEOX) || (mode & HTP_ROPE_TYPE_MROPE); // VTCM setup uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); @@ -286,13 +460,19 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { const int32_t * pos = (const int32_t *) src1->data; const float * freq_factors = src2 ? (const float *) src2->data : NULL; - uint32_t ir = 0; + const uint32_t i3_start = fastdiv(src0_start_row, &rctx->div_ne2_ne1); + const uint32_t rem = fastmodulo(src0_start_row, ne2 * ne1, &rctx->div_ne2_ne1); + const uint32_t i2_start = fastdiv(rem, &rctx->div_ne1); + const uint32_t i1_start = fastmodulo(rem, ne1, &rctx->div_ne1); + + uint32_t ir = src0_start_row; uint32_t prev_i2 = (uint32_t) -1; - for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch - for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len - for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads - if (ir < src0_start_row) { ir++; i1++; continue; } + for (uint32_t i3 = i3_start; i3 < ne3; i3++) { // batch + const uint32_t i2_init = (i3 == i3_start) ? i2_start : 0; + for (uint32_t i2 = i2_init; i2 < ne2; i2++) { // seq-len + const uint32_t i1_init = (i3 == i3_start && i2 == i2_start) ? i1_start : 0; + for (uint32_t i1 = i1_init; i1 < ne1; ) { // attn-heads if (ir >= src0_end_row) goto done; // Rows in this block @@ -326,11 +506,25 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { if (i2 != prev_i2) { prev_i2 = i2; - const int32_t p = pos[i2]; - rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale); - - // FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache, - // (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start)); + const bool is_mrope = (rctx->mode & HTP_ROPE_TYPE_MROPE) != 0; + if (is_mrope) { + // src1 holds four position arrays stacked along ne0: + // pos[i2], pos[i2+ne2], pos[i2+ne2*2], pos[i2+ne2*3] + const bool is_imrope = (rctx->mode == HTP_ROPE_TYPE_IMROPE); + mrope_cache_init( + (float) pos[i2], + (float) pos[i2 + ne2], + (float) pos[i2 + ne2 * 2], + (float) pos[i2 + ne2 * 3], + rctx->sections, is_imrope, + rctx->freq_scale, freq_factors, rctx->corr_dims, + ne0, rctx->ext_factor, rctx->attn_factor, + theta_cache, rctx->theta_scale); + } else { + rope_cache_init(pos[i2], rctx->freq_scale, freq_factors, rctx->corr_dims, + ne0, rctx->ext_factor, rctx->attn_factor, + theta_cache, rctx->theta_scale); + } } // Skip output DMA transactions from prev block (if any) @@ -410,7 +604,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { // Aligned row sizes for VTCM const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); - const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128); + const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 256); // Calculate spad sizes per thread size_t src0_spad_per_thread = theta_cache_size_aligned + HTP_ROPE_SPAD_NROWS * src0_row_size_aligned; @@ -467,6 +661,11 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { rctx.src0_nrows = src0_nrows; rctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads; + if (src0_nrows > 0) { + rctx.div_ne2_ne1 = init_fastdiv_values(dst->ne[2] * dst->ne[1]); + rctx.div_ne1 = init_fastdiv_values(dst->ne[1]); + } + FARF(HIGH, "rope-f32 n-rows %u n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx.src0_nrows, rctx.n_dims, ne0, rctx.ext_factor, rctx.theta_scale, rctx.attn_factor); diff --git a/ggml/src/ggml-hexagon/htp/set-rows-ops.c b/ggml/src/ggml-hexagon/htp/set-rows-ops.c index 0def7b408bf..58c54967db0 100644 --- a/ggml/src/ggml-hexagon/htp/set-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/set-rows-ops.c @@ -65,6 +65,9 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da // parallelize by rows of src0 const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; + if (ir0 >= nr) { + return; + } const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr; const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32); @@ -109,6 +112,9 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da // parallelize by rows of src0 const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; + if (ir0 >= nr) { + return; + } const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr; const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32); diff --git a/ggml/src/ggml-hexagon/htp/ssm-conv.c b/ggml/src/ggml-hexagon/htp/ssm-conv.c index a28fd03e978..d574da2e2bc 100644 --- a/ggml/src/ggml-hexagon/htp/ssm-conv.c +++ b/ggml/src/ggml-hexagon/htp/ssm-conv.c @@ -20,55 +20,56 @@ #include "htp-ops.h" #include "hvx-utils.h" -#define htp_ssm_conv_tensors_preamble \ - const struct htp_tensor * restrict src0 = octx->src[0]; \ - const struct htp_tensor * restrict src1 = octx->src[1]; \ - const struct htp_tensor * restrict dst = octx->dst; \ - struct htp_spad * restrict src0_spad = &octx->src0_spad; \ - struct htp_spad * restrict src1_spad = &octx->src1_spad; \ - struct htp_spad * restrict dst_spad = &octx->dst_spad; \ - \ - const uint32_t ne00 = src0->ne[0]; \ - const uint32_t ne01 = src0->ne[1]; \ - const uint32_t ne02 = src0->ne[2]; \ - const uint32_t ne03 = src0->ne[3]; \ - \ - const uint32_t ne10 = src1->ne[0]; \ - const uint32_t ne11 = src1->ne[1]; \ - const uint32_t ne12 = src1->ne[2]; \ - const uint32_t ne13 = src1->ne[3]; \ - \ - const uint32_t ne0 = dst->ne[0]; \ - const uint32_t ne1 = dst->ne[1]; \ - const uint32_t ne2 = dst->ne[2]; \ - const uint32_t ne3 = dst->ne[3]; \ - \ - const uint32_t nb00 = src0->nb[0]; \ - const uint32_t nb01 = src0->nb[1]; \ - const uint32_t nb02 = src0->nb[2]; \ - const uint32_t nb03 = src0->nb[3]; \ - \ - const uint32_t nb10 = src1->nb[0]; \ - const uint32_t nb11 = src1->nb[1]; \ - const uint32_t nb12 = src1->nb[2]; \ - const uint32_t nb13 = src1->nb[3]; \ - \ - const uint32_t nb0 = dst->nb[0]; \ - const uint32_t nb1 = dst->nb[1]; \ - const uint32_t nb2 = dst->nb[2]; \ +#define htp_ssm_conv_tensors_preamble \ + const struct htp_tensor * restrict src0 = octx->src[0]; \ + const struct htp_tensor * restrict src1 = octx->src[1]; \ + const struct htp_tensor * restrict dst = octx->dst; \ + struct htp_spad * restrict src0_spad = &octx->src0_spad; \ + struct htp_spad * restrict src1_spad = &octx->src1_spad; \ + struct htp_spad * restrict dst_spad = &octx->dst_spad; \ + \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t ne10 = src1->ne[0]; \ + const uint32_t ne11 = src1->ne[1]; \ + const uint32_t ne12 = src1->ne[2]; \ + const uint32_t ne13 = src1->ne[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t nb10 = src1->nb[0]; \ + const uint32_t nb11 = src1->nb[1]; \ + const uint32_t nb12 = src1->nb[2]; \ + const uint32_t nb13 = src1->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; struct htp_ssm_conv_context { struct htp_ops_context * octx; uint32_t nrows_per_thread; + uint32_t d_inner_tile; uint64_t t_start; }; -#define htp_ssm_conv_preamble \ +#define htp_ssm_conv_preamble \ struct htp_ssm_conv_context * scctx = (struct htp_ssm_conv_context *) data; \ - struct htp_ops_context * octx = scctx->octx; \ - htp_ssm_conv_tensors_preamble; \ - dma_queue * dma_queue = octx->ctx->dma[ith]; + struct htp_ops_context * octx = scctx->octx; \ + htp_ssm_conv_tensors_preamble; \ + dma_queue * dma_queue = octx->ctx->dma[ith]; // Scalar FP32 SSM_CONV implementation static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) { @@ -128,118 +129,211 @@ static void ssm_conv_thread_f32_f32(unsigned int nth, unsigned int ith, void *da dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -// HVX FP32 SSM_CONV implementation - vectorizes across d_inner dimension -static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) { - htp_ssm_conv_preamble; - - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); - const int nc = src1->ne[0]; // d_conv - const int ncs = src0->ne[0]; // d_conv - 1 + n_t +// In-register 32x32 fp32 transpose using std 5-stage HVX vshuff butterfly. +static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) { + HVX_Vector tmp[32]; - const uint32_t d_conv = src1->ne[0]; - const uint32_t d_inner = src0->ne[1]; - const uint32_t n_t = dst->ne[1]; - const uint32_t n_s = dst->ne[2]; + // Stage 0 (R = -4): pair (2i, 2i+1) for i = 0..15. m -> tmp. + for (int i = 0; i < 16; ++i) { + HVX_VectorPair p = Q6_W_vshuff_VVR(m[2*i + 1], m[2*i], -4); + tmp[2*i + 0] = Q6_V_lo_W(p); + tmp[2*i + 1] = Q6_V_hi_W(p); + } - const float * src0_data = (const float *) src0->data; - const float * src1_data = (const float *) src1->data; - float * dst_data = (float *) dst->data; + // Stage 1 (R = -8): per block of 4, pair (b+0, b+2) and (b+1, b+3). tmp -> m. + for (int b = 0; b < 32; b += 4) { + HVX_VectorPair p0 = Q6_W_vshuff_VVR(tmp[b + 2], tmp[b + 0], -8); + HVX_VectorPair p1 = Q6_W_vshuff_VVR(tmp[b + 3], tmp[b + 1], -8); + m[b + 0] = Q6_V_lo_W(p0); m[b + 1] = Q6_V_hi_W(p0); + m[b + 2] = Q6_V_lo_W(p1); m[b + 3] = Q6_V_hi_W(p1); + } - // Calculate row range for this thread - const int dr = scctx->nrows_per_thread; - const uint32_t ir0 = dr * ith; - const uint32_t ir1 = MIN(ir0 + dr, d_inner); - const uint32_t ir = ir1 - ir0; + // Stage 2 (R = -16): per block of 8, pair (b+i, b+i+4) for i = 0..3. m -> tmp. + for (int b = 0; b < 32; b += 8) { + for (int i = 0; i < 4; ++i) { + HVX_VectorPair p = Q6_W_vshuff_VVR(m[b + i + 4], m[b + i], -16); + tmp[b + 2*i + 0] = Q6_V_lo_W(p); + tmp[b + 2*i + 1] = Q6_V_hi_W(p); + } + } - if (ir0 >= ir1) { - return; // No work for this thread + // Stage 3 (R = -32): per block of 16, pair (b+i, b+i+8) for i = 0..7. tmp -> m. + for (int b = 0; b < 32; b += 16) { + for (int i = 0; i < 8; ++i) { + HVX_VectorPair p = Q6_W_vshuff_VVR(tmp[b + i + 8], tmp[b + i], -32); + m[b + 2*i + 0] = Q6_V_lo_W(p); + m[b + 2*i + 1] = Q6_V_hi_W(p); + } } - // src0 and src1 gather offsets - uint32_t __attribute__((aligned(VLEN))) src0_offsets[VLEN_FP32] = { 0 }; - uint32_t __attribute__((aligned(VLEN))) src1_offsets[VLEN_FP32] = { 0 }; + // Stage 4 (R = -64): pair (i, i+16) for i = 0..15. m -> tmp -> m. + for (int i = 0; i < 16; ++i) { + HVX_VectorPair p = Q6_W_vshuff_VVR(m[i + 16], m[i], -64); + tmp[2 * i + 0] = Q6_V_lo_W(p); + tmp[2 * i + 1] = Q6_V_hi_W(p); + } - for (uint32_t i = 0; i < VLEN_FP32; ++i) { - src0_offsets[i] = i * (ncs) * sizeof(float); - src1_offsets[i] = i * (d_conv) * sizeof(float); + for (int i = 0; i < 32; ++i) { + m[i] = tmp[i]; } +} - const uint32_t src0_gather_len = VLEN * ncs; - const uint32_t src1_gather_len = VLEN * d_conv; +// HVX FP32 SSM_CONV implementation - channel-vectorized HVX kernel with src0/src1 +// transposed into VTCM. +// +// VTCM layouts (per thread): +// src1_T : {d_inner_per_thread, d_conv} โ€” staged once per launch (small). +// src0_T : {d_inner_tile, ncs} โ€” staged per d_inner-tile. +// +// d_inner_tile is chosen so that per-thread VTCM stays under the budget. +// Each thread iterates ceil(d_inner_per_thread d_inner_tile) tiles serially. +#define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread + +// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM) +static inline void transpose_src1(const float * src1_data, + uint32_t src1_stride_inner, + uint32_t i1_off, + uint32_t d_inner_per_thread, + uint32_t d_conv, + float * src1_T) { + for (uint32_t i = 0; i < d_inner_per_thread; ++i) { + const float * src_row = src1_data + (i1_off + i) * src1_stride_inner; + for (uint32_t j = 0; j < d_conv; ++j) { + src1_T[j * d_inner_per_thread + i] = src_row[j]; + } + } +} - // gather scratchpads - HVX_Vector * src0_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + 0); - HVX_Vector * src1_vec = (HVX_Vector *) (octx->ctx->vtcm_base + ith * VLEN*2 + VLEN); +// HVX 32x32 src0 transpose: src0 {ncs, d_inner} (DDR) -> src0_T {d_inner_tile, ncs} (VTCM) +static inline void transpose_src0_block(const float * src0_block, + uint32_t ncs, + uint32_t cb_n, + uint32_t d_inner_tile, + float * src0_T_block_dst, + uint32_t cb /* dst column offset */) { + const uint32_t T_TILE = VLEN_FP32; + + HVX_Vector __attribute__((aligned(VLEN))) sub[32]; + + for (uint32_t t0 = 0; t0 < ncs; t0 += T_TILE) { + const uint32_t t_n = MIN(T_TILE, ncs - t0); + + // Load 32 rows (channels) of T_TILE samples; pad missing channels with zeros. + for (uint32_t r = 0; r < cb_n; ++r) { + const float * src_row = src0_block + r * ncs + t0; + if (t_n == T_TILE) { + sub[r] = *(const HVX_UVector *) src_row; + } else { + HVX_Vector v = hvx_vec_splat_f32(0.0f); + hvx_vec_store_u(&v, t_n * sizeof(float), hvx_vec_splat_f32(0.0f)); + + float __attribute__((aligned(VLEN))) tmp[VLEN_FP32] = { 0 }; + for (uint32_t k = 0; k < t_n; ++k) tmp[k] = src_row[k]; + v = *(const HVX_Vector *) tmp; + sub[r] = v; + } + } + for (uint32_t r = cb_n; r < T_TILE; ++r) { + sub[r] = hvx_vec_splat_f32(0.0f); + } - float * data_src0 = (float *) ((char *) src0->data + ir0 * src0->nb[1]); - float * data_src1 = (float *) ((char *) src1->data + ir0 * src1->nb[1]); + hvx_transpose_32x32_f32(sub); - uint8_t * spad_src0 = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; - uint8_t * spad_src1 = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread; + // Store transposed sub-tile to src0_T at offsets (t0 + j) * d_inner_tile + cb. + // Only write the valid t_n rows of the transposed result. + for (uint32_t r = 0; r < t_n; ++r) { + float * dst = src0_T_block_dst + (t0 + r) * d_inner_tile + cb; + if (cb_n == T_TILE) { + *(HVX_UVector *) dst = sub[r]; + } else { + hvx_vec_store_u(dst, cb_n * sizeof(float), sub[r]); + } + } + } +} - // copy src1 workload to VTCM - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src1, data_src1), nb11, nb11, ir); +static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) { + htp_ssm_conv_preamble; - // FARF(HIGH, "ssm-conv-src1-fetch %d: ir0 %u size %u\n", ith, ir0, nb11 * ir); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); - for (uint32_t i3 = 0; i3 < n_s; ++i3) { - float * src0_data_ptr = (float *) ((char *) data_src0 + i3 * (src0->nb[2])); + const uint32_t d_conv = src1->ne[0]; + const uint32_t d_inner = src0->ne[1]; + const uint32_t n_t = dst->ne[1]; + const uint32_t n_s = dst->ne[2]; + const uint32_t ncs = src0->ne[0]; - // copy src0 workload to VTCM - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0, src0_data_ptr), nb01, nb01, ir); + const uint32_t src0_stride_inner = src0->nb[1] / sizeof(float); + const uint32_t src0_stride_seq = src0->nb[2] / sizeof(float); + const uint32_t src1_stride_inner = src1->nb[1] / sizeof(float); + const uint32_t dst_stride_token = dst->nb[1] / sizeof(float); + const uint32_t dst_stride_seq = dst->nb[2] / sizeof(float); - // FARF(HIGH, "ssm-conv-src0-fetch %d: ir0 %u i3 %u size %u\n", ith, ir0, i3, nb01 * ir); + const uint32_t dr = scctx->nrows_per_thread; + const uint32_t ir0 = dr * ith; + const uint32_t ir1 = MIN(ir0 + dr, d_inner); - dma_queue_flush(dma_queue); + if (ir0 >= ir1) { + return; + } - for (uint32_t i2 = 0; i2 < n_t; ++i2) { - float * dst_ptr = (float *) ((char *) dst->data + ir0 * (dst->nb[0]) + i2 * (dst->nb[1]) + i3 * (dst->nb[2])); + const uint32_t d_inner_per_thread = ir1 - ir0; + const uint32_t d_inner_tile = scctx->d_inner_tile; - const uint32_t nvec = ir / VLEN_FP32; - const uint32_t nloe = ir % VLEN_FP32; - uint32_t i1 = 0; + const float * src0_data = (const float *) src0->data; + const float * src1_data = (const float *) src1->data; + float * dst_data = (float *) dst->data; - for (uint32_t vi1 = 0; vi1 < nvec; vi1++) { - HVX_Vector acc_vec = Q6_V_vsplat_R(0); + // Per-thread VTCM regions. + float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread); + float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread); - for (uint32_t i0 = 0; i0 < d_conv; ++i0) { - uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]); - uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float); - Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets)); - Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets)); + // Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout. + transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T); - HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec); - acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod); - } + const uint32_t C_TILE = VLEN_FP32; - *(HVX_UVector *) (dst_ptr + i1) = Q6_Vsf_equals_Vqf32(acc_vec); - i1 += VLEN_FP32; - } + for (uint32_t i3 = 0; i3 < n_s; ++i3) { + for (uint32_t tile_off = 0; tile_off < d_inner_per_thread; tile_off += d_inner_tile) { + const uint32_t tile_n = MIN(d_inner_tile, d_inner_per_thread - tile_off); - if (nloe) { - HVX_Vector acc_vec = Q6_V_vsplat_R(0); + // Place src0 chunk into VTCM in {d_inner_tile, ncs} layout. + const float * src0_block = src0_data + i3 * src0_stride_seq + (ir0 + tile_off) * src0_stride_inner; - for (uint32_t i0 = 0; i0 < d_conv; ++i0) { - uint32_t src0_base = (uint32_t) spad_src0 + (i0 + i1 * ncs) * sizeof(float) + i2 * (src0->nb[0]); - uint32_t src1_base = (uint32_t) spad_src1 + (i0 + i1 * nc) * sizeof(float); - Q6_vgather_ARMVw(src0_vec, src0_base, src0_gather_len, (*(const HVX_Vector *) src0_offsets)); - Q6_vgather_ARMVw(src1_vec, src1_base, src1_gather_len, (*(const HVX_Vector *) src1_offsets)); + for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) { + const uint32_t cb_n = MIN(C_TILE, tile_n - cb); + transpose_src0_block(src0_block + cb * src0_stride_inner, ncs, cb_n, d_inner_tile, src0_T, cb); + } - HVX_Vector prod = Q6_Vqf32_vmpy_VsfVsf(*(const HVX_Vector *) src0_vec, *(const HVX_Vector *) src1_vec); - acc_vec = Q6_Vqf32_vadd_Vqf32Vqf32(acc_vec, prod); + for (uint32_t t = 0; t < n_t; ++t) { + for (uint32_t cb = 0; cb < tile_n; cb += C_TILE) { + const uint32_t cb_n = MIN(C_TILE, tile_n - cb); + + HVX_Vector acc = hvx_vec_splat_f32(0.0f); + for (uint32_t j = 0; j < d_conv; ++j) { + HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb); + HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb); + acc = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w)); + } + HVX_Vector res = Q6_Vsf_equals_Vqf32(acc); + + float * dst_ptr = dst_data + i3 * dst_stride_seq + t * dst_stride_token + (ir0 + tile_off + cb); + if (cb_n == C_TILE) { + *(HVX_UVector *) dst_ptr = res; + } else { + hvx_vec_store_u(dst_ptr, cb_n * sizeof(float), res); + } } - - hvx_vec_store_u(dst_ptr + i1, (ir - i1) * 4, Q6_Vsf_equals_Vqf32(acc_vec)); } } } t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", - ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1, + FARF(HIGH, "ssm-conv-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) tile=%u * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", + ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1, d_inner_tile, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } @@ -264,46 +358,44 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t use_hvx = 0; - if (d_inner >= VLEN_FP32 && d_inner % VLEN_FP32 == 0) { - int is_aligned = hex_is_aligned((void *) src0->data, VLEN) && - hex_is_aligned((void *) src1->data, VLEN) && - hex_is_aligned((void *) dst->data, VLEN); - - if (is_aligned) { - use_hvx = 1; - } + if (d_inner >= VLEN_FP32 && n_t >= VLEN_FP32) { + use_hvx = 1; } - if (use_hvx) { - scctx.nrows_per_thread = (d_inner + n_threads - 1) / n_threads; // d_inner chunks per thread - scctx.nrows_per_thread += (scctx.nrows_per_thread & 1); // round up to even + scctx.nrows_per_thread = (d_inner + n_threads - 1) / n_threads; + scctx.nrows_per_thread += (scctx.nrows_per_thread & 1); - octx->src0_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb01, 256); - octx->src1_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * nb11, 256); - octx->dst_spad.size_per_thread = hex_round_up(scctx.nrows_per_thread * sizeof(float), 256); + const uint32_t d_inner_per_thread = scctx.nrows_per_thread; + const uint32_t ncs = src0->ne[0]; - octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads; - octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads; - octx->dst_spad.size = octx->dst_spad.size_per_thread * n_threads; + const uint32_t src1_T_size = hex_round_up(d_conv * d_inner_per_thread * sizeof(float), 256); + const uint32_t src0_T_max = HTP_SSM_CONV_VTCM_BUDGET > src1_T_size ? HTP_SSM_CONV_VTCM_BUDGET - src1_T_size : 0; - // Compute gather scratchpad size for src0 and src1 - const size_t gather_spad_size = n_threads * VLEN * 2; + uint32_t d_inner_tile = (src0_T_max / sizeof(float)) / ncs; + d_inner_tile -= (d_inner_tile % VLEN_FP32); + if (d_inner_tile == 0) { + FARF(HIGH, "ssm_conv-f32: inner tile rounds to 0 (ncs=%u), falling back to scalar\n", ncs); + use_hvx = 0; + } else { + scctx.d_inner_tile = d_inner_tile; - octx->src0_spad.data = octx->ctx->vtcm_base + gather_spad_size; octx->src0_spad.src = NULL; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; octx->src1_spad.src = NULL; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; octx->dst_spad.src = NULL; + octx->src0_spad.size_per_thread = hex_round_up(d_inner_tile * ncs * sizeof(float), 256); + octx->src1_spad.size_per_thread = src1_T_size; + octx->dst_spad.size_per_thread = 0; - FARF(HIGH, "ssm_conv-f32: gather-spad:%zu spad-per-thread:(%u:%u:%u) spad-sizes:(%u:%u:%u) spad-data:(%p:%p:%p)\n", - gather_spad_size, octx->src0_spad.size_per_thread, octx->src1_spad.size_per_thread, - octx->dst_spad.size_per_thread, octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size, - octx->src0_spad.data, octx->src1_spad.data, octx->dst_spad.data); + octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads; + octx->src1_spad.size = octx->src1_spad.size_per_thread * n_threads; + octx->dst_spad.size = 0; - const size_t total_spad_size = - gather_spad_size + octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->src0_spad.src = NULL; + octx->src1_spad.src = NULL; - if (total_spad_size > octx->ctx->vtcm_size) { - FARF(HIGH, "ssm_conv-f32: HVX scratchpad size %zu exceeds VTCM size %zu", total_spad_size, - octx->ctx->vtcm_size); + const size_t total_spad = octx->src0_spad.size + octx->src1_spad.size; + if (total_spad > octx->ctx->vtcm_size) { + FARF(HIGH, "ssm_conv-f32: scratchpad %zu exceeds VTCM %zu, falling back to scalar\n", + total_spad, octx->ctx->vtcm_size); use_hvx = 0; } } diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index d4ae89ee6f0..770a6673211 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -17,28 +17,32 @@ #include "ggml-common.h" #include "htp-ctx.h" #include "htp-ops.h" -#include "htp-ops.h" struct htp_unary_context { struct htp_ops_context * octx; // Precomputed values const uint8_t * data_src0; + const uint8_t * data_src1; // weight/scale tensor for RMS_NORM_MUL uint8_t * data_dst; size_t src0_data_row_size; // actual data bytes per row + size_t src1_data_row_size; size_t dst_data_row_size; // actual data bytes per row size_t src0_row_size_aligned; + size_t src1_row_size_aligned; size_t dst_row_size_aligned; size_t src0_spad_half_size; + size_t src1_spad_half_size; size_t dst_spad_half_size; uint32_t block; uint32_t src0_nrows; uint32_t src0_nrows_per_thread; uint32_t nc; + bool broadcast_weight; }; // Convert flat row index to DDR byte offset using the tensor's actual strides. @@ -159,6 +163,144 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, } } +static void hvx_fast_rms_norm_mul_f32(const uint8_t * restrict src, + const uint8_t * restrict weight, + uint8_t * restrict dst, + const int num_elems, + float epsilon) { + const HVX_Vector * restrict v_src = (const HVX_Vector *) src; + const HVX_Vector * restrict v_weight = (const HVX_Vector *) weight; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + + const int nvec = num_elems / VLEN_FP32; // number of full vectors + const int nloe = num_elems % VLEN_FP32; // leftover elements + + // Compute sum of squares for full vectors + HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); + } + + // Reduce HVX sum + sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); + + HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); + HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v); + HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v); + + // Scale and multiply + HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v)); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); + HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2); + HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[i]); + v_dst[i] = Q6_Vsf_equals_Vqf32(result); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v); + HVX_Vector v3 = Q6_Vsf_equals_Vqf32(v2); + HVX_Vector result = Q6_Vqf32_vmpy_VsfVsf(v3, v_weight[nvec]); + HVX_Vector res_v = Q6_Vsf_equals_Vqf32(result); + + // Store with masking to avoid overwriting memory beyond the tensor + hvx_vec_store_a(&v_dst[nvec], nloe * 4, res_v); + } +} + +static void hvx_fast_norm_f32(const uint8_t * restrict src, + uint8_t * restrict dst, + uint8_t * restrict pad, + const int num_elems, + float epsilon) { + (void)pad; + + const HVX_Vector * restrict v_src = (HVX_Vector *) src; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + + const int nvec = num_elems / VLEN_FP32; // number of full vectors + const int nloe = num_elems % VLEN_FP32; // leftover elements + + // Compute sum of squares and sum of values for full vectors + HVX_Vector sum_sq_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector sum_x_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2); + sum_x_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v, Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero())); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2); + sum_x_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v, Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero())); + } + + // Reduce HVX sums + sum_sq_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_sq_v)); + sum_x_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_x_v)); + + HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); + HVX_Vector mean_sq_v = Q6_Vqf32_vmpy_VsfVsf(sum_sq_v, denom_v); + HVX_Vector mean_x_v = Q6_Vqf32_vmpy_VsfVsf(sum_x_v, denom_v); + HVX_Vector mean_x_sq_v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(mean_x_v), Q6_Vsf_equals_Vqf32(mean_x_v)); + HVX_Vector var_v = Q6_Vqf32_vsub_Vqf32Vqf32(mean_sq_v, mean_x_sq_v); + HVX_Vector var_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(var_v, epsilon_v); + + // scale = rsqrt(variance + epsilon), mean_x broadcast for subtraction + HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(var_epsilon_v)); + HVX_Vector mean_x_b = hvx_vec_repl_f32(Q6_Vsf_equals_Vqf32(mean_x_v)); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b); + HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v); + v_dst[i] = Q6_Vsf_equals_Vqf32(v3); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b); + HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v); + HVX_Vector result = Q6_Vsf_equals_Vqf32(v3); + + // Store with masking to avoid overwriting memory beyond the tensor + hvx_vec_store_a(&v_dst[nvec], nloe * 4, result); + } +} + static void scale_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -197,6 +339,45 @@ static void rms_norm_f32(const float * restrict src, } } +static void rms_norm_mul_f32(const float * restrict src, + const float * restrict weight, + float * restrict dst, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + const size_t weight_row_size, + int32_t * op_params, + bool broadcast_weight) { + float epsilon = 0.f; + memcpy(&epsilon, op_params, sizeof(float)); + + for (uint32_t ir = 0; ir < num_rows; ir++) { + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + const uint8_t * restrict w_local = (const uint8_t *)weight + (broadcast_weight ? 0 : ir * weight_row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); + + hvx_fast_rms_norm_mul_f32(src_local, w_local, dst_local, row_elems, epsilon); + } +} + +static void norm_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { + float epsilon = 0.f; + memcpy(&epsilon, op_params, sizeof(float)); + + for (uint32_t ir = 0; ir < num_rows; ir++) { + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); + + hvx_fast_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); + } +} + static void sqr_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -277,6 +458,95 @@ static void sigmoid_f32(const float * restrict src, } } +static void tri_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params, + const uint32_t ir, + const struct htp_unary_context * uctx) { + + const int32_t ttype = op_params[0]; + const HVX_Vector zero = hvx_vec_splat_f32(0.0f); + const uint32_t nvec = row_elems / VLEN_FP32; + const uint32_t nloe = row_elems % VLEN_FP32; + + const uint32_t ne01 = uctx->octx->src[0]->ne[1]; + + for (uint32_t b = 0; b < num_rows; b++) { + const uint32_t abs_row = ir + b; + const uint32_t i01 = abs_row % ne01; + + const HVX_Vector * restrict v_src = (const HVX_Vector *) ((const uint8_t *) src + b * row_size); + HVX_Vector * restrict v_dst = (HVX_Vector *) ((uint8_t *) dst + b * row_size); + + uint32_t boundary; + int keep_left; + switch (ttype) { + case 0: boundary = i01; keep_left = 0; break; // keep col >= row + case 1: boundary = i01 + 1; keep_left = 0; break; // keep col > row + case 2: boundary = i01 + 1; keep_left = 1; break; // keep col <= row + case 3: boundary = i01; keep_left = 1; break; // keep col < row + default: boundary = 0; keep_left = 0; break; + } + if (boundary > row_elems) boundary = row_elems; + + // Full HVX vectors โ€” each starts at a 128-byte aligned offset + for (uint32_t i = 0; i < nvec; i++) { + const uint32_t vec_start = i * VLEN_FP32; + const uint32_t vec_end = vec_start + VLEN_FP32; + if (keep_left) { + if (vec_end <= boundary) { + v_dst[i] = v_src[i]; + } else if (vec_start >= boundary) { + v_dst[i] = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, v_src[i], zero); + } + } else { + if (vec_end <= boundary) { + v_dst[i] = zero; + } else if (vec_start >= boundary) { + v_dst[i] = v_src[i]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, zero, v_src[i]); + } + } + } + + // Tail elements (row_elems not a multiple of VLEN_FP32) + if (nloe > 0) { + const uint32_t vec_start = nvec * VLEN_FP32; + const uint32_t vec_end = vec_start + nloe; + HVX_Vector tail_val; + if (keep_left) { + if (vec_end <= boundary) { + tail_val = v_src[nvec]; + } else if (vec_start >= boundary) { + tail_val = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, v_src[nvec], zero); + } + } else { + if (vec_end <= boundary) { + tail_val = zero; + } else if (vec_start >= boundary) { + tail_val = v_src[nvec]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, zero, v_src[nvec]); + } + } + hvx_vec_store_a(&v_dst[nvec], nloe * sizeof(float), tail_val); + } + } +} + static void softplus_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -419,12 +689,15 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * t1 = HAP_perf_get_qtimer_count(); const uint8_t * restrict data_src = uctx->data_src0; + const uint8_t * restrict data_src1 = uctx->data_src1; uint8_t * restrict data_dst = uctx->data_dst; uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + uint8_t * src1_spad_data = octx->src1_spad.data + (ith * octx->src1_spad.size_per_thread); uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); size_t src0_spad_half_size = uctx->src0_spad_half_size; + size_t src1_spad_half_size = uctx->src1_spad_half_size; size_t dst_spad_half_size = uctx->dst_spad_half_size; // Non-contiguous tensors have gaps at dim-2/3 boundaries that a single-stride @@ -445,6 +718,12 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * dma_queue * dma_queue = octx->ctx->dma[ith]; + // If weight is broadcasted, load it once per thread at the beginning of execution + if (htp_op == HTP_OP_RMS_NORM_MUL && uctx->broadcast_weight) { + dma_queue_push(dma_queue, dma_make_ptr(src1_spad_data, data_src1), uctx->src1_row_size_aligned, 0, uctx->src1_data_row_size, 1); + dma_queue_flush(dma_queue); + } + for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; spad_idx++) { const uint32_t block_size = unary_block_size(ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); @@ -457,6 +736,14 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * dma_queue_push(dma_queue, dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + src0_off), src0_row_size_aligned, nb01, src0_data_row_size, block_size); + + if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) { + const size_t src1_off = unary_row_offset(ir, ne01, ne02, nb01, nb02, nb03); + dma_queue_push(dma_queue, + dma_make_ptr(src1_spad_data + (spad_idx * src1_spad_half_size), data_src1 + src1_off), + uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, block_size); + } + ir += block_size; } @@ -465,12 +752,25 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * float * dst_spad = (float *) dma_queue_pop(dma_queue).src; float * src0_spad = (float *) dma_queue_pop(dma_queue).dst; + float * src1_spad = NULL; + if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) { + src1_spad = (float *) dma_queue_pop(dma_queue).dst; + } // Process block in VTCM switch (htp_op) { + case HTP_OP_NORM: + norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; case HTP_OP_RMS_NORM: rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; + case HTP_OP_RMS_NORM_MUL: + { + const float * w_ptr = uctx->broadcast_weight ? (const float *) src1_spad_data : src1_spad; + rms_norm_mul_f32(src0_spad, w_ptr, dst_spad, block_size, ne0, src0_row_size_aligned, uctx->src1_row_size_aligned, op_params, uctx->broadcast_weight); + } + break; case HTP_OP_SCALE: scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; @@ -498,6 +798,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * case HTP_OP_L2_NORM: l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; + case HTP_OP_TRI: + tri_f32(src0_spad, dst_spad, NULL, block_size, ne00, src0_row_size_aligned, op_params, ir, uctx); + break; default: break; } @@ -515,9 +818,16 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * if (pref_ir < src0_end_row) { const uint32_t pref_block_size = unary_block_size(pref_ir, src0_end_row, BLOCK, src0_contig, dst_contig, ne01, ne1); const size_t src0_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03); - dma_queue_push(dma_queue, - dma_make_ptr(src0_spad, data_src + src0_pref_off), - src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size); + dma_queue_push(dma_queue, + dma_make_ptr(src0_spad, data_src + src0_pref_off), + src0_row_size_aligned, nb01, src0_data_row_size, pref_block_size); + + if (htp_op == HTP_OP_RMS_NORM_MUL && !uctx->broadcast_weight) { + const size_t src1_pref_off = unary_row_offset(pref_ir, ne01, ne02, nb01, nb02, nb03); + dma_queue_push(dma_queue, + dma_make_ptr(src1_spad, data_src1 + src1_pref_off), + uctx->src1_row_size_aligned, nb01, uctx->src1_data_row_size, pref_block_size); + } } } ir += block_size; @@ -541,9 +851,15 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const char * op_type = NULL; switch (octx->op) { + case HTP_OP_NORM: + op_type = "norm-f32"; + break; case HTP_OP_RMS_NORM: op_type = "rmsnorm-f32"; break; + case HTP_OP_RMS_NORM_MUL: + op_type = "rmsnorm-mul-f32"; + break; case HTP_OP_SCALE: op_type = "scale-f32"; break; @@ -571,6 +887,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { case HTP_OP_L2_NORM: op_type = "l2norm-f32"; break; + case HTP_OP_TRI: + op_type = "tri-f32"; + break; + default: FARF(ERROR, "Unsupported unary Op %u\n", octx->op); return HTP_STATUS_NO_SUPPORT; @@ -585,12 +905,44 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const size_t src0_row_size_aligned = hex_round_up(src0_data_row_size, VLEN); const size_t dst_row_size_aligned = hex_round_up(dst_data_row_size, VLEN); + size_t src1_data_row_size = 0; + size_t src1_row_size_aligned = 0; + bool broadcast_weight = false; + const struct htp_tensor * src1 = NULL; + + if (octx->op == HTP_OP_RMS_NORM_MUL) { + src1 = octx->src[1]; + src1_data_row_size = src1->ne[0] * sizeof(float); + src1_row_size_aligned = hex_round_up(src1_data_row_size, VLEN); + broadcast_weight = (src1->ne[1] * src1->ne[2] * src1->ne[3] == 1); + } + // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size // Double buffering requires 2x size per buffer - size_t spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned); - size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row); + size_t spad_size_per_row = 0; + size_t vtcm_row_per_thread = 0; + + if (octx->op == HTP_OP_RMS_NORM_MUL) { + if (broadcast_weight) { + size_t available_vtcm = octx->ctx->vtcm_size; + size_t src1_spad_total = n_threads * src1_row_size_aligned; + if (available_vtcm > src1_spad_total) { + available_vtcm -= src1_spad_total; + } else { + available_vtcm = 0; + } + spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned); + vtcm_row_per_thread = available_vtcm / (n_threads * spad_size_per_row); + } else { + spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned + src1_row_size_aligned); + vtcm_row_per_thread = (octx->ctx->vtcm_size) / (n_threads * spad_size_per_row); + } + } else { + spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned); + vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row); + } // Make sure the reserved vtcm size is sufficient if (vtcm_row_per_thread == 0) { @@ -605,8 +957,25 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + if (octx->op == HTP_OP_RMS_NORM_MUL) { + if (broadcast_weight) { + octx->src1_spad.size_per_thread = src1_row_size_aligned; + } else { + octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread * 2; + } + octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread; + } else { + octx->src1_spad.size = 0; + octx->src1_spad.size_per_thread = 0; + } + octx->src0_spad.data = octx->ctx->vtcm_base; - octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + if (octx->op == HTP_OP_RMS_NORM_MUL) { + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + } else { + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + } FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], @@ -619,19 +988,24 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { .src0_nrows = src0_nrows, .data_src0 = (const uint8_t *)src0->data, + .data_src1 = (octx->op == HTP_OP_RMS_NORM_MUL) ? (const uint8_t *)src1->data : NULL, .data_dst = (uint8_t *)dst->data, .src0_data_row_size = src0_data_row_size, + .src1_data_row_size = src1_data_row_size, .dst_data_row_size = dst_data_row_size, .src0_row_size_aligned = src0_row_size_aligned, + .src1_row_size_aligned = src1_row_size_aligned, .dst_row_size_aligned = dst_row_size_aligned, .src0_spad_half_size = octx->src0_spad.size_per_thread / 2, + .src1_spad_half_size = (octx->op == HTP_OP_RMS_NORM_MUL) ? (octx->src1_spad.size_per_thread / (broadcast_weight ? 1 : 2)) : 0, .dst_spad_half_size = octx->dst_spad.size_per_thread / 2, .block = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned, .nc = src0->ne[0], + .broadcast_weight = broadcast_weight, }; worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_threads); @@ -640,6 +1014,22 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { return err; } +int op_tri(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src[0]->type) { + case HTP_TYPE_F32: + err = execute_op_unary_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} + int op_unary(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; diff --git a/ggml/src/ggml-hexagon/op-desc.h b/ggml/src/ggml-hexagon/op-desc.h deleted file mode 100644 index a1e8ddd8b97..00000000000 --- a/ggml/src/ggml-hexagon/op-desc.h +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef OP_DESC_H -#define OP_DESC_H - -#define GGML_COMMON_IMPL_CPP -#include "ggml-backend-impl.h" -#include "ggml-common.h" - -#include -#include - -struct op_desc { - char strides[64 * GGML_MAX_SRC]; - char dims[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - int format_tensor_dims(char * str, const struct ggml_tensor * t) { - if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); - } else { - return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); - } - } - - void format_op_dims(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += format_tensor_dims(p, t->src[0]); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += format_tensor_dims(p, t->src[i]); - } - - p += sprintf(p, " -> "); - } - - // format self dims separately for better visual alignment - char self[64]; - format_tensor_dims(self, t); - - p += sprintf(p, "%s", self); - } - - int format_tensor_strides(char * str, const struct ggml_tensor * t) { - const char * c = ggml_is_contiguous(t) ? "" : "!"; - - if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); - } else { - return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c); - } - } - - void format_op_strides(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += format_tensor_strides(p, t->src[0]); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += format_tensor_strides(p, t->src[i]); - } - - p += sprintf(p, " -> "); - } - - // format self dims separately for better visual alignment - char self[64]; - format_tensor_strides(self, t); - - p += sprintf(p, "%s", self); - } - - void format_op_types(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", ggml_type_name(t->src[0]->type)); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", ggml_type_name(t->src[i]->type)); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", ggml_type_name(t->type)); - } - - const char * tensor_buff_name(const struct ggml_tensor * t) { - if (t->buffer) { - return ggml_backend_buffer_name(t->buffer); - } - return "NONE"; - } - - void format_op_buffs(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", tensor_buff_name(t->src[0])); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", tensor_buff_name(t->src[i])); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", tensor_buff_name(t)); - } - - void format_op_names(char * str, const struct ggml_tensor * t) { - char * p = str; - - // append src0 and src1 (if any) - if (t->src[0]) { - p += sprintf(p, "%s", t->src[0]->name); - - for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", t->src[i]->name); - } - - p += sprintf(p, " -> "); - } - - p += sprintf(p, "%s", t->name); - } - - void format(const ggml_tensor * op) { - format_op_dims(dims, op); - format_op_strides(strides, op); - format_op_types(types, op); - format_op_buffs(buffs, op); - format_op_names(names, op); - } - - op_desc() {} - op_desc(const ggml_tensor * op) { format(op); } -}; - -#endif // OP_DESC_H diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index f0147af84c1..5d4b10d34b9 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -590,6 +590,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net( const int ne20 = op->src[2]->ne[0]; // S_v const int ne21 = op->src[2]->ne[1]; // H const int ne30 = op->src[3]->ne[0]; // G + // state is src[5], 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count. + const int K = op->src[5]->ne[1]; const int nsg = op->src[2]->ne[0]/32; @@ -598,7 +600,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net( GGML_ASSERT(ne20 % 32 == 0); snprintf(base, 256, "kernel_gated_delta_net_%s_%d", ggml_type_name(op->src[0]->type), nsg); - snprintf(name, 256, "%s_ne20=%d_ne30=%d", base, ne20, ne30); + snprintf(name, 256, "%s_ne20=%d_ne30=%d_K=%d", base, ne20, ne30, K); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); if (!res.pipeline) { @@ -606,6 +608,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net( ggml_metal_cv_set_int16(cv, ne20, FC_GATED_DELTA_NET + 0); ggml_metal_cv_set_int16(cv, ne30, FC_GATED_DELTA_NET + 1); + ggml_metal_cv_set_int16(cv, K, FC_GATED_DELTA_NET + 2); res = ggml_metal_library_compile_pipeline(lib, base, name, cv); @@ -1729,6 +1732,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_IM2COL); + GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne); + GGML_ASSERT(ggml_is_contiguous(op->src[1])); GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); GGML_ASSERT(op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32); @@ -1736,7 +1741,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col(ggml_meta char base[256]; char name[256]; - snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type)); + if (ne00*ne01 <= 1024) { + snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type)); + } else { + snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type)); + } snprintf(name, 256, "%s", base); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); @@ -1894,7 +1903,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l char base[256]; char name[256]; - snprintf(base, 256, "kernel_pad_%s", ggml_type_name(op->src[0]->type)); + // note: this is slower + //const bool is_c4 = op->src[0]->ne[0] % 4 == 0 && op->ne[0] % 4 == 0; + const bool is_c4 = false; + + snprintf(base, 256, "kernel_pad_%s%s", ggml_type_name(op->src[0]->type), is_c4 ? "_4" : ""); snprintf(name, 256, "%s", base); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); @@ -1904,6 +1917,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + res.c4 = is_c4; + return res; } diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 1f212a92f98..4a3ebb5569d 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -215,6 +215,30 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets); // device // +enum ggml_metal_device_id { + GGML_METAL_DEVICE_GENERIC = 0, + + GGML_METAL_DEVICE_M1, + GGML_METAL_DEVICE_M1_PRO, + GGML_METAL_DEVICE_M1_MAX, + GGML_METAL_DEVICE_M1_ULTRA, + GGML_METAL_DEVICE_M2, + GGML_METAL_DEVICE_M2_PRO, + GGML_METAL_DEVICE_M2_MAX, + GGML_METAL_DEVICE_M2_ULTRA, + GGML_METAL_DEVICE_M3, + GGML_METAL_DEVICE_M3_PRO, + GGML_METAL_DEVICE_M3_MAX, + GGML_METAL_DEVICE_M3_ULTRA, + GGML_METAL_DEVICE_M4, + GGML_METAL_DEVICE_M4_PRO, + GGML_METAL_DEVICE_M4_MAX, + GGML_METAL_DEVICE_M5, + GGML_METAL_DEVICE_M5_PRO, + GGML_METAL_DEVICE_M5_MAX, + GGML_METAL_DEVICE_M5_ULTRA, +}; + struct ggml_metal_device_props { int device; char name[128]; @@ -234,6 +258,8 @@ struct ggml_metal_device_props { bool supports_gpu_family_apple7; + enum ggml_metal_device_id device_id; + int op_offload_min_batch_size; }; diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 780dfe81bb3..196af102643 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -628,6 +628,50 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) { free(rsets); } +static enum ggml_metal_device_id ggml_metal_device_id_parse(const char * name) { + if (!name) { + return GGML_METAL_DEVICE_GENERIC; + } + + static const char prefix[] = "Apple "; + if (strncmp(name, prefix, sizeof(prefix) - 1) != 0) { + return GGML_METAL_DEVICE_GENERIC; + } + const char * suffix = name + sizeof(prefix) - 1; + + static const struct { + const char * name; + enum ggml_metal_device_id id; + } table[] = { + {"M1", GGML_METAL_DEVICE_M1}, + {"M1 Pro", GGML_METAL_DEVICE_M1_PRO}, + {"M1 Max", GGML_METAL_DEVICE_M1_MAX}, + {"M1 Ultra", GGML_METAL_DEVICE_M1_ULTRA}, + {"M2", GGML_METAL_DEVICE_M2}, + {"M2 Pro", GGML_METAL_DEVICE_M2_PRO}, + {"M2 Max", GGML_METAL_DEVICE_M2_MAX}, + {"M2 Ultra", GGML_METAL_DEVICE_M2_ULTRA}, + {"M3", GGML_METAL_DEVICE_M3}, + {"M3 Pro", GGML_METAL_DEVICE_M3_PRO}, + {"M3 Max", GGML_METAL_DEVICE_M3_MAX}, + {"M3 Ultra", GGML_METAL_DEVICE_M3_ULTRA}, + {"M4", GGML_METAL_DEVICE_M4}, + {"M4 Pro", GGML_METAL_DEVICE_M4_PRO}, + {"M4 Max", GGML_METAL_DEVICE_M4_MAX}, + {"M5", GGML_METAL_DEVICE_M5}, + {"M5 Pro", GGML_METAL_DEVICE_M5_PRO}, + {"M5 Max", GGML_METAL_DEVICE_M5_MAX}, + {"M5 Ultra", GGML_METAL_DEVICE_M5_ULTRA}, + }; + + for (size_t i = 0; i < sizeof(table)/sizeof(table[0]); ++i) { + if (strcmp(suffix, table[i].name) == 0) { + return table[i].id; + } + } + return GGML_METAL_DEVICE_GENERIC; +} + ggml_metal_device_t ggml_metal_device_init(int device) { ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device)); @@ -795,6 +839,8 @@ ggml_metal_device_t ggml_metal_device_init(int device) { dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7]; + dev->props.device_id = ggml_metal_device_id_parse([[dev->mtl_device name] UTF8String]); + dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32; dev->props.max_buffer_size = dev->mtl_device.maxBufferLength; @@ -1061,7 +1107,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_GLU_OP_SWIGLU_OAI: case GGML_GLU_OP_GEGLU_ERF: case GGML_GLU_OP_GEGLU_QUICK: - return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + return ggml_is_contiguous_1(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16); default: return false; } diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index a114391c2e8..e2ce56e9e28 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); - const int nth = std::min(1024, ne0); + int nth = std::min(256, ne0); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + // when rows are small, we can batch them together in a single threadgroup + int nrptg = 1; + if (nth < 256) { + nrptg = std::min((256 + nth - 1) / nth, ne1); + if (nrptg * nth > 256) { + nrptg = 256 / nth; + } + } + + const int nw0 = (ne1 + nrptg - 1) / nrptg; + + ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1); return 1; } @@ -816,9 +827,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1); } else { const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); - const int nth = MIN(args.ne00, nth_max); - const int nk0 = (args.ne00 + nth - 1)/nth; ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne01, ne02, ne03, nth, 1, 1); @@ -1788,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nk0 = ne10/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne11, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1799,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } @@ -1863,7 +1872,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nk0 = ne00/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne01, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1874,7 +1883,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } @@ -3626,16 +3635,26 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) { auto pipeline = ggml_metal_library_get_pipeline_im2col(lib, op); - GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + if (KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N); - const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N); + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); - ggml_metal_encoder_set_pipeline(enc, pipeline); - ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); - ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1); - ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); + ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW); + } else { + const uint64_t n_threads = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), N); + const int64_t quotient = N / n_threads + (N % n_threads > 0 ? 1 : 0); - ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW); + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); + + ggml_metal_encoder_dispatch_threadgroups(enc, quotient * CHW, OH, OW, n_threads, 1, 1); + } return 1; } @@ -4039,14 +4058,21 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) { auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op); - const int nth = std::min(1024, ne0); + if (pipeline.c4) { + args.ne00 = ne00/4; + args.ne0 = ne0/4; + } + + const int nth_max = MIN(64, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + const int nth = MIN(args.ne0, nth_max); + const int nk0 = (args.ne0 + 1024 - 1)/1024; // note: 1024 is hardcoded in the kernel! ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1); return 1; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 3882b955847..2bd310d9450 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1421,7 +1421,8 @@ template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; -kernel void kernel_reglu_f32( +template +kernel void kernel_reglu( constant ggml_metal_kargs_glu & args, device const char * src0, device const char * src1, @@ -1429,19 +1430,25 @@ kernel void kernel_reglu_f32( uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { const float x0 = src0_row[i0]; const float x1 = src1_row[i0]; - dst_row[i0] = x0*x1*(x0 > 0.0f); + dst_row[i0] = (T)(x0*x1*(x0 > 0.0f)); } } -kernel void kernel_geglu_f32( +typedef decltype(kernel_reglu) kernel_reglu_t; + +template [[host_name("kernel_reglu_f32")]] kernel kernel_reglu_t kernel_reglu; +template [[host_name("kernel_reglu_f16")]] kernel kernel_reglu_t kernel_reglu; + +template +kernel void kernel_geglu( constant ggml_metal_kargs_glu & args, device const char * src0, device const char * src1, @@ -1449,9 +1456,9 @@ kernel void kernel_geglu_f32( uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { const float x0 = src0_row[i0]; @@ -1459,11 +1466,17 @@ kernel void kernel_geglu_f32( const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); - dst_row[i0] = gelu*x1; + dst_row[i0] = (T)(gelu*x1); } } -kernel void kernel_swiglu_f32( +typedef decltype(kernel_geglu) kernel_geglu_t; + +template [[host_name("kernel_geglu_f32")]] kernel kernel_geglu_t kernel_geglu; +template [[host_name("kernel_geglu_f16")]] kernel kernel_geglu_t kernel_geglu; + +template +kernel void kernel_swiglu( constant ggml_metal_kargs_glu & args, device const char * src0, device const char * src1, @@ -1471,9 +1484,9 @@ kernel void kernel_swiglu_f32( uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { const float x0 = src0_row[i0]; @@ -1481,11 +1494,17 @@ kernel void kernel_swiglu_f32( const float silu = x0 / (1.0f + exp(-x0)); - dst_row[i0] = silu*x1; + dst_row[i0] = (T)(silu*x1); } } -kernel void kernel_swiglu_oai_f32( +typedef decltype(kernel_swiglu) kernel_swiglu_t; + +template [[host_name("kernel_swiglu_f32")]] kernel kernel_swiglu_t kernel_swiglu; +template [[host_name("kernel_swiglu_f16")]] kernel kernel_swiglu_t kernel_swiglu; + +template +kernel void kernel_swiglu_oai( constant ggml_metal_kargs_glu & args, device const char * src0, device const char * src1, @@ -1493,9 +1512,9 @@ kernel void kernel_swiglu_oai_f32( uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { float x0 = src0_row[i0]; @@ -1507,11 +1526,17 @@ kernel void kernel_swiglu_oai_f32( float out_glu = x0 / (1.0f + exp(-x0 * args.alpha)); out_glu = out_glu * (1.0f + x1); - dst_row[i0] = out_glu; + dst_row[i0] = (T)out_glu; } } -kernel void kernel_geglu_erf_f32( +typedef decltype(kernel_swiglu_oai) kernel_swiglu_oai_t; + +template [[host_name("kernel_swiglu_oai_f32")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai; +template [[host_name("kernel_swiglu_oai_f16")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai; + +template +kernel void kernel_geglu_erf( constant ggml_metal_kargs_glu & args, device const char * src0, device const char * src1, @@ -1519,9 +1544,9 @@ kernel void kernel_geglu_erf_f32( uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { const float x0 = src0_row[i0]; @@ -1529,11 +1554,17 @@ kernel void kernel_geglu_erf_f32( const float gelu_erf = 0.5f*x0*(1.0f+erf_approx(x0*SQRT_2_INV)); - dst_row[i0] = gelu_erf*x1; + dst_row[i0] = (T)(gelu_erf*x1); } } -kernel void kernel_geglu_quick_f32( +typedef decltype(kernel_geglu_erf) kernel_geglu_erf_t; + +template [[host_name("kernel_geglu_erf_f32")]] kernel kernel_geglu_erf_t kernel_geglu_erf; +template [[host_name("kernel_geglu_erf_f16")]] kernel kernel_geglu_erf_t kernel_geglu_erf; + +template +kernel void kernel_geglu_quick( constant ggml_metal_kargs_glu & args, device const char * src0, device const char * src1, @@ -1541,9 +1572,9 @@ kernel void kernel_geglu_quick_f32( uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; - device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; - device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1); for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { const float x0 = src0_row[i0]; @@ -1551,10 +1582,15 @@ kernel void kernel_geglu_quick_f32( const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0))); - dst_row[i0] = gelu_quick*x1; + dst_row[i0] = (T)(gelu_quick*x1); } } +typedef decltype(kernel_geglu_quick) kernel_geglu_quick_t; + +template [[host_name("kernel_geglu_quick_f32")]] kernel kernel_geglu_quick_t kernel_geglu_quick; +template [[host_name("kernel_geglu_quick_f16")]] kernel kernel_geglu_quick_t kernel_geglu_quick; + kernel void kernel_op_sum_f32( constant ggml_metal_kargs_sum & args, device const float * src0, @@ -2531,6 +2567,7 @@ kernel void kernel_rwkv_wkv7_f32( constant short FC_gated_delta_net_ne20 [[function_constant(FC_GATED_DELTA_NET + 0)]]; constant short FC_gated_delta_net_ne30 [[function_constant(FC_GATED_DELTA_NET + 1)]]; +constant short FC_gated_delta_net_K [[function_constant(FC_GATED_DELTA_NET + 2)]]; #if 1 template @@ -2548,21 +2585,24 @@ kernel void kernel_gated_delta_net_impl( uint3 ntg[[threads_per_threadgroup]]) { #define S_v FC_gated_delta_net_ne20 #define G FC_gated_delta_net_ne30 +#define K FC_gated_delta_net_K const uint tx = tpitg.x; const uint ty = tpitg.y; - const uint i23 = tgpig.z; // B - const uint i21 = tgpig.y; // H - const uint i20 = tgpig.x*NSG + ty; + const uint i23 = tgpig.z; // B (n_seqs) + const uint i21 = tgpig.y; // H (head) + const uint i20 = tgpig.x*NSG + ty; // row within S_v const uint i01 = i21 % args.ne01; const uint i11 = i21 % args.ne11; const float scale = 1.0f / sqrt((float)S_v); + // input state layout (D, K, n_seqs): per-seq stride is K*H*D; we read slot 0. // state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous - device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v; + const uint state_in_base = (i23*K*args.ne21 + i21)*S_v*S_v + i20*S_v; + device const float * s_ptr = (device const float *) (s) + state_in_base; float ls[NSG]; @@ -2580,6 +2620,17 @@ kernel void kernel_gated_delta_net_impl( device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21); device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G; + // snapshot slot mapping: target_slot = t - shift. When n_tokens < K, only the last + // n_tokens slots are written; earlier slots are left untouched (caller-owned). + const int shift = (int)args.ne22 - (int)K; + + // output state base offset: after attention scores + const uint attn_size = args.ne22 * args.ne21 * S_v * args.ne23; + // output state per-slot size: S_v * S_v * H * n_seqs + const uint state_size_per_snap = S_v * S_v * args.ne21 * args.ne23; + // per-(seq,head) offset within a slot + const uint state_out_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v; + for (short t = 0; t < args.ne22; t++) { float s_k = 0.0f; @@ -2627,17 +2678,30 @@ kernel void kernel_gated_delta_net_impl( b_ptr += args.ne21; g_ptr += args.ne21*G; - } - device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v; + if (K > 1) { + const int target_slot = (int)t - shift; + if (target_slot >= 0 && target_slot < (int)K) { + device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base; + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + dst_state[is] = ls[j]; + } + } + } + } - FOR_UNROLL (short j = 0; j < NSG; j++) { - const short is = tx*NSG + j; - dst_state[is] = ls[j]; + if (K == 1) { + device float * dst_state = (device float *) (dst) + attn_size + state_out_base; + FOR_UNROLL (short j = 0; j < NSG; j++) { + const short is = tx*NSG + j; + dst_state[is] = ls[j]; + } } #undef S_v #undef G +#undef K } typedef decltype(kernel_gated_delta_net_impl<4>) kernel_gated_delta_net_t; @@ -4668,59 +4732,59 @@ kernel void kernel_im2col( template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col; template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col; -// TODO: obsolete -- remove -//typedef void (im2col_ext_t)( -// constant ggml_metal_kargs_im2col & args, -// device const float * x, -// device char * dst, -// uint3 tgpig[[threadgroup_position_in_grid]], -// uint3 tgpg[[threadgroups_per_grid]], -// uint3 tpitg[[thread_position_in_threadgroup]], -// uint3 ntg[[threads_per_threadgroup]]); -// -//template -//kernel void kernel_im2col_ext( -// constant ggml_metal_kargs_im2col & args, -// device const float * x, -// device char * dst, -// uint3 tgpig[[threadgroup_position_in_grid]], -// uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW -// uint3 tpitg[[thread_position_in_threadgroup]], -// uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1] -// const int64_t KHW = (int64_t)args.KHW; -// -// const int64_t d = tgpig[0] / args.CHW; -// const int64_t chw = tgpig[0] % args.CHW; -// const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1) -// const int64_t HW = tgpig[0] % KHW; -// -// const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0]; -// if (tpitg_0 >= args.N) { -// return; -// } -// -// const int64_t tpitg_1 = HW / args.KW; -// const int64_t tpitg_2 = HW % args.KW; -// -// const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0; -// const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1; -// -// const int64_t offset_dst = -// (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW + -// (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2); -// -// device T * pdst = (device T *) (dst); -// -// if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) { -// pdst[offset_dst] = 0.0f; -// } else { -// const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1; -// pdst[offset_dst] = x[offset_src + iih * args.IW + iiw]; -// } -//} -// -//template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext; -//template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext; +// TODO: optimize +typedef void (im2col_ext_t)( + constant ggml_metal_kargs_im2col & args, + device const float * x, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]); + +template +kernel void kernel_im2col_ext( + constant ggml_metal_kargs_im2col & args, + device const float * x, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1] + const int64_t KHW = (int64_t)args.KHW; + + const int64_t d = tgpig[0] / args.CHW; + const int64_t chw = tgpig[0] % args.CHW; + const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1) + const int64_t HW = tgpig[0] % KHW; + + const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0]; + if (tpitg_0 >= args.N) { + return; + } + + const int64_t tpitg_1 = HW / args.KW; + const int64_t tpitg_2 = HW % args.KW; + + const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0; + const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1; + + const int64_t offset_dst = + (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW + + (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2); + + device T * pdst = (device T *) (dst); + + if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) { + pdst[offset_dst] = 0.0f; + } else { + const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1; + pdst[offset_dst] = x[offset_src + iih * args.IW + iiw]; + } +} + +template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext; +template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext; template kernel void kernel_conv_2d( @@ -4853,15 +4917,32 @@ kernel void kernel_conv_transpose_1d( uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpg[[threadgroups_per_grid]]) { - float v = 0.0f; + // For output position j on the time axis, only input positions + // i such that i*s0 <= j < i*s0 + K + // contribute -- i.e. i in [ceil((j - K + 1)/s0), floor(j/s0)] + // intersected with [0, IL-1]. That's at most ceil(K/s0) values + // (typically 2 for stride==K/2 transposed convs). + const int32_t j = tgpig[0]; + const int32_t s0 = args.s0; + const int32_t K = args.K; + const int32_t IL = args.IL; + + int32_t i_min; + { + int32_t a = j - K + 1; + i_min = a <= 0 ? 0 : (a + s0 - 1) / s0; // ceil(a/s0) for a>0 + } + int32_t i_max = j / s0; + if (i_max > IL - 1) i_max = IL - 1; - for (int64_t c = 0; c < args.IC; c++) { - const int32_t kernel_offset = c * tgpg[1] * args.K + args.K * tgpig[1]; - const int32_t input_offset = c * args.IL; + float v = 0.0f; + if (i_min <= i_max) { + for (int64_t c = 0; c < args.IC; c++) { + const int32_t kernel_offset = c * tgpg[1] * K + K * tgpig[1]; + const int32_t input_offset = c * IL; - for (int64_t i = 0; i < args.IL; i++) { - if (tgpig[0] >= i * args.s0 && tgpig[0] < i * args.s0 + args.K) { - v += src0[kernel_offset + tgpig[0] - i * args.s0] * src1[input_offset + i]; + for (int32_t i = i_min; i <= i_max; i++) { + v += float(src0[kernel_offset + j - i * s0]) * src1[input_offset + i]; } } } @@ -5059,7 +5140,7 @@ kernel void kernel_upscale_bilinear_f32( for (int64_t sx = x_min; sx < x_max; ++sx) { const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0); const float w = wx * wy; - const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); + device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); sum += (*src_ptr) * w; wsum += w; } @@ -5241,7 +5322,7 @@ kernel void kernel_upscale_bicubic_f32( const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx)); const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3; - const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); + device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); sum += (*src_ptr) * wx * wy; } } @@ -5284,42 +5365,46 @@ kernel void kernel_roll_f32( } } -kernel void kernel_pad_f32( +template +kernel void kernel_pad_impl( constant ggml_metal_kargs_pad & args, device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { + const int32_t i3 = tgpig.z; + const int32_t i2 = tgpig.y; + const int32_t k0 = tgpig.x/args.ne1; + const int32_t i1 = tgpig.x - k0*args.ne1; - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; - - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; + const int32_t i03 = i3; + const int32_t i02 = i2; + const int32_t i01 = i1; - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); + device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - if (i0 < args.ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } else { - dst_ptr[i0] = 0.0f; - } + for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) { + const int32_t i0 = k0*1024 + tpitg.x + l0; + if (i0 >= args.ne0) { + break; } - return; - } - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - dst_ptr[i0] = 0.0f; + if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { + dst_ptr[i0] = src0_ptr[i0]; + } else { + dst_ptr[i0] = 0.0f; + } } } +typedef decltype(kernel_pad_impl) kernel_pad_t; + +template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl; +template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl; + +// TODO: this is slow - optimize kernel void kernel_pad_reflect_1d_f32( constant ggml_metal_kargs_pad_reflect_1d & args, device const char * src0, @@ -7283,23 +7368,27 @@ kernel void kernel_cpy_t_t( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.ne00; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) { device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); dst_data[i00] = (T1) src[0]; break; @@ -7331,23 +7420,27 @@ kernel void kernel_cpy_f32_q( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n / (args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; + const int32_t i3 = n / (args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00); quantize_func(src, dst_data[i00]); @@ -7372,24 +7465,28 @@ kernel void kernel_cpy_q_f32( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { T4x4 temp; dequantize_func(src_data + i00/nl, i00%nl, temp); dst_data[i00] = temp; @@ -7425,7 +7522,11 @@ kernel void kernel_concat( const int i3 = tgpig.z; const int i2 = tgpig.y; - const int i1 = tgpig.x; + const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y; + + if (i1 >= args.ne1) { + return; + } int o[4] = {0, 0, 0, 0}; o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03)); diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index c6aba608736..cd15d573238 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -87,6 +87,10 @@ set(GGML_OPENCL_KERNELS mul_mv_q4_1_f32_flat mul_mv_q4_k_f32 mul_mv_q4_k_f32_flat + mul_mv_q5_0_f32 + mul_mv_q5_0_f32_flat + mul_mv_q5_1_f32 + mul_mv_q5_1_f32_flat mul_mv_q5_k_f32 mul_mv_q5_k_f32_flat mul_mv_q6_k_f32 @@ -110,6 +114,12 @@ set(GGML_OPENCL_KERNELS gemv_moe_q5_0_f32_ns gemm_moe_q5_1_f32_ns gemv_moe_q5_1_f32_ns + gemm_moe_q4_k_f32_ns + gemv_moe_q4_k_f32_ns + gemm_moe_q5_k_f32_ns + gemv_moe_q5_k_f32_ns + gemm_moe_q6_k_f32_ns + gemv_moe_q6_k_f32_ns gemm_moe_mxfp4_f32 gemv_moe_mxfp4_f32 gemm_moe_mxfp4_f32_ns @@ -120,6 +130,8 @@ set(GGML_OPENCL_KERNELS mul_mm_f16_f32_l4_lm mul_mm_q4_0_f32_l4_lm mul_mm_q4_1_f32_l4_lm + mul_mm_q5_0_f32_l4_lm + mul_mm_q5_1_f32_l4_lm mul_mm_q8_0_f32_l4_lm mul_mm_iq4_nl_f32_l4_lm mul_mm_q4_k_f32_l4_lm @@ -158,6 +170,7 @@ set(GGML_OPENCL_KERNELS sqr sqrt ssm_conv + gated_delta_net sub sum_rows cumsum diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0e511592d53..b67ea46bce8 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -375,6 +375,13 @@ struct ggml_backend_opencl_device_context { ggml_backend_buffer_type buffer_type; cl_context context = nullptr; + + GPU_FAMILY gpu_family = GPU_FAMILY::UNKNOWN; + ADRENO_GPU_GEN adreno_gen = ADRENO_GPU_GEN::ADRENO_UNKNOWN; + + std::regex *opfilter = nullptr; // regex of ops to not claim + std::string opfilter_str = ""; // regex string for opfilter + size_t global_mem_size = 0; }; // backend context @@ -384,6 +391,18 @@ struct ggml_backend_opencl_context { cl_device_id device; std::string device_name; + ggml_cl_version platform_version; + ggml_cl_version opencl_c_version; + + // argsort is loaded in supports_op because its availability depends on how + // many workgroups are allowed, which requires kernel compilation. + bool kernels_loaded_argsort = false; + // flash attn is loaded in supports_op because it contains multiple variants + // and takes time to compile, so we want to only compile it when needed. + bool kernels_loaded_flash_attn = false; + // rest of the kernels are currently always loaded in alloc_buffer. + bool kernels_loaded = false; + std::string driver_version; GPU_FAMILY gpu_family; @@ -395,10 +414,9 @@ struct ggml_backend_opencl_context { size_t max_workgroup_size; bool fp16_support; bool has_vector_subgroup_broadcast; + bool has_qcom_subgroup_shuffle = false; // cl_qcom_subgroup_shuffle bool disable_fusion; - std::regex *opfilter = nullptr; // regex of ops to not claim - bool adreno_has_large_buffer; bool adreno_use_large_buffer; ggml_cl_compiler_version adreno_cl_compiler_version; @@ -410,6 +428,8 @@ struct ggml_backend_opencl_context { size_t image2d_max_width; size_t image2d_max_height; + cl_device_svm_capabilities svm_caps; + cl_context context; cl_command_queue queue; @@ -556,12 +576,18 @@ struct ggml_backend_opencl_context { cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns; cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1; cl_kernel kernel_convert_block_q4_1_trans4_ns, kernel_restore_block_q4_1_trans4_ns; + cl_kernel kernel_convert_block_q5_0, kernel_restore_block_q5_0; cl_kernel kernel_convert_block_q5_0_trans4_ns, kernel_restore_block_q5_0_trans4_ns; + cl_kernel kernel_convert_block_q5_1, kernel_restore_block_q5_1; cl_kernel kernel_convert_block_q5_1_trans4_ns, kernel_restore_block_q5_1_trans4_ns; + cl_kernel kernel_convert_block_q4_k_trans4_ns, kernel_restore_block_q4_k_trans4_ns; + cl_kernel kernel_convert_block_q5_k_trans4_ns, kernel_restore_block_q5_k_trans4_ns; + cl_kernel kernel_convert_block_q6_k_trans4_ns, kernel_restore_block_q6_k_trans4_ns; cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans; cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns; cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans; cl_kernel kernel_convert_block_q6_K_noshuffle, kernel_restore_block_q6_K_noshuffle; + cl_kernel kernel_convert_bf16_to_f16, kernel_convert_f16_to_bf16; cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; cl_kernel kernel_convert_block_q4_0_noshuffle; cl_kernel kernel_restore_block_q4_0_noshuffle; @@ -580,6 +606,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; cl_kernel kernel_mul_mv_q4_1_f32; cl_kernel kernel_mul_mv_q4_1_f32_flat; + cl_kernel kernel_mul_mv_q5_0_f32; + cl_kernel kernel_mul_mv_q5_0_f32_flat; + cl_kernel kernel_mul_mv_q5_1_f32; + cl_kernel kernel_mul_mv_q5_1_f32_flat; cl_kernel kernel_mul_mv_q4_K_f32; cl_kernel kernel_mul_mv_q4_K_f32_flat; cl_kernel kernel_mul_mv_q5_K_f32; @@ -614,11 +644,18 @@ struct ggml_backend_opencl_context { cl_kernel kernel_conv_2d_f32; cl_kernel kernel_conv_2d_f16_f32; cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4; + // [size_idx][kda][tgpp] where size_idx: 0=S_V=16, 1=32, 2=64, 3=128; kda: 0 or 1. + // tgpp 0 = TG variant (COLS_PER_LANE_GROUP=1), tgpp 1 = prefill variant (COLS_PER_LANE_GROUP=4). + cl_kernel kernel_gated_delta_net_f32[4][2][2] = {}; + cl_kernel kernel_timestep_embedding; cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns; cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns; cl_kernel kernel_gemv_moe_q5_0_f32_ns, kernel_gemm_moe_q5_0_f32_ns; cl_kernel kernel_gemv_moe_q5_1_f32_ns, kernel_gemm_moe_q5_1_f32_ns; + cl_kernel kernel_gemv_moe_q4_k_f32_ns, kernel_gemm_moe_q4_k_f32_ns; + cl_kernel kernel_gemv_moe_q5_k_f32_ns, kernel_gemm_moe_q5_k_f32_ns; + cl_kernel kernel_gemv_moe_q6_k_f32_ns, kernel_gemm_moe_q6_k_f32_ns; cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32; cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns; cl_kernel kernel_moe_reorder_b; @@ -631,6 +668,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mm_f16_f32_l4_lm; cl_kernel kernel_mul_mm_q4_0_f32_l4_lm; cl_kernel kernel_mul_mm_q4_1_f32_l4_lm; + cl_kernel kernel_mul_mm_q5_0_f32_l4_lm; + cl_kernel kernel_mul_mm_q5_1_f32_l4_lm; cl_kernel kernel_mul_mm_q8_0_f32_l4_lm; cl_kernel kernel_mul_mm_q4_k_f32_l4_lm; cl_kernel kernel_mul_mm_q5_k_f32_l4_lm; @@ -638,11 +677,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mm_iq4_nl_f32_l4_lm; std::vector profiling_info; + std::vector profiling_results; - void write_profiling_info() { - FILE * fperf = fopen("cl_profiling.csv", "w"); - if (!fperf) { - GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); + void flush_profiling_batch() { + if (profiling_info.empty()) { return; } @@ -666,6 +704,7 @@ struct ggml_backend_opencl_context { CL_CHECK(clGetEventProfilingInfo( info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); CL_CHECK(clReleaseEvent(info.evt)); + info.evt = nullptr; char kernel_name[512]; CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, @@ -683,10 +722,26 @@ struct ggml_backend_opencl_context { info.cmd_complete_duration_ns = cmd_complete - cmd_end; info.cmd_total_duration_ns = cmd_complete - cmd_queued; } + profiling_results.insert(profiling_results.end(), + std::make_move_iterator(profiling_info.begin()), + std::make_move_iterator(profiling_info.end())); + profiling_info.clear(); + } + + void write_profiling_info() { + if (profiling_results.empty()) { + return; + } // Dump a csv + FILE * fperf = fopen("cl_profiling.csv", "w"); + if (!fperf) { + GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); + return; + } + fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n"); - for (const ProfilingInfo & info : profiling_info) { + for (const ProfilingInfo & info : profiling_results) { fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", info.op_name.c_str(), info.kernel_name.c_str(), info.cmd_duration_ns/1.e6f, @@ -697,14 +752,14 @@ struct ggml_backend_opencl_context { fclose(fperf); // Dump a simple chrome trace - FILE* ftrace = fopen("cl_trace.json", "w"); + FILE * ftrace = fopen("cl_trace.json", "w"); if (!ftrace) { GGML_LOG_ERROR("Failed to open cl_trace.json\n"); return; } fprintf(ftrace, "[\n"); - for (const ProfilingInfo & info : profiling_info) { + for (const ProfilingInfo & info : profiling_results) { fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n", info.kernel_name.c_str(), info.cmd_queued/1000); fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n", @@ -715,6 +770,7 @@ struct ggml_backend_opencl_context { fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n", info.kernel_name.c_str(), info.cmd_end/1000); } + fprintf(ftrace, "]\n"); fclose(ftrace); } @@ -735,6 +791,9 @@ struct ggml_backend_opencl_context { profiling_info.emplace_back(); populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); + if (profiling_info.size() >= 2048) { + flush_profiling_batch(); + } #else GGML_UNUSED(tensor); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); @@ -775,11 +834,13 @@ struct ggml_backend_opencl_context { #endif // GGML_OPENCL_USE_ADRENO_KERNELS void free() { + clFinish(queue); + ref_count--; if (ref_count == 0) { #ifdef GGML_OPENCL_PROFILING write_profiling_info(); - profiling_info.clear(); + profiling_results.clear(); #endif } } @@ -787,18 +848,21 @@ struct ggml_backend_opencl_context { // All registered devices with a default device in the front. static std::vector g_ggml_backend_opencl_devices; +// All device contexts associated with the devices above. +// The devices live as long as the process, so do the contexts. +static std::vector> g_ggml_backend_opencl_dev_ctxs; inline std::string read_file(const std::string &path) { - std::ifstream ifs(path); - if (!ifs) { - return ""; - } - std::string text; - ifs.seekg(0, std::ios::end); - text.resize(ifs.tellg()); - ifs.seekg(0, std::ios::beg); - ifs.read(&text[0], text.size()); - return text; + std::ifstream ifs(path); + if (!ifs) { + return ""; + } + std::string text; + ifs.seekg(0, std::ios::end); + text.resize(ifs.tellg()); + ifs.seekg(0, std::ios::beg); + ifs.read(&text[0], text.size()); + return text; } static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { @@ -830,12 +894,120 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co return p; } -static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) { +static void load_cl_kernels_argsort(ggml_backend_opencl_context *backend_ctx) { + // compiler options for general kernels + auto opencl_c_std = + std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor); + std::string compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable -cl-unsafe-math-optimizations" + " -cl-finite-math-only -cl-fast-relaxed-math"; + + // argsort + if (!backend_ctx->kernels_loaded_argsort) { + cl_int err; +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "argsort.cl.h" + }; +#else + const std::string kernel_src = read_file("argsort.cl"); +#endif + backend_ctx->program_argsort_f32_i32 = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err)); + backend_ctx->kernels_loaded_argsort = true; + } +} + +static void load_cl_kernels_flash_attn(ggml_backend_opencl_context *backend_ctx) { + // compiler options for general kernels + auto opencl_c_std = + std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor); + std::string compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable -cl-unsafe-math-optimizations" + " -cl-finite-math-only -cl-fast-relaxed-math"; + + // flash_attn + if (!backend_ctx->kernels_loaded_flash_attn) { + cl_int err; + + #ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src_f16 { + #include "flash_attn_f16.cl.h" + }; + const std::string kernel_src_f32 { + #include "flash_attn_f32.cl.h" + }; + const std::string kernel_src_f32_f16 { + #include "flash_attn_f32_f16.cl.h" + }; + #else + const std::string kernel_src_f16 = read_file("flash_attn_f16.cl"); + const std::string kernel_src_f32 = read_file("flash_attn_f32.cl"); + const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl"); + #endif + + if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) { + const struct { int dk; int dv; int bm; int bn; } fa_dims[] = { + { 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32}, + {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16}, + {192, 192, 16, 16}, {256, 256, 16, 16}, + }; + + for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) { + const int dk = fa_dims[i].dk; + const int dv = fa_dims[i].dv; + const int bm = fa_dims[i].bm; + const int bn = fa_dims[i].bn; + std::string OPTS = compile_opts + + " -D DK=" + std::to_string(dk) + + " -D DV=" + std::to_string(dv) + + " -D BLOCK_M=" + std::to_string(bm) + + " -D BLOCK_N=" + std::to_string(bn); + + cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS); + cl_kernel k_f16, k_f16_q1; + CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err)); + CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err)); + backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16; + backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1; + CL_CHECK(clReleaseProgram(prog_f16)); + + cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS); + cl_kernel k_f32, k_f32_q1; + CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err)); + CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err)); + backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32; + backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1; + CL_CHECK(clReleaseProgram(prog_f32)); + + cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS); + cl_kernel k_f32_f16, k_f32_f16_q1; + CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err)); + CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err)); + backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16; + backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1; + CL_CHECK(clReleaseProgram(prog_f32_f16)); + + backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm; + backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn; + } + backend_ctx->kernels_loaded_flash_attn = true; + } + } +} + +static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { + if (backend_ctx->kernels_loaded) { + return; + } + cl_int err; // compiler options for general kernels auto opencl_c_std = - std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor); + std::string("CL") + std::to_string(backend_ctx->opencl_c_version.major) + "." + std::to_string(backend_ctx->opencl_c_version.minor); std::string compile_opts = std::string("-cl-std=") + opencl_c_std + " -cl-mad-enable -cl-unsafe-math-optimizations" " -cl-finite-math-only -cl-fast-relaxed-math"; @@ -977,10 +1149,20 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_k_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans4_ns", &err), err)); @@ -1006,6 +1188,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_iq4_nl = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_iq4_nl", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_iq4_nl_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_iq4_nl_noshuffle", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_iq4_nl_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_iq4_nl_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_bf16_to_f16 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_bf16_to_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_f16_to_bf16 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_f16_to_bf16", &err), err)); GGML_LOG_CONT("."); } @@ -1313,6 +1497,74 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mv_q5_0_f32 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_q5_0_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_q5_0_f32.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_q5_0_f32 = clCreateKernel(prog, "kernel_mul_mv_q5_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // mul_mv_q5_0_f32_flat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_q5_0_f32_flat.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_q5_0_f32_flat.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_q5_0_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q5_0_f32_flat", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // mul_mv_q5_1_f32 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_q5_1_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_q5_1_f32.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_q5_1_f32 = clCreateKernel(prog, "kernel_mul_mv_q5_1_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // mul_mv_q5_1_f32_flat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_q5_1_f32_flat.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_q5_1_f32_flat.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_q5_1_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q5_1_f32_flat", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // mul_mv_q5_k_f32 { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -1663,6 +1915,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mm_q5_0_f32_l4_lm + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mm_q5_0_f32_l4_lm.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mm_q5_0_f32_l4_lm.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mm_q5_0_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q5_0_f32_l4_lm", &err), err)); + GGML_LOG_CONT("."); + } + + // mul_mm_q5_1_f32_l4_lm + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mm_q5_1_f32_l4_lm.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mm_q5_1_f32_l4_lm.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mm_q5_1_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q5_1_f32_l4_lm", &err), err)); + GGML_LOG_CONT("."); + } + // mul_mm_q8_0_f32_l4_lm { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -1974,110 +2258,27 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } - // flash_attn + // div { - #ifdef GGML_OPENCL_EMBED_KERNELS - const std::string kernel_src_f16 { - #include "flash_attn_f16.cl.h" - }; - const std::string kernel_src_f32 { - #include "flash_attn_f32.cl.h" - }; - const std::string kernel_src_f32_f16 { - #include "flash_attn_f32_f16.cl.h" - }; - #else - const std::string kernel_src_f16 = read_file("flash_attn_f16.cl"); - const std::string kernel_src_f32 = read_file("flash_attn_f32.cl"); - const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl"); - #endif +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "div.cl.h" + }; +#else + const std::string kernel_src = read_file("div.cl"); +#endif + std::string compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable -cl-finite-math-only "; - if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) { - const struct { int dk; int dv; int bm; int bn; } fa_dims[] = { - { 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32}, - {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16}, - {192, 192, 16, 16}, {256, 256, 16, 16}, - }; + backend_ctx->program_div = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); - for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) { - const int dk = fa_dims[i].dk; - const int dv = fa_dims[i].dv; - const int bm = fa_dims[i].bm; - const int bn = fa_dims[i].bn; - std::string OPTS = compile_opts + - " -D DK=" + std::to_string(dk) + - " -D DV=" + std::to_string(dv) + - " -D BLOCK_M=" + std::to_string(bm) + - " -D BLOCK_N=" + std::to_string(bn); - - cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS); - cl_kernel k_f16, k_f16_q1; - CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err)); - CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err)); - backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16; - backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1; - CL_CHECK(clReleaseProgram(prog_f16)); - - cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS); - cl_kernel k_f32, k_f32_q1; - CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err)); - CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err)); - backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32; - backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1; - CL_CHECK(clReleaseProgram(prog_f32)); - - cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS); - cl_kernel k_f32_f16, k_f32_f16_q1; - CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err)); - CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err)); - backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16; - backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1; - CL_CHECK(clReleaseProgram(prog_f32_f16)); - - backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm; - backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn; - } - GGML_LOG_CONT("."); - } - } - - // argsort - { -#ifdef GGML_OPENCL_EMBED_KERNELS - const std::string kernel_src { - #include "argsort.cl.h" - }; -#else - const std::string kernel_src = read_file("argsort.cl"); -#endif - backend_ctx->program_argsort_f32_i32 = - build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); - - CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err)); - GGML_LOG_CONT("."); - } - - // div - { -#ifdef GGML_OPENCL_EMBED_KERNELS - const std::string kernel_src { - #include "div.cl.h" - }; -#else - const std::string kernel_src = read_file("div.cl"); -#endif - std::string compile_opts = std::string("-cl-std=") + opencl_c_std + - " -cl-mad-enable -cl-finite-math-only "; - - backend_ctx->program_div = - build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); - - CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err)); - CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err)); - CL_CHECK((backend_ctx->kernel_div_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err)); - CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err)); - GGML_LOG_CONT("."); - } + CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err)); + CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err)); + CL_CHECK((backend_ctx->kernel_div_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err)); + GGML_LOG_CONT("."); + } // sqr { @@ -2384,12 +2585,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err)); if (backend_ctx->program_upscale) { - cl_int err_bilinear; - backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear); - if (err_bilinear != CL_SUCCESS) { + cl_int err_bilinear; + backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear); + if (err_bilinear != CL_SUCCESS) { GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear); backend_ctx->kernel_upscale_bilinear = nullptr; - } + } } else { backend_ctx->kernel_upscale_bilinear = nullptr; } @@ -2459,8 +2660,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } - // conv2d - { + // conv2d + { #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "conv2d.cl.h" @@ -2518,6 +2719,86 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // gated_delta_net: one kernel per (S_V, KDA, tgpp) triple. + { + #ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gated_delta_net.cl.h" + }; + #else + const std::string kernel_src = read_file("gated_delta_net.cl"); + #endif + + const int gdn_sizes[4] = { 16, 32, 64, 128 }; + const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1; + if (sg_size < 0) { + GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n"); + exit(1); + } + + for (int si = 0; si < 4; si++) { + const int S_V = gdn_sizes[si]; + + // MUST match the dispatcher heuristic in ggml_cl_gated_delta_net exactly. + int lanes_per_column; + if (S_V >= 128) { + lanes_per_column = 8; + } else { + lanes_per_column = std::min(S_V, sg_size); + } + + // Round LANES_PER_COLUMN down until it is: + // * power-of-two + // * divides both S_V and sg_size + while (lanes_per_column > 1 && + (((lanes_per_column & (lanes_per_column - 1)) != 0) || + (S_V % lanes_per_column) != 0 || + (sg_size % lanes_per_column) != 0)) { + lanes_per_column >>= 1; + } + + GGML_ASSERT(lanes_per_column >= 1); + GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0)); + GGML_ASSERT((S_V % lanes_per_column) == 0); + GGML_ASSERT((sg_size % lanes_per_column) == 0); + + const bool is_partial_reduce = (lanes_per_column != 1) && (lanes_per_column < sg_size); + int use_qcom_shuffle = 0; + if (is_partial_reduce) { + if (backend_ctx->has_qcom_subgroup_shuffle) { + use_qcom_shuffle = 1; + } + } + for (int kda = 0; kda < 2; kda++) { + for (int tgpp = 0; tgpp < 2; tgpp++) { + const int cpl = (tgpp == 0) ? 1 : 4; + const int spw = (tgpp == 0) ? 1 : 1; + + std::string opts = compile_opts; + opts += " -DS_V=" + std::to_string(S_V); + opts += " -DKDA=" + std::to_string(kda); + opts += " -DSUBGROUP_SIZE=" + std::to_string(sg_size); + opts += " -DLANES_PER_COLUMN=" + std::to_string(lanes_per_column); + opts += " -DCOLS_PER_LANE_GROUP=" + std::to_string(cpl); + opts += " -DUSE_QCOM_SUBGROUP_SHUFFLE=" + std::to_string(use_qcom_shuffle); + + // Since spw=1 is found to be optimal, SUBGROUPS_PER_WG > 1 code in + // the kernel is removed. If you want to experiment with spw > 1, + // Please remember to implement code to handle it. + opts += " -DSUBGROUPS_PER_WG=" + std::to_string(spw); + + cl_program prog = build_program_from_source( + backend_ctx->context, backend_ctx->device, kernel_src.c_str(), opts); + + CL_CHECK((backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp] = + clCreateKernel(prog, "kernel_gated_delta_net", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + } + } + } + GGML_LOG_CONT("."); + } + // mul_mv_id_q4_0_f32_8x_flat { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2748,7 +3029,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_q4_1_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_q4_1_f32.cl"); #endif @@ -2787,7 +3068,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_iq4_nl_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_iq4_nl_f32.cl"); #endif @@ -2826,7 +3107,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_q8_0_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_q8_0_f32.cl"); #endif @@ -2867,7 +3148,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve #ifdef GGML_OPENCL_EMBED_KERNELS const std::string kernel_src { #include "gemm_noshuffle_q4_k_f32.cl.h" - }; + }; #else const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl"); #endif @@ -3071,6 +3352,108 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // gemv_moe_q4_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q4_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q4_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q4_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q4_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q4_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q4_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q4_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_moe_q5_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q5_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q5_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q5_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q5_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q5_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_moe_q6_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q6_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q6_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q6_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q6_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q6_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q6_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q6_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // gemv_moe_mxfp4_f32_ns { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -3221,13 +3604,15 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve } #endif // GGML_OPENCL_USE_ADRENO_KERNELS GGML_LOG_CONT("\n"); + backend_ctx->kernels_loaded = true; } // XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // XXX static bool initialized = false; // XXX static ggml_backend_opencl_context *backend_ctx = nullptr; -static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev); +static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev); +static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev); namespace /* anonymous */ { extern struct ggml_backend_device_i ggml_backend_opencl_device_i; @@ -3440,13 +3825,13 @@ static std::vector ggml_opencl_probe_devices(ggml_backend_r /* .context = */ dev_ctx.get(), }); - if (!ggml_cl2_init(&found_devices.back())) { + if (!ggml_opencl_is_device_supported(&found_devices.back())) { found_devices.pop_back(); - GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n"); + GGML_LOG_WARN("ggml_opencl: drop unsupported device '%s'.\n", dev->name); continue; } - dev_ctx.release(); + g_ggml_backend_opencl_dev_ctxs.push_back(std::move(dev_ctx)); } if (found_devices.size()) { @@ -3463,8 +3848,70 @@ static std::vector ggml_opencl_probe_devices(ggml_backend_r return found_devices; } -// Initialize device if it is supported (returns nullptr if it is not). -static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { +static void ggml_opencl_print_backend_info(ggml_backend_opencl_device_context * dev_ctx) { + GGML_ASSERT(dev_ctx); + GGML_ASSERT(dev_ctx->backend_ctx); + + auto * backend_ctx = dev_ctx->backend_ctx; + + GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", + backend_ctx->driver_version.c_str()); + GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n", + backend_ctx->has_vector_subgroup_broadcast ? "true" : "false"); + GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", + backend_ctx->fp16_support ? "true" : "false"); + GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", + backend_ctx->alignment); + GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", + backend_ctx->global_mem_size/1024/1024); + GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", + backend_ctx->max_alloc_size/1024/1024); + GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", + backend_ctx->image_max_buffer_size); + GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", + backend_ctx->image2d_max_width, backend_ctx->image2d_max_height); + GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", + backend_ctx->max_workgroup_size); + GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n", + backend_ctx->svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false"); + GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n", + backend_ctx->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false"); + GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n", + backend_ctx->svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false"); + GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", + backend_ctx->svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); + GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n", + backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false"); + + // Print out configurations +#ifdef GGML_OPENCL_SOA_Q + GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); +#endif // GGML_OPENCL_SOA_Q + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); + if (backend_ctx->adreno_xmem_gemm_enabled) { + GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM enabled (temporary weight prepack)\n"); + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + + if (backend_ctx->adreno_use_large_buffer) { + if (!backend_ctx->adreno_has_large_buffer) { + GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n"); + backend_ctx->adreno_use_large_buffer = false; + } else { + GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n"); + } + } + + if (dev_ctx->opfilter) { + // for information only, the actual regex object is created in ggml_opencl_is_device_supported + GGML_LOG_INFO("ggml_opencl: opfilter regex = \"%s\"\n", dev_ctx->opfilter_str.c_str()); + } +} + +// check if device should be accepted +static bool ggml_opencl_is_device_supported(ggml_backend_dev_t dev) { GGML_ASSERT(dev); GGML_ASSERT(dev->context); @@ -3472,138 +3919,160 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_ASSERT(dev_ctx->platform); GGML_ASSERT(dev_ctx->device); - if (dev_ctx->backend_ctx) { - return dev_ctx->backend_ctx; - } - - auto backend_ctx = std::make_unique(); - backend_ctx->device = dev_ctx->device; - backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - - // ref_count get increased in ggml_backend_opencl_device_init - // This function is also used to retrieve backend context, so we don't want - // to increase ref_count for each call. We only want to increase ref_count - // when the associated device is initialized - backend_ctx->ref_count = 0; - if (strstr(dev_ctx->device_name.c_str(), "Adreno") || strstr(dev_ctx->device_name.c_str(), "Qualcomm") || strstr(dev_ctx->device_version.c_str(), "Adreno")) { - backend_ctx->gpu_family = GPU_FAMILY::ADRENO; + dev_ctx->gpu_family = GPU_FAMILY::ADRENO; + // Usually device version contains the detailed device name - backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str()); - if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) { - backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str()); + dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str()); + if (dev_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) { + dev_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str()); } - - // Use wave size of 64 for all Adreno GPUs. - backend_ctx->adreno_wave_size = 64; } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) { - backend_ctx->gpu_family = GPU_FAMILY::INTEL; + dev_ctx->gpu_family = GPU_FAMILY::INTEL; } else { - GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); - backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - return nullptr; + GGML_LOG_WARN("ggml_opencl: unsupported GPU '%s'.\n", dev_ctx->device_name.c_str()); + dev_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + return false; + } + + ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform); + + // Check device OpenCL version, OpenCL 2.0 or above is required + ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, dev_ctx->device); + if (opencl_c_version.major < 2) { + GGML_LOG_WARN("ggml_opencl: OpenCL 2.0 or above is required\n"); + return false; } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS - if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { - GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; " + if (dev_ctx->gpu_family != GPU_FAMILY::ADRENO) { + GGML_LOG_WARN("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; " "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n"); - return nullptr; + return false; } #endif - // Populate backend device name - backend_ctx->device_name = dev_ctx->device_name; + size_t ext_str_size; + clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); - // A local ref of cl_device_id for convenience - cl_device_id device = backend_ctx->device; + char *ext_buffer = (char *)alloca(ext_str_size + 1); + clGetDeviceInfo(dev_ctx->device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); + ext_buffer[ext_str_size] = '\0'; - ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform); + // Check if ext_buffer contains cl_khr_fp16 + bool fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; + if (!fp16_support) { + GGML_LOG_WARN("ggml_opencl: device does not support FP16\n"); + return false; + } - // Check device OpenCL version, OpenCL 2.0 or above is required - ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device); - if (opencl_c_version.major < 2) { - GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); - return nullptr; + // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes + // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x) + if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL && + strstr(ext_buffer, "cl_intel_subgroups") == NULL) { + GGML_LOG_WARN("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " + "(note that subgroups is an optional feature in OpenCL 3.0)\n"); + return false; } - // Check driver version + clGetDeviceInfo(dev_ctx->device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &dev_ctx->global_mem_size, NULL); + + const char * str_opfilter = getenv("GGML_OPENCL_OPFILTER"); + if (str_opfilter) { + dev_ctx->opfilter_str = str_opfilter; + dev_ctx->opfilter = new std::regex(str_opfilter, std::regex_constants::icase); + } + + return true; +} + +// Initialize device if it is supported (returns nullptr if it is not). +static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) { + GGML_ASSERT(dev); + GGML_ASSERT(dev->context); + + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context; + GGML_ASSERT(dev_ctx->platform); + GGML_ASSERT(dev_ctx->device); + + if (dev_ctx->backend_ctx) { + return dev_ctx->backend_ctx; + } + + auto backend_ctx = std::make_unique(); + backend_ctx->device = dev_ctx->device; + backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + + // ref_count get increased in ggml_backend_opencl_device_init + // This function is also used to retrieve backend context, so we don't want + // to increase ref_count for each call. We only want to increase ref_count + // when the associated device is initialized + backend_ctx->ref_count = 0; + + backend_ctx->gpu_family = dev_ctx->gpu_family; + backend_ctx->adreno_gen = dev_ctx->adreno_gen; + if (backend_ctx->gpu_family == GPU_FAMILY::ADRENO) { + // Use wave size of 64 for all Adreno GPUs. + backend_ctx->adreno_wave_size = 64; + } + + // Populate backend device name + backend_ctx->device_name = dev_ctx->device_name; + + // A local ref of cl_device_id for convenience + cl_device_id device = backend_ctx->device; + + ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform); + ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device); + + backend_ctx->platform_version = platform_version; + backend_ctx->opencl_c_version = opencl_c_version; + + // Check driver version size_t driver_version_str_size; clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size); char *driver_version = (char *)alloca(driver_version_str_size + 1); clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL); driver_version[driver_version_str_size] = '\0'; - GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version); backend_ctx->driver_version = driver_version; backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version); backend_ctx->has_vector_subgroup_broadcast = (backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) || (backend_ctx->adreno_cl_compiler_version.type == DX && backend_ctx->adreno_cl_compiler_version.major >= 17); - GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n", - backend_ctx->has_vector_subgroup_broadcast ? "true" : "false"); size_t ext_str_size; clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); char *ext_buffer = (char *)alloca(ext_str_size + 1); clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated + + // check support for qcom_subgroup_shuffle + if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) { + backend_ctx->has_qcom_subgroup_shuffle = true; + } + // Check if ext_buffer contains cl_khr_fp16 backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; - GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false"); + // check Adreno large buffer support backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL; - // fp16 is required - if (!backend_ctx->fp16_support) { - GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n"); - return nullptr; - } - - // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes - // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x) - if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL && - strstr(ext_buffer, "cl_intel_subgroups") == NULL) { - GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " - "(note that subgroups is an optional feature in OpenCL 3.0)\n"); - return nullptr; - } - cl_uint base_align_in_bits; CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL)); GGML_ASSERT(base_align_in_bits % 8u == 0); backend_ctx->alignment = base_align_in_bits / 8u; - GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment); - - clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &backend_ctx->global_mem_size, NULL); - GGML_LOG_INFO("ggml_opencl: global mem size: %zu MB\n", backend_ctx->global_mem_size/1024/1024); - - clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL); - GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024); - clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL); - GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size); + backend_ctx->global_mem_size = dev_ctx->global_mem_size; - clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL); - clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL); - GGML_LOG_INFO("ggml_opencl: device max image2d size: %lu x %lu\n", backend_ctx->image2d_max_width, backend_ctx->image2d_max_height); - - clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL); - GGML_LOG_INFO("ggml_opencl: device max workgroup size: %lu\n", backend_ctx->max_workgroup_size); - - // Check SVM. - cl_device_svm_capabilities svm_caps; - CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0)); - GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n", - svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false"); - GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n", - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false"); - GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n", - svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false"); - GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", - svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL)); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL)); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->image2d_max_width, NULL)); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->image2d_max_height, NULL)); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL)); + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &backend_ctx->svm_caps, 0)); if (opencl_c_version.major >= 3) { // Assume it is not available for 3.0, since it is optional in 3.0. @@ -3619,36 +4088,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->non_uniform_workgroups = true; } - // Print out configurations -#ifdef GGML_OPENCL_SOA_Q - GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); -#endif // GGML_OPENCL_SOA_Q - -#ifdef GGML_OPENCL_USE_ADRENO_KERNELS - GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); -#endif // GGML_OPENCL_USE_ADRENO_KERNELS - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + // determine whether to use Adreno xmem GEMM backend_ctx->adreno_xmem_gemm_enabled = getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr && backend_ctx->gpu_family == GPU_FAMILY::ADRENO; - if (getenv("GGML_OPENCL_ADRENO_XMEM_GEMM") != nullptr) { - GGML_LOG_INFO("ggml_opencl: Adreno xmem F16xF32 GEMM %s\n", - backend_ctx->adreno_xmem_gemm_enabled ? - "enabled (temporary weight prepack)" : "requested but unsupported by this driver"); - } -#endif // GGML_OPENCL_USE_ADRENO_KERNELS +#endif // determine whether to use large buffer for Adreno backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr && backend_ctx->gpu_family == GPU_FAMILY::ADRENO; - if (backend_ctx->adreno_use_large_buffer) { - if (!backend_ctx->adreno_has_large_buffer) { - GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n"); - backend_ctx->adreno_use_large_buffer = false; - } else { - GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n"); - } - } cl_int err; @@ -3665,8 +4113,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { #endif CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err)); - // Load kernels - load_cl_kernels(backend_ctx.get(), opencl_c_version); + // delay kernel loading until the first buffer is created + // load_cl_kernels(backend_ctx.get()); #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Allocate intermediate buffers and images @@ -3698,32 +4146,13 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr; - const char * str_opfilter = getenv("GGML_OPENCL_OPFILTER"); - if (str_opfilter) { - backend_ctx->opfilter = new std::regex(str_opfilter, std::regex_constants::icase); - GGML_LOG_INFO("ggml_opencl: opfilter regex = \"%s\"\n", str_opfilter); - } - dev_ctx->backend_ctx = backend_ctx.release(); return dev_ctx->backend_ctx; } -static void ggml_cl2_free(ggml_backend_t backend) { +static void ggml_cl_free(ggml_backend_t backend) { ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context; ctx->free(); - - // The CL context is shared by all backends, release it if all backends have been released - bool should_release_opencl = true; - for (auto device : g_ggml_backend_opencl_devices) { - ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context; - if (ctx_dev->backend_ctx->ref_count > 0) { - should_release_opencl = false; - } - } - - if (should_release_opencl) { - CL_CHECK(clReleaseContext(ctx->context)); - } } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS @@ -4148,6 +4577,8 @@ struct ggml_tensor_extra_cl_iq4_nl { struct ggml_tensor_extra_cl_q4_K { // Quantized values cl_mem q = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem q_img = nullptr; // Scales for each super block. cl_mem s = nullptr; // Scales @@ -4176,12 +4607,18 @@ struct ggml_tensor_extra_cl_q4_K { CL_CHECK(clReleaseMemObject(dm)); dm = nullptr; } + if (q_img != nullptr) { + CL_CHECK(clReleaseMemObject(q_img)); + q_img = nullptr; + } } }; struct ggml_tensor_extra_cl_q5_K { // Lower 4 bits of quantized weights. cl_mem q = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem q_img = nullptr; // Upper 1 bit of quantized weights. cl_mem qh = nullptr; // Scales for each block. @@ -4222,6 +4659,10 @@ struct ggml_tensor_extra_cl_q5_K { CL_CHECK(clReleaseMemObject(dm)); dm = nullptr; } + if (q_img != nullptr) { + CL_CHECK(clReleaseMemObject(q_img)); + q_img = nullptr; + } size_q = 0; size_qh = 0; @@ -4234,6 +4675,8 @@ struct ggml_tensor_extra_cl_q5_K { struct ggml_tensor_extra_cl_q6_K { // Lower 4 bits of quantized weights. cl_mem ql = nullptr; + // Lower 4 bits as image1d_buffer_t + cl_mem ql_img = nullptr; // Upper 2 bits of quantized weights. cl_mem qh = nullptr; // Scales for each block. @@ -4267,6 +4710,10 @@ struct ggml_tensor_extra_cl_q6_K { CL_CHECK(clReleaseMemObject(d)); d = nullptr; } + if (ql_img != nullptr) { + CL_CHECK(clReleaseMemObject(ql_img)); + ql_img = nullptr; + } size_ql = 0; size_qh = 0; @@ -4289,7 +4736,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) { } static void ggml_backend_opencl_free(ggml_backend_t backend) { - ggml_cl2_free(backend); + ggml_cl_free(backend); } static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -4328,14 +4775,17 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) { // enqueued to it won't start until commands in the other devices have // completed. static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) { - if (g_ggml_backend_opencl_devices.size() < 2) - return; // No other devices to synchronize with. + if (g_ggml_backend_opencl_devices.size() < 2) { + return; // No other devices to synchronize with. + } std::vector events; events.reserve(g_ggml_backend_opencl_devices.size()); for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) { - auto * other_backend_ctx = ggml_cl2_init(&backend_dev); + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) backend_dev.context; + auto * other_backend_ctx = dev_ctx->backend_ctx; + if (backend_ctx != other_backend_ctx) { cl_event ev; CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev)); @@ -4488,7 +4938,7 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) { GGML_UNUSED(backend_ctx); int ne01 = tensor->ne[1]; - return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0); + return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 32 == 0); } inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) { @@ -4505,7 +4955,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx; // reject ops that match the opfilter regex - if (backend_ctx->opfilter && std::regex_match(std::string(ggml_op_desc(op)), *backend_ctx->opfilter)) { + if (dev_ctx->opfilter && std::regex_match(std::string(ggml_op_desc(op)), *dev_ctx->opfilter)) { return false; } @@ -4607,17 +5057,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; case GGML_UNARY_OP_SIGMOID: return ggml_is_contiguous(op->src[0]); case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_EXP: - return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; + return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; case GGML_UNARY_OP_EXPM1: - return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; + return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; case GGML_UNARY_OP_SOFTPLUS: - return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; + return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16; default: return false; } @@ -4666,6 +5116,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32); case GGML_OP_SSM_CONV: return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32); + case GGML_OP_GATED_DELTA_NET: + { + // Match the Vulkan backend: only F32 -> F32, S_v in {16, 32, 64, 128}. + if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) { + return false; + } + const int64_t S_v = op->src[2]->ne[0]; + return S_v == 16 || S_v == 32 || S_v == 64 || S_v == 128; + } case GGML_OP_CONCAT: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; case GGML_OP_TIMESTEP_EMBEDDING: @@ -4675,9 +5134,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_OP_MUL_MAT: if (op->src[0]->type == GGML_TYPE_F16) { return true; + } else if (op->src[0]->type == GGML_TYPE_BF16) { + return true; } else if (op->src[0]->type == GGML_TYPE_F32) { return op->src[1]->type == GGML_TYPE_F32; } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q4_1 || + op->src[0]->type == GGML_TYPE_Q5_0 || op->src[0]->type == GGML_TYPE_Q5_1 || op->src[0]->type == GGML_TYPE_MXFP4 || op->src[0]->type == GGML_TYPE_IQ4_NL || op->src[0]->type == GGML_TYPE_Q4_K || @@ -4700,7 +5162,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te // the quantizations here currently do not - they are only supported by Adreno with certain shapes if (op->src[0]->type == GGML_TYPE_Q4_1 || op->src[0]->type == GGML_TYPE_Q5_0 || - op->src[0]->type == GGML_TYPE_Q5_1) { + op->src[0]->type == GGML_TYPE_Q5_1 || + op->src[0]->type == GGML_TYPE_Q4_K || + op->src[0]->type == GGML_TYPE_Q5_K || + op->src[0]->type == GGML_TYPE_Q6_K) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (op->src[1]->type == GGML_TYPE_F32) { return use_adreno_moe_kernels(backend_ctx, op->src[0]) @@ -4745,6 +5210,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_OP_IM2COL: return true; case GGML_OP_ARGSORT: { + load_cl_kernels_argsort(backend_ctx); + cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32; int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel); @@ -4762,6 +5229,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_FLASH_ATTN_EXT: { + load_cl_kernels_flash_attn(backend_ctx); + const ggml_tensor * q = op->src[0]; const ggml_tensor * k = op->src[1]; const ggml_tensor * v = op->src[2]; @@ -4829,7 +5298,7 @@ static ggml_backend_i ggml_backend_opencl_i = { ggml_backend_t ggml_backend_opencl_init(void) { ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0); - ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); + ggml_backend_opencl_context *backend_ctx = ggml_cl_init(dev); ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_opencl_guid(), @@ -5208,15 +5677,13 @@ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) } static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device); - return (void *) (uintptr_t) backend_ctx->alignment; + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context; + return (void *) (uintptr_t) dev_ctx->backend_ctx->alignment; } static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; - ggml_cl2_init(buffer->buft->device); - if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); @@ -5256,7 +5723,8 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff } static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context; + ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx; cl_context context = backend_ctx->context; cl_command_queue queue = backend_ctx->queue; @@ -5622,7 +6090,24 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, return; } #endif // GGML_OPENCL_USE_ADRENO_KERNELS - return; + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0; + cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk)); + + size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + tensor->extra = extra; + return; } if (tensor->type == GGML_TYPE_Q5_1) { ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; @@ -5723,6 +6208,24 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, return; } #endif // GGML_OPENCL_USE_ADRENO_KERNELS + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1; + cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &n_blk)); + + size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + tensor->extra = extra; return; } if (tensor->type == GGML_TYPE_MXFP4) { @@ -6047,14 +6550,57 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q4_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + CL_CHECK(err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; if (use_adreno_kernels(backend_ctx, tensor)) { kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle; } - #else +#else cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; - #endif +#endif // GGML_OPENCL_USE_ADRENO_KERNELS cl_uchar mask_0F = 0x0F; cl_uchar mask_F0 = 0xF0; @@ -6157,14 +6703,58 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); CL_CHECK(err); - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + CL_CHECK(err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K; if (use_adreno_kernels(backend_ctx, tensor)) { kernel = backend_ctx->kernel_convert_block_q5_K_noshuffle; } - #else +#else cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K; - #endif +#endif cl_uchar mask_0F = 0x0F; cl_uchar mask_F0 = 0xF0; @@ -6232,26 +6822,99 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, cl_buffer_region region; - // Subbuffer for ql - region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); - region.size = size_ql; - CL_CHECK((extra->ql = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); - auto previous_origin = region.origin; +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + // Adreno MoE Q6_K kernel needs special transposed layout + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + size_t moe_size_ql = (size_t)(ggml_nelements(tensor) / 8) * sizeof(uint32_t); // 4 bits per element + size_t moe_size_qh = (size_t)(ggml_nelements(tensor) / 16) * sizeof(uint32_t); // 2 bits per element + size_t moe_size_s = size_s; + size_t moe_size_d = size_d; + + // Subbuffer for ql + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = moe_size_ql; + CL_CHECK((extra->ql = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + auto previous_origin = region.origin; + + // Subbuffer for qh + region.origin = align_to(previous_origin + moe_size_ql, backend_ctx->alignment); + region.size = moe_size_qh; + CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Subbuffer for scales + region.origin = align_to(previous_origin + moe_size_qh, backend_ctx->alignment); + region.size = moe_size_s; + CL_CHECK((extra->s = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Subbuffer for d + region.origin = align_to(previous_origin + moe_size_s, backend_ctx->alignment); + region.size = moe_size_d; + CL_CHECK((extra->d = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + cl_kernel kernel = backend_ctx->kernel_convert_block_q6_k_trans4_ns; - // Subbuffer for qh - region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment); - region.size = size_qh; - CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); - previous_origin = region.origin; + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; - // Subbuffer for scales - region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment); - region.size = size_s; - CL_CHECK((extra->s = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); - previous_origin = region.origin; + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; - // Create subbuffer for d. - region.origin = align_to(previous_origin + size_s, backend_ctx->alignment); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + // Create image for ql + cl_image_format img_format_ql = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_ql = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->ql } + }; + extra->ql_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_ql, &img_desc_ql, NULL, &err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + + // Subbuffer for ql + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = size_ql; + CL_CHECK((extra->ql = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + auto previous_origin = region.origin; + + // Subbuffer for qh + region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment); + region.size = size_qh; + CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Subbuffer for scales + region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment); + region.size = size_s; + CL_CHECK((extra->s = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Create subbuffer for d. + region.origin = align_to(previous_origin + size_s, backend_ctx->alignment); region.size = size_d; CL_CHECK((extra->d = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); previous_origin = region.origin; @@ -6318,6 +6981,40 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, } #endif // GGML_OPENCL_SOA_Q + // convert bf16 to f16 and store as f16 in device buffer + if (tensor->type == GGML_TYPE_BF16) { + GGML_ASSERT(offset % sizeof(ggml_fp16_t) == 0 && size % sizeof(ggml_fp16_t) == 0 + && "Offset and size must be multiples of 2 for bf16 tensors"); + + ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; + GGML_ASSERT(extra); + + cl_ulong n_elements = size / sizeof(ggml_fp16_t); + cl_ulong off_dst = (extra->offset + offset) / sizeof(ggml_fp16_t); + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + size, const_cast(data), &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_convert_bf16_to_f16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->data_device)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &n_elements)); + + size_t global_work_size[] = { (size_t)CEIL_DIV(n_elements, 64)*64, 1, 1 }; + size_t local_work_size[] = { 64, 1, 1 }; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + CL_CHECK(clReleaseEvent(evt)); + + return; + } + ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; GGML_ASSERT(extra); @@ -6331,7 +7028,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->extra); - ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context; + ggml_backend_opencl_context *backend_ctx = dev_ctx->backend_ctx; cl_context context = backend_ctx->context; cl_command_queue queue = backend_ctx->queue; @@ -6585,8 +7283,29 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, return; } #endif // GGML_OPENCL_USE_ADRENO_KERNELS - // TODO: normal q5_0 - (void) extra; + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &data_device)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {1, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); return; } if (tensor->type == GGML_TYPE_Q5_1) { @@ -6627,8 +7346,29 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, return; } #endif // GGML_OPENCL_USE_ADRENO_KERNELS - // TODO: normal q5_1 - (void) extra; + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->m)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {1, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); return; } if (tensor->type == GGML_TYPE_MXFP4) { @@ -6825,6 +7565,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_uchar mask_F0 = 0xF0; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q4_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { int M = tensor->ne[1]; int K = tensor->ne[0]; @@ -6901,6 +7675,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_uchar mask_F0 = 0xF0; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { int M = tensor->ne[1]; int K = tensor->ne[0]; @@ -6975,6 +7783,43 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q6_k_trans4_ns; + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { static ggml_cl_buffer buf_trans_ql; static ggml_cl_buffer buf_trans_qh; @@ -7060,6 +7905,41 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, } #endif // GGML_OPENCL_SOA_Q + if (tensor->type == GGML_TYPE_BF16) { + GGML_ASSERT(offset % sizeof(ggml_fp16_t) == 0 && size % sizeof(ggml_fp16_t) == 0 + && "Offset and size must be multiples of 2 for bf16 tensors"); + + ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; + GGML_ASSERT(extra); + + cl_ulong n_elements = size / sizeof(ggml_fp16_t); + cl_ulong off_src = (extra->offset + tensor->view_offs + offset) / sizeof(ggml_fp16_t); + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_convert_f16_to_bf16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &n_elements)); + + size_t global_work_size[] = { (size_t)CEIL_DIV(n_elements, 64)*64, 1, 1 }; + size_t local_work_size[] = { 64, 1, 1 }; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseEvent(evt)); + + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, 0, size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + + return; + } + ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; CL_CHECK(clEnqueueReadBuffer( @@ -7070,8 +7950,9 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, } static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_dev_t dev = buffer->buft->device; - ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer->buft->device->context; + ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx; + cl_command_queue queue = backend_ctx->queue; ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; @@ -7111,7 +7992,8 @@ static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer } static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { - ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device); + ggml_backend_opencl_context *backend_ctx = ggml_cl_init(buffer_type->device); + load_cl_kernels(backend_ctx); // clCreateBuffer returns -61 for size 0 size = std::max(size, (size_t)1); @@ -7134,15 +8016,15 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { - ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); - return backend_ctx->alignment; + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context; + return dev_ctx->backend_ctx->alignment; } static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { static size_t max_size = -1; if (max_size == (size_t)-1) { - ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); - max_size = backend_ctx->max_alloc_size; + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) buffer_type->device->context; + max_size = dev_ctx->backend_ctx->max_alloc_size; } return max_size; } @@ -7179,14 +8061,13 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_ static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context; - ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) dev_ctx->backend_ctx; static const size_t opencl_extra_margin = 1024ull*1024ull*1024ull; // OpenCL does not provide reliable currently-free device memory. // Use total/global memory as a best-effort upper bound. // Improved safety: Reduce by a 1GiB extra margin for common --fit - *total = backend_ctx->global_mem_size; + *total = dev_ctx->global_mem_size; *free = *total > opencl_extra_margin ? *total - opencl_extra_margin : 0; } @@ -7210,7 +8091,7 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct } static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) { - ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev); + ggml_backend_opencl_context * backend_ctx = ggml_cl_init(dev); // Getting a new reference to the backend, increase ref_count backend_ctx->ref_count++; @@ -7221,6 +8102,8 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co /* .context = */ backend_ctx, }; + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context; + ggml_opencl_print_backend_info(dev_ctx); return backend; GGML_UNUSED(params); @@ -7247,6 +8130,7 @@ static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_bac } static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + ggml_cl_init(dev); return ggml_opencl_supports_op(dev, op); } @@ -7259,8 +8143,8 @@ static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggm // Check cl_context is the same. clEnqueue* commands may not use // buffers from another cl_context. - ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev); - ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device); + ggml_backend_opencl_context * backend_ctx0 = ggml_cl_init(dev); + ggml_backend_opencl_context * backend_ctx1 = ggml_cl_init(buft->device); return backend_ctx0->context == backend_ctx1->context; } @@ -7545,6 +8429,7 @@ static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor kernel = backend_ctx->kernel_cpy_f32_f32; break; case GGML_TYPE_F16: + case GGML_TYPE_BF16: // stored as f16 on device kernel = backend_ctx->kernel_cpy_f16_f16; break; default: @@ -10056,7 +10941,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t size_t local_work_size[] = { lws0, 1, 1 }; size_t * local_work_size_ptr = local_work_size; - if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) { + if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) { local_work_size_ptr = nullptr; } @@ -10505,7 +11390,8 @@ static bool ggml_cl_can_use_adreno_xmem_gemm_f16_f32( if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { return false; } - if (src0->type != GGML_TYPE_F16 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + if ((src0->type != GGML_TYPE_F16 && src0->type != GGML_TYPE_BF16) || + src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { return false; } if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { @@ -12223,7 +13109,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co GGML_ASSERT(dst); GGML_ASSERT(dst->extra); - const enum ggml_type src0t = src0->type; + // bf16 is stored as f16 on device + const enum ggml_type src0t = (src0->type == GGML_TYPE_BF16) ? GGML_TYPE_F16 : src0->type; const enum ggml_type src1t = src1->type; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; @@ -12239,6 +13126,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co #ifdef GGML_OPENCL_SOA_Q ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra; + ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra; + ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra; ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra; ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; ggml_tensor_extra_cl_iq4_nl * extra0_iq4_nl = (ggml_tensor_extra_cl_iq4_nl *)src0->extra; @@ -12574,7 +13463,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); return; } - case GGML_TYPE_Q8_0: { + case GGML_TYPE_Q5_0: { if (ne11 < 32) { break; } @@ -12582,32 +13471,33 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co break; } - kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm; + kernel = backend_ctx->kernel_mul_mm_q5_0_f32_l4_lm; nth0 = 128; // calculated as (BM*BN)/(TM*TN) int batch_stride_a = ne00*ne01; int batch_stride_b = ne10*ne11; int batch_stride_d = ne0*ne1; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3)); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_0->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_a + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_b + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne01)); // stride_d + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_a)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_b)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_d)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r3)); // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed. size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; @@ -12616,7 +13506,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); return; } - case GGML_TYPE_IQ4_NL: { + case GGML_TYPE_Q5_1: { if (ne11 < 32) { break; } @@ -12624,19 +13514,105 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co break; } - kernel = backend_ctx->kernel_mul_mm_iq4_nl_f32_l4_lm; + kernel = backend_ctx->kernel_mul_mm_q5_1_f32_l4_lm; nth0 = 128; // calculated as (BM*BN)/(TM*TN) int batch_stride_a = ne00*ne01; int batch_stride_b = ne10*ne11; int batch_stride_d = ne0*ne1; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_iq4_nl->q)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_1->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3)); + + // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed. + size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; + size_t local_work_size[] = {(size_t)nth0, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + return; + } + case GGML_TYPE_Q8_0: { + if (ne11 < 32) { + break; + } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) { + break; + } + + kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm; + nth0 = 128; // calculated as (BM*BN)/(TM*TN) + + int batch_stride_a = ne00*ne01; + int batch_stride_b = ne10*ne11; + int batch_stride_d = ne0*ne1; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3)); + + // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed. + size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; + size_t local_work_size[] = {(size_t)nth0, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + return; + } + case GGML_TYPE_IQ4_NL: { + if (ne11 < 32) { + break; + } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) { + break; + } + + kernel = backend_ctx->kernel_mul_mm_iq4_nl_f32_l4_lm; + nth0 = 128; // calculated as (BM*BN)/(TM*TN) + + int batch_stride_a = ne00*ne01; + int batch_stride_b = ne10*ne11; + int batch_stride_d = ne0*ne1; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_iq4_nl->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -13110,6 +14086,137 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co #endif // GGML_OPENCL_SOA_Q break; } + case GGML_TYPE_Q5_0: { +#ifdef GGML_OPENCL_SOA_Q + if (backend_ctx->gpu_family == INTEL) { + nth0 = 16; + nth1 = 1; + ndst = 4; + } else if (backend_ctx->gpu_family == ADRENO) { + nth0 = 64; + nth1 = 1; + ndst = 4; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + kernel = backend_ctx->kernel_mul_mv_q5_0_f32_flat; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_0->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r3)); +#else + if (backend_ctx->gpu_family == INTEL) { + nth0 = 16; + nth1 = 1; + ndst = 4; + } else if (backend_ctx->gpu_family == ADRENO) { + nth0 = 64; + nth1 = 1; + ndst = 4; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + kernel = backend_ctx->kernel_mul_mv_q5_0_f32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); +#endif // GGML_OPENCL_SOA_Q + break; + } + case GGML_TYPE_Q5_1: { +#ifdef GGML_OPENCL_SOA_Q + if (backend_ctx->gpu_family == INTEL) { + nth0 = 16; + nth1 = 1; + ndst = 4; + } else if (backend_ctx->gpu_family == ADRENO) { + nth0 = 64; + nth1 = 1; + ndst = 4; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + kernel = backend_ctx->kernel_mul_mv_q5_1_f32_flat; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_1->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3)); +#else + if (backend_ctx->gpu_family == INTEL) { + nth0 = 16; + nth1 = 1; + ndst = 4; + } else if (backend_ctx->gpu_family == ADRENO) { + nth0 = 64; + nth1 = 1; + ndst = 4; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + kernel = backend_ctx->kernel_mul_mv_q5_1_f32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); +#endif // GGML_OPENCL_SOA_Q + break; + } case GGML_TYPE_Q8_0: { #ifdef GGML_OPENCL_SOA_Q kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat; @@ -13550,6 +14657,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 || src0t == GGML_TYPE_Q4_1 || + src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_Q2_K) { @@ -13733,6 +14842,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra; ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra; ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra; + ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra; + ggml_tensor_extra_cl_q5_K * extra0_q5_K = (ggml_tensor_extra_cl_q5_K *)src0->extra; + ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra; ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra; ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; #endif @@ -13741,6 +14853,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, (void)extra0_q4_1; (void)extra0_q5_0; (void)extra0_q5_1; + (void)extra0_q4_K; + (void)extra0_q5_K; + (void)extra0_q6_K; const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; @@ -13773,6 +14888,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; + GGML_UNUSED(ne2); + const int r2 = ne12/ne02; const int r3 = ne13/ne03; const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows @@ -13787,6 +14904,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const int n_tile_size = 32; const int max_post_router_tile = (ne20 * ne21 / n_tile_size) + ne02; + GGML_UNUSED(max_post_router_tile); + cl_kernel kernel; // subgroup mat vec @@ -13800,7 +14919,575 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_size[3] = {64, 2, 1}; if (ne12 == 1) { // for gemv - kernel = backend_ctx->kernel_gemv_moe_q4_0_f32_ns; + kernel = backend_ctx->kernel_gemv_moe_q4_0_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q4_0_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + // Use pre-allocated placeholder + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1; + cl_image_desc image_desc_buf_src1; + image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } // fallback to generic Q4_0 MoE kernel + +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat; + + if (backend_ctx->gpu_family == INTEL) { + sgs = 16; + nsg = 1; + ndst = 8; + } else if (backend_ctx->gpu_family == ADRENO) { + sgs = 64; + nsg = 1; + ndst = 8; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3)); + + break; + } + case GGML_TYPE_Q4_1: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q4_1_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q4_1_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + // Use pre-allocated placeholder + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1; + cl_image_desc image_desc_buf_src1; + image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q5_0: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q5_0_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q5_0_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + // Use pre-allocated placeholder + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1; + cl_image_desc image_desc_buf_src1; + image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q5_1: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q5_1_f32_ns; cl_mem src1_sub_buffer, buf_src1_image, buf_src2; @@ -13812,7 +15499,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(status); // set thread grid - global_size[0] = static_cast(ne01); + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); global_size[1] = 4; global_size[2] = static_cast(ne20); local_size[1] = 4; @@ -13831,8 +15518,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); @@ -13848,9 +15537,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(clReleaseMemObject(src1_sub_buffer)); CL_CHECK(clReleaseMemObject(buf_src1_image)); CL_CHECK(clReleaseMemObject(buf_src2)); - } else { // for gemm - kernel = backend_ctx->kernel_gemm_moe_q4_0_f32_ns; + kernel = backend_ctx->kernel_gemm_moe_q5_1_f32_ns; // Reorder router if called from test-backend-ops or when new router is generated. // Otherwise reuse the reordered result from previous mul_mat_id call. @@ -13934,8 +15622,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->q_img)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); @@ -13962,25 +15652,27 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, clReleaseMemObject(buf_dst_image); } return; - } // fallback to generic Q4_0 MoE kernel - -#endif // GGML_OPENCL_USE_ADRENO_KERNELS - kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q8_0: { +#ifdef GGML_OPENCL_SOA_Q + kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat; if (backend_ctx->gpu_family == INTEL) { sgs = 16; - nsg = 1; - ndst = 8; + nsg = 2; + ndst = 4; } else if (backend_ctx->gpu_family == ADRENO) { sgs = 64; - nsg = 1; - ndst = 8; + nsg = 2; + ndst = 4; } else { GGML_ASSERT(false && "TODO: Unknown GPU"); } - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); @@ -13989,25 +15681,57 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1)); +#else + kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32; + + if (backend_ctx->gpu_family == INTEL) { + sgs = 16; + nsg = 2; + ndst = 4; + } else if (backend_ctx->gpu_family == ADRENO) { + sgs = 64; + nsg = 2; + ndst = 4; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1)); +#endif // GGML_OPENCL_SOA_Q break; } - case GGML_TYPE_Q4_1: { + case GGML_TYPE_Q4_K: { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (use_adreno_moe_kernels(backend_ctx, src0)) { cl_int status; @@ -14016,7 +15740,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_size[3] = {64, 2, 1}; if (ne12 == 1) { // for gemv - kernel = backend_ctx->kernel_gemv_moe_q4_1_f32_ns; + kernel = backend_ctx->kernel_gemv_moe_q4_k_f32_ns; cl_mem src1_sub_buffer, buf_src1_image, buf_src2; @@ -14028,7 +15752,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(status); // set thread grid - global_size[0] = static_cast(ne01); + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); global_size[1] = 4; global_size[2] = static_cast(ne20); local_size[1] = 4; @@ -14047,9 +15771,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->q)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->d)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->s)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); @@ -14067,7 +15792,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(clReleaseMemObject(buf_src2)); } else { // for gemm - kernel = backend_ctx->kernel_gemm_moe_q4_1_f32_ns; + kernel = backend_ctx->kernel_gemm_moe_q4_k_f32_ns; // Reorder router if called from test-backend-ops or when new router is generated. // Otherwise reuse the reordered result from previous mul_mat_id call. @@ -14091,14 +15816,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(status); // Reorder activations - // create a sub_buffer for src1 region.origin = offset1; region.size = ne10 * ne11 * ne12 * sizeof(float); sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); CL_CHECK(status); // Create image for reordered src1 - // Use pre-allocated placeholder region.origin = 0; region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); @@ -14109,10 +15832,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, ®ion, &status); CL_CHECK(status); - cl_image_format image_format_buf_src1; - cl_image_desc image_desc_buf_src1; - image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; - image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); CL_CHECK(status); @@ -14133,7 +15854,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); // MoE kernel prepare - // Create sub buffer for dst region.origin = offsetd; region.size = ne0 * ne1 * ne2 * sizeof(float); sub_buf_dst = clCreateSubBuffer( @@ -14151,9 +15871,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->q_img)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->d)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->s)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); @@ -14183,7 +15904,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, } #endif //GGML_OPENCL_USE_ADRENO_KERNELS } - case GGML_TYPE_Q5_0: { + case GGML_TYPE_Q5_K: { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (use_adreno_moe_kernels(backend_ctx, src0)) { cl_int status; @@ -14192,7 +15913,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_size[3] = {64, 2, 1}; if (ne12 == 1) { // for gemv - kernel = backend_ctx->kernel_gemv_moe_q5_0_f32_ns; + kernel = backend_ctx->kernel_gemv_moe_q5_k_f32_ns; cl_mem src1_sub_buffer, buf_src1_image, buf_src2; @@ -14204,7 +15925,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(status); // set thread grid - global_size[0] = static_cast(ne01); + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); global_size[1] = 4; global_size[2] = static_cast(ne20); local_size[1] = 4; @@ -14223,9 +15944,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->s)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); @@ -14243,7 +15966,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(clReleaseMemObject(buf_src2)); } else { // for gemm - kernel = backend_ctx->kernel_gemm_moe_q5_0_f32_ns; + kernel = backend_ctx->kernel_gemm_moe_q5_k_f32_ns; // Reorder router if called from test-backend-ops or when new router is generated. // Otherwise reuse the reordered result from previous mul_mat_id call. @@ -14285,10 +16008,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, ®ion, &status); CL_CHECK(status); - cl_image_format image_format_buf_src1; - cl_image_desc image_desc_buf_src1; - image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; - image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); CL_CHECK(status); @@ -14327,9 +16048,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs_img)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->dm)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); @@ -14359,7 +16082,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, } #endif //GGML_OPENCL_USE_ADRENO_KERNELS } - case GGML_TYPE_Q5_1: { + case GGML_TYPE_Q6_K: { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (use_adreno_moe_kernels(backend_ctx, src0)) { cl_int status; @@ -14368,7 +16091,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_size[3] = {64, 2, 1}; if (ne12 == 1) { // for gemv - kernel = backend_ctx->kernel_gemv_moe_q5_1_f32_ns; + kernel = backend_ctx->kernel_gemv_moe_q6_k_f32_ns; cl_mem src1_sub_buffer, buf_src1_image, buf_src2; @@ -14380,7 +16103,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(status); // set thread grid - global_size[0] = static_cast(ne01); + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); global_size[1] = 4; global_size[2] = static_cast(ne20); local_size[1] = 4; @@ -14399,10 +16122,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->ql)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->d)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); @@ -14418,8 +16141,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(clReleaseMemObject(src1_sub_buffer)); CL_CHECK(clReleaseMemObject(buf_src1_image)); CL_CHECK(clReleaseMemObject(buf_src2)); + } else { // for gemm - kernel = backend_ctx->kernel_gemm_moe_q5_1_f32_ns; + kernel = backend_ctx->kernel_gemm_moe_q6_k_f32_ns; // Reorder router if called from test-backend-ops or when new router is generated. // Otherwise reuse the reordered result from previous mul_mat_id call. @@ -14450,7 +16174,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(status); // Create image for reordered src1 - // Use pre-allocated placeholder region.origin = 0; region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); @@ -14461,10 +16184,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, ®ion, &status); CL_CHECK(status); - cl_image_format image_format_buf_src1; - cl_image_desc image_desc_buf_src1; - image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; - image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); CL_CHECK(status); @@ -14503,10 +16224,10 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, // Set kernel args int arg_idx = 0; - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs_img)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d)); - CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->ql_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->d)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); @@ -14536,82 +16257,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, } #endif //GGML_OPENCL_USE_ADRENO_KERNELS } - case GGML_TYPE_Q8_0: { -#ifdef GGML_OPENCL_SOA_Q - kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat; - - if (backend_ctx->gpu_family == INTEL) { - sgs = 16; - nsg = 2; - ndst = 4; - } else if (backend_ctx->gpu_family == ADRENO) { - sgs = 64; - nsg = 2; - ndst = 4; - } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); - } - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1)); -#else - kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32; - - if (backend_ctx->gpu_family == INTEL) { - sgs = 16; - nsg = 2; - ndst = 4; - } else if (backend_ctx->gpu_family == ADRENO) { - sgs = 64; - nsg = 2; - ndst = 4; - } else { - GGML_ASSERT(false && "TODO: Unknown GPU"); - } - - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1)); -#endif // GGML_OPENCL_SOA_Q - break; - } case GGML_TYPE_MXFP4: { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (use_adreno_moe_kernels(backend_ctx, src0)) { @@ -14633,7 +16278,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, CL_CHECK(status); // set thread grid - global_size[0] = static_cast(ne01); + global_size[0] = static_cast(((ne01 + 63) / 64) * 64); global_size[1] = 4; global_size[2] = static_cast(ne20); local_size[1] = 4; @@ -16021,6 +17666,185 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_cl_gated_delta_net(ggml_backend_t backend, ggml_tensor * dst) { + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const ggml_tensor * src_q = dst->src[0]; + const ggml_tensor * src_k = dst->src[1]; + const ggml_tensor * src_v = dst->src[2]; + const ggml_tensor * src_g = dst->src[3]; + const ggml_tensor * src_beta = dst->src[4]; + const ggml_tensor * src_state = dst->src[5]; + + GGML_ASSERT(src_q && src_q->extra); + GGML_ASSERT(src_k && src_k->extra); + GGML_ASSERT(src_v && src_v->extra); + GGML_ASSERT(src_g && src_g->extra); + GGML_ASSERT(src_beta && src_beta->extra); + GGML_ASSERT(src_state && src_state->extra); + + ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context; + + const cl_uint S_v = (cl_uint) src_v->ne[0]; + const cl_uint H_v = (cl_uint) src_v->ne[1]; + const cl_uint n_tokens = (cl_uint) src_v->ne[2]; + const cl_uint n_seqs = (cl_uint) src_v->ne[3]; + const cl_uint K = (cl_uint) src_state->ne[1]; + + int si; + switch (S_v) { + case 16: si = 0; break; + case 32: si = 1; break; + case 64: si = 2; break; + case 128: si = 3; break; + default: + GGML_ASSERT(false && "ggml_cl_gated_delta_net: unsupported S_v"); + } + + const int kda = (src_g->ne[0] == (int64_t) S_v) ? 1 : 0; + + // TODO: Optimize when S_v!=128. Not necessary for now as Qwen3.5/6 are all S_v=128 + // token generation mode (tgpp=0): + // process 1 token at a time, so columns per lane (cpl) == 1 + // prompt processing mode (tgpp=1): + // cpl=4 to process 4 tokens for single-token. 4 is chosen for Adreno 750 as per + // work-item/thread has at most 128 registers. + // All Qwen3.5/6 models are S_v == 128, so LANES_PER_COLUMN == 8 + // such that ROWS_PER_LANE = 128/8 = 16 + // Variables in the kernel: + // k_reg, q_reg, g_exp are all 16 floats + // s_shard has cpl*ROWS_PER_LANE = 4*16 = 64 floats + // Total 112 registers used. + // subgroups_per_workgroup (spw) can be set to 1,2,4,8,16 for tg and 1,2,4 for pp + // for S_v=128. + // Empirically found that when spw=1, we get the best performance for both tg and pp + const int tgpp = (n_tokens == 1) ? 0 : 1; + const int cpl = (tgpp == 0) ? 1 : 4; + // spw needs adjustment when S_v != 128 + const int spw = (tgpp == 0) ? 1 : 1; + + cl_kernel kernel = backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp]; + GGML_ASSERT(kernel != nullptr); + + const cl_uint s_off = S_v * H_v * n_tokens * n_seqs; + + const cl_uint sq1 = (cl_uint)(src_q->nb[1] / sizeof(float)); + const cl_uint sq2 = (cl_uint)(src_q->nb[2] / sizeof(float)); + const cl_uint sq3 = (cl_uint)(src_q->nb[3] / sizeof(float)); + const cl_uint sv1 = (cl_uint)(src_v->nb[1] / sizeof(float)); + const cl_uint sv2 = (cl_uint)(src_v->nb[2] / sizeof(float)); + const cl_uint sv3 = (cl_uint)(src_v->nb[3] / sizeof(float)); + const cl_uint sb1 = (cl_uint)(src_beta->nb[1] / sizeof(float)); + const cl_uint sb2 = (cl_uint)(src_beta->nb[2] / sizeof(float)); + const cl_uint sb3 = (cl_uint)(src_beta->nb[3] / sizeof(float)); + + const cl_uint H_k = (cl_uint) src_q->ne[1]; + const cl_uint rq3 = (cl_uint)(src_v->ne[3] / src_q->ne[3]); + + const float scale = 1.0f / sqrtf((float) S_v); + + ggml_tensor_extra_cl * extra_q = (ggml_tensor_extra_cl *) src_q->extra; + ggml_tensor_extra_cl * extra_k = (ggml_tensor_extra_cl *) src_k->extra; + ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *) src_v->extra; + ggml_tensor_extra_cl * extra_g = (ggml_tensor_extra_cl *) src_g->extra; + ggml_tensor_extra_cl * extra_beta = (ggml_tensor_extra_cl *) src_beta->extra; + ggml_tensor_extra_cl * extra_state = (ggml_tensor_extra_cl *) src_state->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *) dst->extra; + + const cl_ulong off_q = extra_q->offset + src_q->view_offs; + const cl_ulong off_k = extra_k->offset + src_k->view_offs; + const cl_ulong off_v = extra_v->offset + src_v->view_offs; + const cl_ulong off_g = extra_g->offset + src_g->view_offs; + const cl_ulong off_beta = extra_beta->offset + src_beta->view_offs; + const cl_ulong off_state = extra_state->offset + src_state->view_offs; + const cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + int idx = 0; + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_q->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_q)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_k->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_k)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_v->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_v)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_g->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_g)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_beta->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_beta)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_state->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_state)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_v)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_tokens)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_seqs)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s_off)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq2)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv2)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb1)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb2)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_k)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &rq3)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(float), &scale)); + CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &K)); + + // Subgroup size is 64 for Adreno and 32 for Intel + const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1; + if (sg_size < 0) { + GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n"); + exit(1); + } + + // For the subgroup-shuffle kernel, we can safely prefer 8 lanes/column for S_v>=128 + // For the subgroup-shuffle kernel: + // S_v >= 128 -> prefer 8 lanes/column (good occupancy & register pressure tradeoff) + // else -> min(S_v, subgroup_size) + int lanes_per_column; + if ((int)S_v >= 128) { + lanes_per_column = 8; + } else { + lanes_per_column = std::min((int)S_v, sg_size); + } + + // Max workgroup size for Adreno 750 is 1024 + const int wg_size = sg_size * spw; + + // Ensure lanes_per_column is a power-of-two and divides both S_v and subgroup_size. + // (Required for lane-group shuffle-xor reduction correctness.) + while (lanes_per_column > 1 && + (((lanes_per_column & (lanes_per_column - 1)) != 0) || + (((int)S_v % lanes_per_column) != 0) || + (sg_size % lanes_per_column) != 0)) { + lanes_per_column >>= 1; + } + GGML_ASSERT(lanes_per_column >= 1); + GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0)); + GGML_ASSERT(((int)S_v % lanes_per_column) == 0); + GGML_ASSERT((sg_size % lanes_per_column) == 0); + + const int cols_per_wg = spw * (sg_size / lanes_per_column) * cpl; + GGML_ASSERT(cols_per_wg > 0); + GGML_ASSERT(((int)S_v % cols_per_wg) == 0); + + size_t global_work_size[3]; + size_t local_work_size[3]; + + global_work_size[0] = (size_t) H_v * (size_t) wg_size; + global_work_size[1] = (size_t) n_seqs; + global_work_size[2] = (size_t) S_v / (size_t) cols_per_wg; + + local_work_size[0] = (size_t) wg_size; + local_work_size[1] = 1; + local_work_size[2] = 1; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + //------------------------------------------------------------------------------ // Op offloading //------------------------------------------------------------------------------ @@ -16236,8 +18060,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_group_norm; break; - case GGML_OP_REPEAT: - if (!any_on_device) { + case GGML_OP_REPEAT: + if (!any_on_device) { return false; } func = ggml_cl_repeat; @@ -16266,6 +18090,14 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_ssm_conv; break; + case GGML_OP_GATED_DELTA_NET: + if (!any_on_device) { + return false; + } + // GDN has 6 source tensors, so it cannot use the standard + // (src0, src1, dst) func signature. Dispatch directly and return. + ggml_cl_gated_delta_net(backend, tensor); + return true; case GGML_OP_CONCAT: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index 8f06d570587..d07f0a1a025 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -117,6 +117,48 @@ struct block_iq4_nl uint8_t qs[QK4_NL / 2]; }; +//------------------------------------------------------------------------------ +// bf16 to f16 +//------------------------------------------------------------------------------ +kernel void kernel_convert_bf16_to_f16( + global const ushort * src, + global half * dst, + ulong off_dst, + ulong n +) { + uint i = get_global_id(0); + if (i >= n) { + return; + } + + dst[i + off_dst] = (half) as_float((uint) src[i] << 16); +} + +//------------------------------------------------------------------------------ +// f16 to bf16 +//------------------------------------------------------------------------------ +kernel void kernel_convert_f16_to_bf16( + global const half * src, + ulong off_src, + global ushort * dst, + ulong n +) { + uint i = get_global_id(0); + if (i >= n) { + return; + } + + float f = (float) src[i + off_src]; + uint bits = as_uint(f); + if ((bits & 0x7fffffffu) > 0x7f800000u) { + // nan to quiet nan + dst[i] = (ushort)((bits >> 16) | 0x40u); + } else { + uint rounded = bits + 0x7fffu + ((bits >> 16) & 1u); + dst[i] = (ushort)(rounded >> 16); + } +} + //------------------------------------------------------------------------------ // kernel_convert_block_q4_0 // Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA). @@ -220,6 +262,10 @@ kernel void kernel_convert_block_q4_0_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK4_0; uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -263,6 +309,10 @@ kernel void kernel_restore_block_q4_0_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK4_0; uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -401,6 +451,10 @@ kernel void kernel_convert_block_q4_1_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK4_1; uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -446,6 +500,10 @@ kernel void kernel_restore_block_q4_1_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK4_1; uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -479,6 +537,53 @@ kernel void kernel_restore_block_q4_1_trans4_ns( ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; } +//------------------------------------------------------------------------------ +// kernel_convert_block_q5_0 +// Convert the block_q5_0 format to 3 separate arrays (AOS -> SOA). +// This kernel does not deshuffle the bits. +//------------------------------------------------------------------------------ +kernel void kernel_convert_block_q5_0( + global struct block_q5_0 * src0, + global uchar * dst_qs, + global uint * dst_qh, + global half * dst_d, + ulong n_blk +) { + if (get_global_id(0) >= n_blk) { + return; + } + + global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0); + global uchar * qs = (global uchar *) dst_qs + (QK5_0/2)*get_global_id(0); + global uint * qh = (global uint *) dst_qh + get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + + *d = b->d; + *qh = *((global uint *)(b->qh)); + + for (int i = 0; i < QK5_0/2; ++i) { + qs[i] = b->qs[i]; + } +} + +kernel void kernel_restore_block_q5_0( + global uchar * src_qs, + global uint * src_qh, + global half * src_d, + global struct block_q5_0 * dst +) { + global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0); + global uchar * qs = (global uchar *) src_qs + (QK5_0/2)*get_global_id(0); + global uint * qh = (global uint *) src_qh + get_global_id(0); + global half * d = (global half *) src_d + get_global_id(0); + + b->d = *d; + *((global uint *)(b->qh)) = *qh; + for (int i = 0; i < QK5_0/2; ++i) { + b->qs[i] = qs[i]; + } +} + kernel void kernel_convert_block_q5_0_trans4_ns( __global struct block_q5_0 * src0, __global uint * dst_qs, @@ -491,6 +596,10 @@ kernel void kernel_convert_block_q5_0_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK5_0; uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -536,6 +645,10 @@ kernel void kernel_restore_block_q5_0_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK5_0; uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -570,6 +683,59 @@ kernel void kernel_restore_block_q5_0_trans4_ns( ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; } +//------------------------------------------------------------------------------ +// kernel_convert_block_q5_1 +// Convert the block_q5_1 format to 4 separate arrays (AOS -> SOA). +// This kernel does not deshuffle the bits. +//------------------------------------------------------------------------------ +kernel void kernel_convert_block_q5_1( + global struct block_q5_1 * src0, + global uchar * dst_qs, + global uint * dst_qh, + global half * dst_d, + global half * dst_m, + ulong n_blk +) { + if (get_global_id(0) >= n_blk) { + return; + } + + global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0); + global uchar * qs = (global uchar *) dst_qs + (QK5_1/2)*get_global_id(0); + global uint * qh = (global uint *) dst_qh + get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + global half * m = (global half *) dst_m + get_global_id(0); + + *d = b->d; + *m = b->m; + *qh = *((global uint *)(b->qh)); + + for (int i = 0; i < QK5_1/2; ++i) { + qs[i] = b->qs[i]; + } +} + +kernel void kernel_restore_block_q5_1( + global uchar * src_qs, + global uint * src_qh, + global half * src_d, + global half * src_m, + global struct block_q5_1 * dst +) { + global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0); + global uchar * qs = (global uchar *) src_qs + (QK5_1/2)*get_global_id(0); + global uint * qh = (global uint *) src_qh + get_global_id(0); + global half * d = (global half *) src_d + get_global_id(0); + global half * m = (global half *) src_m + get_global_id(0); + + b->d = *d; + b->m = *m; + *((global uint *)(b->qh)) = *qh; + for (int i = 0; i < QK5_1/2; ++i) { + b->qs[i] = qs[i]; + } +} + kernel void kernel_convert_block_q5_1_trans4_ns( __global struct block_q5_1 * src0, __global uint * dst_qs, @@ -583,6 +749,10 @@ kernel void kernel_convert_block_q5_1_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK5_1; uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -630,6 +800,10 @@ kernel void kernel_restore_block_q5_1_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK5_1; uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -664,6 +838,415 @@ kernel void kernel_restore_block_q5_1_trans4_ns( ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; } +kernel void kernel_convert_block_q4_k_trans4_ns( + __global struct block_q4_K * src0, + __global uint * dst_q, + __global half * dst_d, + __global half * dst_dm, + __global uchar * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + if (i01 >= ne01) { + return; + } + + uint ne00_blk = ne00 / QK_K; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q4_K * b = src0 + src_blk_offset; + + dst_d [dst_blk_offset] = b->d; + dst_dm[dst_blk_offset] = b->dm; + + uint4 qv[8]; + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->q[i*32 + 2*j]; + uchar x1 = b->q[i*32 + 2*j + 1]; + + qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + } + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qv[p]; + dst_q[base + (p * 4 + 0) * ne01] = v.x; + dst_q[base + (p * 4 + 1) * ne01] = v.y; + dst_q[base + (p * 4 + 2) * ne01] = v.z; + dst_q[base + (p * 4 + 3) * ne01] = v.w; + } + + __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + #pragma unroll + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s_dst[i] = b->s[i]; + } +} + +kernel void kernel_restore_block_q4_k_trans4_ns( + __global uint * src_q, + __global half * src_d, + __global half * src_dm, + __global uchar * src_s, + __global struct block_q4_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + if (i01 >= ne01) { + return; + } + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q4_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + b->dm = src_dm[src_blk_offset]; + + __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s_src[i]; + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + uint4 qv[8]; + for (int p = 0; p < 8; ++p) { + qv[p].x = src_q[base + (p * 4 + 0) * ne01]; + qv[p].y = src_q[base + (p * 4 + 1) * ne01]; + qv[p].z = src_q[base + (p * 4 + 2) * ne01]; + qv[p].w = src_q[base + (p * 4 + 3) * ne01]; + } + + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo = qv_bytes[i*32 + j]; + uchar hi = qv_bytes[i*32 + j + 16]; + b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); + b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); + } + } +} + +kernel void kernel_convert_block_q5_k_trans4_ns( + __global struct block_q5_K * src0, + __global uint * dst_qs, + __global uint * dst_qh, + __global half * dst_d, + __global half * dst_dm, + __global uchar * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + if (i01 >= ne01) { + return; + } + + uint ne00_blk = ne00 / QK_K; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q5_K * b = src0 + src_blk_offset; + + dst_d [dst_blk_offset] = b->d; + dst_dm[dst_blk_offset] = b->dm; + + for (int k = 0; k < 8; k++) { + uchar b0 = 0, b1 = 0, b2 = 0, b3 = 0; + for (int bit = 0; bit < 8; bit++) { + b0 |= (uchar)(((b->qh[bit] >> k) & 1) << bit); + b1 |= (uchar)(((b->qh[8 + bit] >> k) & 1) << bit); + b2 |= (uchar)(((b->qh[16 + bit] >> k) & 1) << bit); + b3 |= (uchar)(((b->qh[24 + bit] >> k) & 1) << bit); + } + uint packed = (uint)b0 | ((uint)b1 << 8) | ((uint)b2 << 16) | ((uint)b3 << 24); + dst_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01] = packed; + } + + uint4 qv[8]; + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->qs[i*32 + 2*j]; + uchar x1 = b->qs[i*32 + 2*j + 1]; + + qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + } + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qv[p]; + dst_qs[base + (p * 4 + 0) * ne01] = v.x; + dst_qs[base + (p * 4 + 1) * ne01] = v.y; + dst_qs[base + (p * 4 + 2) * ne01] = v.z; + dst_qs[base + (p * 4 + 3) * ne01] = v.w; + } + + __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + #pragma unroll + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s_dst[i] = b->s[i]; + } +} + +kernel void kernel_restore_block_q5_k_trans4_ns( + __global uint * src_qs, + __global uint * src_qh, + __global half * src_d, + __global half * src_dm, + __global uchar * src_s, + __global struct block_q5_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + if (i01 >= ne01) { + return; + } + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q5_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + b->dm = src_dm[src_blk_offset]; + + for (int j = 0; j < 32; j++) b->qh[j] = 0; + for (int k = 0; k < 8; k++) { + uint packed = src_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01]; + uchar b0 = (uchar)(packed & 0xFF); + uchar b1 = (uchar)((packed >> 8) & 0xFF); + uchar b2 = (uchar)((packed >> 16) & 0xFF); + uchar b3 = (uchar)((packed >> 24) & 0xFF); + for (int bit = 0; bit < 8; bit++) { + b->qh[bit] |= (uchar)(((b0 >> bit) & 1) << k); + b->qh[8 + bit] |= (uchar)(((b1 >> bit) & 1) << k); + b->qh[16 + bit] |= (uchar)(((b2 >> bit) & 1) << k); + b->qh[24 + bit] |= (uchar)(((b3 >> bit) & 1) << k); + } + } + + __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s_src[i]; + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + uint4 qv[8]; + for (int p = 0; p < 8; ++p) { + qv[p].x = src_qs[base + (p * 4 + 0) * ne01]; + qv[p].y = src_qs[base + (p * 4 + 1) * ne01]; + qv[p].z = src_qs[base + (p * 4 + 2) * ne01]; + qv[p].w = src_qs[base + (p * 4 + 3) * ne01]; + } + + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo = qv_bytes[i*32 + j]; + uchar hi = qv_bytes[i*32 + j + 16]; + b->qs[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); + b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); + } + } +} + +kernel void kernel_convert_block_q6_k_trans4_ns( + __global struct block_q6_K * src0, + __global uint * dst_ql, + __global uint * dst_qh, + __global half * dst_d, + __global char * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + if (i01 >= ne01) { + return; + } + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q6_K * b = src0 + src_blk_offset; + + dst_d[dst_blk_offset] = b->d; + + uint4 qlv[8]; + uchar * qlv_bytes = (uchar *)qlv; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->ql[i*64 + 2*j]; + uchar x1 = b->ql[i*64 + 2*j + 1]; + uchar x2 = b->ql[i*64 + 32 + 2*j]; + uchar x3 = b->ql[i*64 + 32 + 2*j + 1]; + qlv_bytes[i*64 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qlv_bytes[i*64 + j + 16] = convert_uchar(x2 & mask_0F) | convert_uchar((x3 & mask_0F) << 4); + qlv_bytes[i*64 + j + 32] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + qlv_bytes[i*64 + j + 48] = convert_uchar((x2 & mask_F0) >> 4) | convert_uchar(x3 & mask_F0); + } + } + + uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qlv[p]; + dst_ql[ql_base + (p * 4 + 0) * ne01] = v.x; + dst_ql[ql_base + (p * 4 + 1) * ne01] = v.y; + dst_ql[ql_base + (p * 4 + 2) * ne01] = v.z; + dst_ql[ql_base + (p * 4 + 3) * ne01] = v.w; + } + + uint qhv[16] = {0}; + + for (int n = 0; n < 2; ++n) { + for (int l = 0; l < 32; ++l) { + uchar h = b->qh[n*32 + l]; + int u = l / 16; + int bit_pos = (l % 16) * 2; + qhv[(n*4 + 0)*2 + u] |= ((uint)((h >> 0) & 0x03)) << bit_pos; + qhv[(n*4 + 1)*2 + u] |= ((uint)((h >> 2) & 0x03)) << bit_pos; + qhv[(n*4 + 2)*2 + u] |= ((uint)((h >> 4) & 0x03)) << bit_pos; + qhv[(n*4 + 3)*2 + u] |= ((uint)((h >> 6) & 0x03)) << bit_pos; + } + } + + uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01; + + for (int p = 0; p < 16; ++p) { + dst_qh[qh_base + p * ne01] = qhv[p]; + } + + __global char * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16; + #pragma unroll + for (int i = 0; i < 16; ++i) { + s_dst[i] = b->scales[i]; + } +} + +kernel void kernel_restore_block_q6_k_trans4_ns( + __global uint * src_ql, + __global uint * src_qh, + __global half * src_d, + __global char * src_s, + __global struct block_q6_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + if (i01 >= ne01) { + return; + } + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q6_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + + uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + uint4 qlv[8]; + for (int p = 0; p < 8; ++p) { + qlv[p].x = src_ql[ql_base + (p * 4 + 0) * ne01]; + qlv[p].y = src_ql[ql_base + (p * 4 + 1) * ne01]; + qlv[p].z = src_ql[ql_base + (p * 4 + 2) * ne01]; + qlv[p].w = src_ql[ql_base + (p * 4 + 3) * ne01]; + } + + uchar * qlv_bytes = (uchar *)qlv; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo_02 = qlv_bytes[i*64 + j]; + uchar lo_13 = qlv_bytes[i*64 + j + 16]; + uchar hi_02 = qlv_bytes[i*64 + j + 32]; + uchar hi_13 = qlv_bytes[i*64 + j + 48]; + b->ql[i*64 + 2*j] = convert_uchar((lo_02 & mask_0F) | ((hi_02 & mask_0F) << 4)); + b->ql[i*64 + 2*j + 1] = convert_uchar(((lo_02 & mask_F0) >> 4) | (hi_02 & mask_F0)); + b->ql[i*64 + 32 + 2*j] = convert_uchar((lo_13 & mask_0F) | ((hi_13 & mask_0F) << 4)); + b->ql[i*64 + 32 + 2*j + 1] = convert_uchar(((lo_13 & mask_F0) >> 4) | (hi_13 & mask_F0)); + } + } + + uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01; + uint qhv[16]; + for (int p = 0; p < 16; ++p) { + qhv[p] = src_qh[qh_base + p * ne01]; + } + + for (int n = 0; n < 2; ++n) { + for (int l = 0; l < 32; ++l) { + int u = l / 16; + int bit_pos = (l % 16) * 2; + uchar v0 = (uchar)((qhv[(n*4 + 0)*2 + u] >> bit_pos) & 0x03); + uchar v1 = (uchar)((qhv[(n*4 + 1)*2 + u] >> bit_pos) & 0x03); + uchar v2 = (uchar)((qhv[(n*4 + 2)*2 + u] >> bit_pos) & 0x03); + uchar v3 = (uchar)((qhv[(n*4 + 3)*2 + u] >> bit_pos) & 0x03); + b->qh[n*32 + l] = v0 | (v1 << 2) | (v2 << 4) | (v3 << 6); + } + } + + __global char * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16; + for (int i = 0; i < 16; ++i) { + b->scales[i] = s_src[i]; + } +} + //------------------------------------------------------------------------------ // block_mxfp4 //------------------------------------------------------------------------------ @@ -762,6 +1345,10 @@ kernel void kernel_convert_block_mxfp4_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK_MXFP4; uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; @@ -805,6 +1392,10 @@ kernel void kernel_restore_block_mxfp4_trans4_ns( uint i01 = get_global_id(0); uint i02 = get_global_id(2); + if (i01 >= ne01) { + return; + } + uint ne00_blk = ne00 / QK_MXFP4; uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; diff --git a/ggml/src/ggml-opencl/kernels/gated_delta_net.cl b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl new file mode 100644 index 00000000000..d11192f5802 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gated_delta_net.cl @@ -0,0 +1,247 @@ +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#ifndef S_V +#define S_V 128 +#endif +#ifndef KDA +#define KDA 0 +#endif +#ifndef SUBGROUP_SIZE +#define SUBGROUP_SIZE 64 +#endif +#ifndef LANES_PER_COLUMN +#define LANES_PER_COLUMN 8 +#endif +#ifndef COLS_PER_LANE_GROUP +#define COLS_PER_LANE_GROUP 1 +#endif +#ifndef SUBGROUPS_PER_WG +#define SUBGROUPS_PER_WG 1 +#endif +#ifndef USE_QCOM_SUBGROUP_SHUFFLE +#define USE_QCOM_SUBGROUP_SHUFFLE 0 +#endif + +#define WG_SIZE (SUBGROUP_SIZE * SUBGROUPS_PER_WG) +#define LANE_GROUPS_PER_SG (SUBGROUP_SIZE / LANES_PER_COLUMN) +#define COLS_PER_SG (LANE_GROUPS_PER_SG * COLS_PER_LANE_GROUP) +#define COLS_PER_WG (SUBGROUPS_PER_WG * COLS_PER_SG) +#define ROWS_PER_LANE (S_V / LANES_PER_COLUMN) + +#if USE_QCOM_SUBGROUP_SHUFFLE +#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable +#endif + +// XOR-based parallel sum +// This does a reduction across groups of LANES_PER_COLUMN +static inline float reduce_add_shmem(float partial, __local float * temp, uint lane) { +#if USE_QCOM_SUBGROUP_SHUFFLE + #pragma unroll + for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) { + partial += qcom_sub_group_shuffle_xor(partial, s, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, partial); + } + return partial; +#else + temp[lane] = partial; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) { + float other = temp[lane ^ s]; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + temp[lane] += other; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + } + const float result = temp[lane]; + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + return result; +#endif +} + +#define REDUCE_PARTIAL(partial, temp_ptr, lid) \ + ((LANES_PER_COLUMN == 1u) ? (partial) : reduce_add_shmem((partial), (temp_ptr), (lid))) + +// force compiler to optimize kernel for a specific fixed work-group size +__attribute__((reqd_work_group_size(WG_SIZE, 1, 1))) +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_32 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_gated_delta_net( + global const char * q_buf, ulong off_q, + global const char * k_buf, ulong off_k, + global const char * v_buf, ulong off_v, + global const char * g_buf, ulong off_g, + global const char * beta_buf, ulong off_beta, + global const char * state_buf, ulong off_state, + global char * dst_buf, ulong off_dst, + uint H_v, + uint n_tokens, + uint n_seqs, + uint s_off, + uint sq1, uint sq2, uint sq3, + uint sv1, uint sv2, uint sv3, + uint sb1, uint sb2, uint sb3, + uint H_k, + uint rq3, + float scale, + uint K) { + + global const float * data_q = (global const float *)(q_buf + off_q); + global const float * data_k = (global const float *)(k_buf + off_k); + global const float * data_v = (global const float *)(v_buf + off_v); + global const float * data_g = (global const float *)(g_buf + off_g); + global const float * data_beta = (global const float *)(beta_buf + off_beta); + global const float * data_state = (global const float *)(state_buf + off_state); + global float * data_dst = (global float *)(dst_buf + off_dst); + + const uint head_id = get_group_id(0); + const uint seq_id = get_group_id(1); + const uint tid = (uint)get_local_id(0); + + const uint sg_id = get_sub_group_id(); // subgroup id + const uint sg_lid = get_sub_group_local_id(); // subgroup lane id + + const uint lane = sg_lid % LANES_PER_COLUMN; + const uint lane_group = sg_lid / LANES_PER_COLUMN; + const uint wg_col_base = get_group_id(2) * COLS_PER_WG; + const uint sg_col_base = wg_col_base + sg_id * COLS_PER_SG; + + const uint iq1 = head_id % H_k; // head index for Q and K + const uint iq3 = seq_id / rq3; // seq index for Q and K + + const uint state_size = S_V * S_V; + const uint state_base = (seq_id * K * H_v + head_id) * state_size; + const uint q_off_base = iq3 * sq3 + iq1 * sq1; + const uint v_off_base = seq_id * sv3 + head_id * sv1; + const uint gb_off_base = seq_id * sb3 + head_id * sb1; + const uint state_out_base = (seq_id * H_v + head_id) * state_size; + const uint state_size_per_snap = state_size * H_v * n_seqs; + + __local float reduce_temp[WG_SIZE]; + __local float * temp_ptr = reduce_temp + sg_id * SUBGROUP_SIZE; + + float s_shard[COLS_PER_LANE_GROUP][ROWS_PER_LANE]; + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + s_shard[cg][r] = data_state[state_base + col * S_V + r * LANES_PER_COLUMN + lane]; + } + } + + const int shift = (int)n_tokens - (int)K; + uint attn_off = (seq_id * n_tokens * H_v + head_id) * S_V; + + for (uint t = 0; t < n_tokens; t++) { + const uint q_off = q_off_base + t * sq2; + const uint k_off = q_off; + const uint v_off = v_off_base + t * sv2; + const uint gb_off = gb_off_base + t * sb2; + const float beta_val = data_beta[gb_off]; + + float k_reg[ROWS_PER_LANE]; + float q_reg[ROWS_PER_LANE]; +#if KDA + float g_exp[ROWS_PER_LANE]; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + const uint i = r * LANES_PER_COLUMN + lane; + k_reg[r] = data_k[k_off + i]; + q_reg[r] = data_q[q_off + i]; + g_exp[r] = exp(data_g[gb_off * S_V + i]); + } +#else + const float g_val = exp(data_g[gb_off]); + + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + const uint i = r * LANES_PER_COLUMN + lane; + k_reg[r] = data_k[k_off + i]; + q_reg[r] = data_q[q_off + i]; + } +#endif + + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + float v_val = data_v[v_off + col]; + + float kv_shard = 0.0f; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { +#if KDA + float gs = g_exp[r] * s_shard[cg][r]; + kv_shard += gs * k_reg[r]; +#else + kv_shard += s_shard[cg][r] * k_reg[r]; +#endif + } + +#if !KDA + kv_shard *= g_val; // Applied once instead of ROWS_PER_LANE times +#endif + + const float kv_col = REDUCE_PARTIAL(kv_shard, temp_ptr, sg_lid); + + const float delta_col = (v_val - kv_col) * beta_val; + + float attn_partial = 0.0f; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { +#if KDA + float gs = g_exp[r] * s_shard[cg][r]; +#else + float gs = g_val * s_shard[cg][r]; +#endif + s_shard[cg][r] = gs + k_reg[r] * delta_col; + attn_partial += s_shard[cg][r] * q_reg[r]; + } + const float attn_col = REDUCE_PARTIAL(attn_partial, temp_ptr, sg_lid); + + if (lane == 0) { + data_dst[attn_off + col] = attn_col * scale; + } + } + attn_off += S_V * H_v; + + if (K > 1u) { + const int target_slot = (int)t - shift; + if (target_slot >= 0 && target_slot < (int)K) { + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + const uint slot_base = s_off + (uint)target_slot * state_size_per_snap + state_out_base; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + data_dst[slot_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r]; + } + } + } + } + } + + if (K == 1u) { + #pragma unroll + for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) { + const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group; + #pragma unroll + for (uint r = 0; r < ROWS_PER_LANE; r++) { + data_dst[s_off + state_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r]; + } + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl index e404f392bdd..02cdbdd9fb1 100644 --- a/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl @@ -163,7 +163,7 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns( uint block_id_n = get_global_id(2); // n_tile // Boundary check - if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + if (block_id_n >= total_tiles[0]) { return; } @@ -248,6 +248,10 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns( dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); } + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + // Load poster router and share in LM __local uint out_idx[TILESIZE_N]; diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl index 02290c17eb1..d403ed0cab1 100644 --- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl @@ -115,7 +115,7 @@ kernel void kernel_gemm_moe_q4_0_f32_ns( uint block_id_n = get_global_id(2); // n_tile // Boundary check - if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + if (block_id_n >= total_tiles[0]) { return; } @@ -198,6 +198,10 @@ kernel void kernel_gemm_moe_q4_0_f32_ns( dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); } + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + // Load poster router and share in LM __local uint out_idx[TILESIZE_N]; diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl index e2574ae0187..b2bddf3f73a 100644 --- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl @@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q4_1_f32_ns( uint block_id_n = get_global_id(2); // n_tile // Boundary check - if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + if (block_id_n >= total_tiles[0]) { return; } @@ -200,6 +200,10 @@ kernel void kernel_gemm_moe_q4_1_f32_ns( dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); } + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + // Load poster router and share in LM __local uint out_idx[TILESIZE_N]; diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl new file mode 100644 index 00000000000..ab8228d18ca --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl @@ -0,0 +1,283 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 +#define K_SCALE_SIZE 12 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +#define dequantize_q4_k(q4, a_f16, scale, minv) \ + a_f16.s0 = (half)((float)(q4.s0 & 0x000F) * scale - minv); \ + a_f16.s1 = (half)((float)((q4.s0 & 0x00F0) >> 4) * scale - minv); \ + a_f16.s2 = (half)((float)((q4.s0 & 0x0F00) >> 8) * scale - minv); \ + a_f16.s3 = (half)((float)((q4.s0 & 0xF000) >> 12) * scale - minv); \ + a_f16.s4 = (half)((float)(q4.s1 & 0x000F) * scale - minv); \ + a_f16.s5 = (half)((float)((q4.s1 & 0x00F0) >> 4) * scale - minv); \ + a_f16.s6 = (half)((float)((q4.s1 & 0x0F00) >> 8) * scale - minv); \ + a_f16.s7 = (half)((float)((q4.s1 & 0xF000) >> 12) * scale - minv); \ + a_f16.s8 = (half)((float)(q4.s2 & 0x000F) * scale - minv); \ + a_f16.s9 = (half)((float)((q4.s2 & 0x00F0) >> 4) * scale - minv); \ + a_f16.sa = (half)((float)((q4.s2 & 0x0F00) >> 8) * scale - minv); \ + a_f16.sb = (half)((float)((q4.s2 & 0xF000) >> 12) * scale - minv); \ + a_f16.sc = (half)((float)(q4.s3 & 0x000F) * scale - minv); \ + a_f16.sd = (half)((float)((q4.s3 & 0x00F0) >> 4) * scale - minv); \ + a_f16.se = (half)((float)((q4.s3 & 0x0F00) >> 8) * scale - minv); \ + a_f16.sf = (half)((float)((q4.s3 & 0xF000) >> 12) * scale - minv); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q4_k_f32_ns( + __read_only image1d_buffer_t src0_q, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (block_id_n >= total_tiles[0]) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * K_SCALE_SIZE; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; + uint sb = sub / 8; + uint j = sub % 8; + + // Load d and dm for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + half dm_val = src0_dm[d_offset]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = (float)dm_val * (float)mn; + + // First sub-block (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 q (64-bits) in transposed layout + uint2 q4x16; + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half (next 16 elements, same sub-block scale) + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl index 3524cb1bdbd..d1a35d58bb2 100644 --- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl @@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q5_0_f32_ns( uint block_id_n = get_global_id(2); // n_tile // Boundary check - if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + if (block_id_n >= total_tiles[0]) { return; } @@ -202,6 +202,10 @@ kernel void kernel_gemm_moe_q5_0_f32_ns( dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); } + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + // Load poster router and share in LM __local uint out_idx[TILESIZE_N]; diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl index 5fc2a523234..90d345ecf51 100644 --- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl @@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q5_1_f32_ns( uint block_id_n = get_global_id(2); // n_tile // Boundary check - if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + if (block_id_n >= total_tiles[0]) { return; } @@ -204,6 +204,10 @@ kernel void kernel_gemm_moe_q5_1_f32_ns( dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); } + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + // Load poster router and share in LM __local uint out_idx[TILESIZE_N]; diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl new file mode 100644 index 00000000000..13c26f6f3b6 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl @@ -0,0 +1,288 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 +#define K_SCALE_SIZE 12 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +#define dequantize_q5_k(qs5x16, qh5x16, a_f16, scale, m) \ + a_f16.s0 = (half)((float)(( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) * scale + m); \ + a_f16.s1 = (half)((float)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m)); \ + a_f16.s2 = (half)((float)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m)); \ + a_f16.s3 = (half)((float)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m)); \ + a_f16.s4 = (half)((float)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m)); \ + a_f16.s5 = (half)((float)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m)); \ + a_f16.s6 = (half)((float)(((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \ + a_f16.s7 = (half)((float)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m)); \ + a_f16.s8 = (half)((float)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) * scale + m)); \ + a_f16.s9 = (half)((float)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m)); \ + a_f16.sa = (half)((float)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m)); \ + a_f16.sb = (half)((float)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m)); \ + a_f16.sc = (half)((float)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m)); \ + a_f16.sd = (half)((float)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m)); \ + a_f16.se = (half)((float)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m)); \ + a_f16.sf = (half)((float)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m)); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q5_k_f32_ns( + __read_only image1d_buffer_t src0_q, + __global uint * src0_qh, + __global uchar * src0_s, + __global half * src0_d, + __global half * src0_dm, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (block_id_n >= total_tiles[0]) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * K_SCALE_SIZE; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; + uint sb = sub / 8; + uint j = sub % 8; + + // Load d and dm for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + half dm_val = src0_dm[d_offset]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = -(float)dm_val * (float)mn; + + // qh is stored at sub-block granularity + uint qh_offset = row + sub * ne01 + expert_id * num_superblocks * 8 * ne01 + get_global_id(0); + uchar4 qhx32 = as_uchar4(src0_qh[qh_offset]); + + // First sub-block (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 q (64-bits) in transposed layout + uint2 q4x16; + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q5_k(as_ushort4(q4x16), qhx32.lo, reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q5_k(as_ushort4(q4x16), qhx32.hi, reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl new file mode 100644 index 00000000000..85ccebec78c --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl @@ -0,0 +1,267 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 + +#define dequantize_q6_k(qs16, qh16, a_f16, scale) \ + a_f16.s0 = (half)(((float)(( qs16.s0 & 0x000F) | ((uint)(( qh16 ) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s1 = (half)(((float)((( qs16.s0 >> 4) & 0x000F) | ((uint)(( qh16 >> 2) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s2 = (half)(((float)((( qs16.s0 >> 8) & 0x000F) | ((uint)(( qh16 >> 4) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s3 = (half)(((float)((( qs16.s0 >>12) & 0x000F) | ((uint)(( qh16 >> 6) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s4 = (half)(((float)(( qs16.s1 & 0x000F) | ((uint)(( qh16 >> 8) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s5 = (half)(((float)((( qs16.s1 >> 4) & 0x000F) | ((uint)(( qh16 >> 10) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s6 = (half)(((float)((( qs16.s1 >> 8) & 0x000F) | ((uint)(( qh16 >> 12) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s7 = (half)(((float)((( qs16.s1 >>12) & 0x000F) | ((uint)(( qh16 >> 14) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s8 = (half)(((float)(( qs16.s2 & 0x000F) | ((uint)(( qh16 >> 16) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s9 = (half)(((float)((( qs16.s2 >> 4) & 0x000F) | ((uint)(( qh16 >> 18) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sa = (half)(((float)((( qs16.s2 >> 8) & 0x000F) | ((uint)(( qh16 >> 20) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sb = (half)(((float)((( qs16.s2 >>12) & 0x000F) | ((uint)(( qh16 >> 22) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sc = (half)(((float)(( qs16.s3 & 0x000F) | ((uint)(( qh16 >> 24) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sd = (half)(((float)((( qs16.s3 >> 4) & 0x000F) | ((uint)(( qh16 >> 26) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.se = (half)(((float)((( qs16.s3 >> 8) & 0x000F) | ((uint)(( qh16 >> 28) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sf = (half)(((float)((( qs16.s3 >>12) & 0x000F) | ((uint)(( qh16 >> 30) & 0x3) << 4)) - 32.f) * scale); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q6_k_f32_ns( + __read_only image1d_buffer_t src0_ql, + __global uint * src0_qh, + __global char * src0_s, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (block_id_n >= total_tiles[0]) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * 16; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; // 32-element group index + uint sb = sub / 8; // super-block index + uint j = sub % 8; // group within super-block + + // Load d for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + + // Load sub-block scales + global const char * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * 16; + float scale0 = (float)d_val * (float)sc[j * 2]; + float scale1 = (float)d_val * (float)sc[j * 2 + 1]; + + uint qh_base = row + (sub * 2) * ne01 + expert_id * (num_superblocks * 16) * ne01 + get_global_id(0); + uint qh_first16 = src0_qh[qh_base]; + uint qh_second16 = src0_qh[qh_base + ne01]; + + // First half (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 ql nibbles (2 uints) from image + uint2 q4x16; + q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantize first 16 elements (scale0) + dequantize_q6_k(as_ushort4(q4x16), qh_first16, reg_a, scale0); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q6_k(as_ushort4(q4x16), qh_second16, reg_a, scale1); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) { + return; + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl index e4b44c1a56a..75129e20c65 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl @@ -82,6 +82,10 @@ __kernel void kernel_gemv_moe_mxfp4_f32_ns( uint sgid = get_local_id(1); uint slid = get_sub_group_local_id(); + if (i01 >= ne01) { + return; + } + uint i11 = i20 % ne11; uint expert_id = src2[i20]; diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl index 6f4d3f53216..2d28db63ec5 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl @@ -37,6 +37,10 @@ __kernel void kernel_gemv_moe_q4_0_f32_ns( uint sgid = get_local_id(1); uint slid = get_sub_group_local_id(); + if (i01 >= ne01) { + return; + } + uint i11 = i20 % ne11; uint expert_id = src2[i20]; diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl index 3739a215705..b98bdc0f12e 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl @@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q4_1_f32_ns( uint sgid = get_local_id(1); uint slid = get_sub_group_local_id(); + if (i01 >= ne01) { + return; + } + uint i11 = i20 % ne11; uint expert_id = src2[i20]; diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl new file mode 100644 index 00000000000..12464e9826e --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl @@ -0,0 +1,155 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define K_SCALE_SIZE 12 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +static inline float8 q4_k_to_fp32_packed8(ushort2 q4x8, float scale, float minv) { + float8 fp32x8; + fp32x8.s0 = (q4x8.s0 & 0x000F) * scale - minv; + fp32x8.s1 = ((q4x8.s0 & 0x00F0) >> 4) * scale - minv; + fp32x8.s2 = ((q4x8.s0 & 0x0F00) >> 8) * scale - minv; + fp32x8.s3 = ((q4x8.s0 & 0xF000) >> 12) * scale - minv; + fp32x8.s4 = (q4x8.s1 & 0x000F) * scale - minv; + fp32x8.s5 = ((q4x8.s1 & 0x00F0) >> 4) * scale - minv; + fp32x8.s6 = ((q4x8.s1 & 0x0F00) >> 8) * scale - minv; + fp32x8.s7 = ((q4x8.s1 & 0xF000) >> 12) * scale - minv; + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q4_k_f32_ns( + __global uint * src0_q, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + if (i01 >= ne01) { + return; + } + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; + int scales_per_row = num_superblocks * K_SCALE_SIZE; + + // Expert offsets in the transposed noshuffle layout + uint expert_q_offset = expert_id * (ne00 / 8) * ne01; + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; + uint j = ib % 8; + + // Load d and dmin for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + half dm_val = src0_dm[expert_d_offset + sb * ne01 + i01]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = (float)dm_val * (float)mn; + + // Load 4 uints of quants (32 nibbles = 32 elements) + uint q_base = expert_q_offset + ib * ne01 * 4 + i01; + + uint4 regQ; + regQ.s0 = src0_q[q_base]; + regQ.s1 = src0_q[q_base + ne01]; + regQ.s2 = src0_q[q_base + ne01 * 2]; + regQ.s3 = src0_q[q_base + ne01 * 3]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s0), scale, minv); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s1), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s2), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s3), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl index 938054cf982..b43613638a8 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl @@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q5_0_f32_ns( uint sgid = get_local_id(1); uint slid = get_sub_group_local_id(); + if (i01 >= ne01) { + return; + } + uint i11 = i20 % ne11; uint expert_id = src2[i20]; diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl index f33a4ef2757..7a666006e68 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl @@ -39,6 +39,10 @@ __kernel void kernel_gemv_moe_q5_1_f32_ns( uint sgid = get_local_id(1); uint slid = get_sub_group_local_id(); + if (i01 >= ne01) { + return; + } + uint i11 = i20 % ne11; uint expert_id = src2[i20]; diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl new file mode 100644 index 00000000000..7d868d7abd9 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl @@ -0,0 +1,160 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define K_SCALE_SIZE 12 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +static inline float8 q5_k_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8, half s, half m) { + float8 fp32x8; + fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F) | (( qh5x8 & 0x01) << 4)) * s + m); + fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) * s + m); + fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) * s + m); + fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) * s + m); + fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) * s + m); + fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) * s + m); + fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) * s + m); + fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) * s + m); + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q5_k_f32_ns( + __global uint * src0_q, + __global uint * src0_qh, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + if (i01 >= ne01) { + return; + } + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; + int scales_per_row = num_superblocks * K_SCALE_SIZE; + + // Expert offsets in the transposed noshuffle layout + uint expert_q_offset = expert_id * (ne00 / 8) * ne01; + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; + uint j = ib % 8; + + // Load d and dmin for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + half dm_val = src0_dm[expert_d_offset + sb * ne01 + i01]; + + // sub_block index = sb * 8 + j + uint expert_qh_offset = expert_id * num_superblocks * 8 * ne01; + uchar4 regQh = as_uchar4(src0_qh[expert_qh_offset + (sb * 8 + j) * ne01 + i01]); + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = -(float)dm_val * (float)mn; + + // Load 4 uints of quants (32 nibbles = 32 elements) + uint q_base = expert_q_offset + ib * ne01 * 4 + i01; + + uint4 regQ; + regQ.s0 = src0_q[q_base]; + regQ.s1 = src0_q[q_base + ne01]; + regQ.s2 = src0_q[q_base + ne01 * 2]; + regQ.s3 = src0_q[q_base + ne01 * 3]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0, scale, minv); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl new file mode 100644 index 00000000000..c166bad5ba5 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl @@ -0,0 +1,141 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +static inline float8 q6_k_to_fp32_packed8(ushort2 ql8, ushort qh8, float d_scale) { + float8 fp32x8; + fp32x8.s0 = ((float)(( ql8.s0 & 0x000F) | ((uint)((qh8 ) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s1 = ((float)((( ql8.s0 >> 4) & 0x000F) | ((uint)((qh8 >> 2) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s2 = ((float)((( ql8.s0 >> 8) & 0x000F) | ((uint)((qh8 >> 4) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s3 = ((float)((( ql8.s0 >> 12)& 0x000F) | ((uint)((qh8 >> 6) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s4 = ((float)(( ql8.s1 & 0x000F) | ((uint)((qh8 >> 8) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s5 = ((float)((( ql8.s1 >> 4) & 0x000F) | ((uint)((qh8 >>10) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s6 = ((float)((( ql8.s1 >> 8) & 0x000F) | ((uint)((qh8 >>12) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s7 = ((float)((( ql8.s1 >> 12)& 0x000F) | ((uint)((qh8 >>14) & 0x3) << 4)) - 32.f) * d_scale; + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q6_k_f32_ns( + __global uint * src0_ql, + __global uint * src0_qh, + __global char * src0_s, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + if (i01 >= ne01) { + return; + } + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; // 8 sub-blocks of 32 per super-block + int scales_per_row = num_superblocks * 16; + + // Expert offsets in the transposed noshuffle layout + uint expert_ql_offset = expert_id * (ne00 / 8) * ne01; // 32 uints per super-block + uint expert_qh_offset = expert_id * (ne00 / 16) * ne01; // 16 uints per super-block + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; // super-block index + uint j = ib % 8; // 32-element group within super-block + + // Load d for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + + // Load 2 sub-block scales + global const char * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * 16; + float scale0 = (float)d_val * (float)sc[j * 2]; + float scale1 = (float)d_val * (float)sc[j * 2 + 1]; + + // Load 4 uints of ql + uint ql_base = expert_ql_offset + (ib * 4) * ne01 + i01; + uint4 regQL; + regQL.s0 = src0_ql[ql_base]; + regQL.s1 = src0_ql[ql_base + ne01]; + regQL.s2 = src0_ql[ql_base + ne01 * 2]; + regQL.s3 = src0_ql[ql_base + ne01 * 3]; + + // Load 2 uints of qh + uint qh_base = expert_qh_offset + (ib * 2) * ne01 + i01; + uint2 regQH; + regQH.s0 = src0_qh[qh_base]; + regQH.s1 = src0_qh[qh_base + ne01]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s0), (ushort)(regQH.s0 & 0xFFFF), scale0); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s1), (ushort)(regQH.s0 >> 16), scale0); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s2), (ushort)(regQH.s1 & 0xFFFF), scale1); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s3), (ushort)(regQH.s1 >> 16), scale1); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl new file mode 100644 index 00000000000..1e980a478a8 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q5_0_f32_l4_lm.cl @@ -0,0 +1,173 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define LOAD_VEC_A 8 +#define LOAD_VEC_B 4 + +#define BM 64 +#define BN 64 +#define BK 32 +#define TM 4 +#define TN 8 + +kernel void kernel_mul_mm_q5_0_f32_l4_lm( + global uchar4 * src0_qs, + global uint * src0_qh, + global half * src0_d, + global float4 * src1, + ulong offset1, + global float * dst, + ulong offsetd, + + int ne00, + int ne01, + int ne02, + int ne11, + int ne12, + + int stride_a, + int stride_b, + int stride_d, + + int batch_stride_a, + int batch_stride_b, + int batch_stride_d, + + int r2, + int r3 +) { + src1 = (global float4*)((global char*)src1 + offset1); + dst = (global float *)((global char*)dst + offsetd); + + local float buf_a[BM * BK]; + local float buf_b[BN * BK]; + + const int batch_idx = get_global_id(2); + + const int i13 = batch_idx / ne12; + const int i12 = batch_idx % ne12; + + const int i03 = i13 / r3; + const int i02 = i12 / r2; + + const int batch_idx_a = i03 * ne02 + i02; + + const int ir = get_group_id(0); + const int ic = get_group_id(1); + + const int tid = get_local_id(0); + const int th_r = tid % (BM / TM); + const int th_c = tid / (BM / TM); + + const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A); + const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A); + const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B); + const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B); + + const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK; + const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK; + + int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A; + int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B; + + float sums[TM * TN]; + float cache_a[TM]; + float cache_b[TN]; + + for (int i = 0; i < TM * TN; i++) { + sums[i] = 0.0f; + } + + for (int block = 0; block < ne00; block += BK) { + for (int l = 0; l < BM; l += loadstride_a) { + if (ir*BM + loadc_a + l < ne01) { + int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a; + int ib = idx / 4; + int iqs = idx % 4; + + float d = (float)src0_d[ib]; + uint qh_val = src0_qh[ib]; + + global uchar4 * qs_ptr = src0_qs + ib*4 + iqs; + uchar4 q = *qs_ptr; + + uint qh_lo = qh_val >> (iqs * 4); + uint qh_hi = qh_val >> (iqs * 4 + 16); + + uchar4 b_lo = (uchar4)((uchar)qh_lo, (uchar)(qh_lo >> 1), (uchar)(qh_lo >> 2), (uchar)(qh_lo >> 3)) & (uchar)1; + uchar4 b_hi = (uchar4)((uchar)qh_hi, (uchar)(qh_hi >> 1), (uchar)(qh_hi >> 2), (uchar)(qh_hi >> 3)) & (uchar)1; + + float4 v1 = (convert_float4((q & (uchar)0x0F) | (b_lo << (uchar)4)) - 16.0f) * d; + float4 v2 = (convert_float4((q >> (uchar)4) | (b_hi << (uchar)4)) - 16.0f) * d; + + buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = v1.s0; + buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = v1.s1; + buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = v1.s2; + buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = v1.s3; + buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0; + buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1; + buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2; + buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3; + } else { + buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f; + } + } + + for (int l = 0; l < BN; l += loadstride_b) { + if (ic*BN + loadc_b + l < ne11) { + int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b; + buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0; + buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1; + buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2; + buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3; + } else { + buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + pos_a += BK / LOAD_VEC_A; + pos_b += BK / LOAD_VEC_B; + + for (int i = 0; i < BK; i++) { + for (int j = 0; j < TM; j++) { + cache_a[j] = buf_a[(i) * BM + th_r * TM + j]; + } + + for (int j = 0; j < TN; j++) { + cache_b[j] = buf_b[(i) * BN + th_c * TN + j]; + } + + for (int cc = 0; cc < TN; cc++) { + for (int cr = 0; cr < TM; cr++) { + const int sums_idx = cc*TM + cr; + sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int dr = ir * BM + th_r * TM; + const int dc = ic * BN + th_c * TN; + + const int offsets = batch_idx * batch_stride_d; + + for (int cc = 0; cc < TN; cc++) { + for (int cr = 0; cr < TM; cr++) { + if (dr + cr < ne01 && dc + cc < ne11) { + dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr]; + } + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl new file mode 100644 index 00000000000..ba06be54697 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q5_1_f32_l4_lm.cl @@ -0,0 +1,175 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define LOAD_VEC_A 8 +#define LOAD_VEC_B 4 + +#define BM 64 +#define BN 64 +#define BK 32 +#define TM 4 +#define TN 8 + +kernel void kernel_mul_mm_q5_1_f32_l4_lm( + global uchar4 * src0_qs, + global uint * src0_qh, + global half * src0_d, + global half * src0_m, + global float4 * src1, + ulong offset1, + global float * dst, + ulong offsetd, + + int ne00, + int ne01, + int ne02, + int ne11, + int ne12, + + int stride_a, + int stride_b, + int stride_d, + + int batch_stride_a, + int batch_stride_b, + int batch_stride_d, + + int r2, + int r3 +) { + src1 = (global float4*)((global char*)src1 + offset1); + dst = (global float *)((global char*)dst + offsetd); + + local float buf_a[BM * BK]; + local float buf_b[BN * BK]; + + const int batch_idx = get_global_id(2); + + const int i13 = batch_idx / ne12; + const int i12 = batch_idx % ne12; + + const int i03 = i13 / r3; + const int i02 = i12 / r2; + + const int batch_idx_a = i03 * ne02 + i02; + + const int ir = get_group_id(0); + const int ic = get_group_id(1); + + const int tid = get_local_id(0); + const int th_r = tid % (BM / TM); + const int th_c = tid / (BM / TM); + + const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A); + const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A); + const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B); + const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B); + + const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK; + const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK; + + int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A; + int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B; + + float sums[TM * TN]; + float cache_a[TM]; + float cache_b[TN]; + + for (int i = 0; i < TM * TN; i++) { + sums[i] = 0.0f; + } + + for (int block = 0; block < ne00; block += BK) { + for (int l = 0; l < BM; l += loadstride_a) { + if (ir*BM + loadc_a + l < ne01) { + int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a; + int ib = idx / 4; + int iqs = idx % 4; + + float d = (float)src0_d[ib]; + float m = (float)src0_m[ib]; + uint qh_val = src0_qh[ib]; + + global uchar4 * qs = src0_qs + ib*4 + iqs; + uchar4 q = *qs; + + uint qh_lo = qh_val >> (iqs * 4); + uint qh_hi = qh_val >> (iqs * 4 + 16); + + uchar4 b_lo = (uchar4)((uchar)qh_lo, (uchar)(qh_lo >> 1), (uchar)(qh_lo >> 2), (uchar)(qh_lo >> 3)) & (uchar)1; + uchar4 b_hi = (uchar4)((uchar)qh_hi, (uchar)(qh_hi >> 1), (uchar)(qh_hi >> 2), (uchar)(qh_hi >> 3)) & (uchar)1; + + float4 v1 = convert_float4((q & (uchar)0x0F) | (b_lo << (uchar)4)) * d + m; + float4 v2 = convert_float4((q >> (uchar)4) | (b_hi << (uchar)4)) * d + m; + + buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = v1.s0; + buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = v1.s1; + buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = v1.s2; + buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = v1.s3; + buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = v2.s0; + buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = v2.s1; + buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = v2.s2; + buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = v2.s3; + } else { + buf_a[(loadr_a * 4 + 0) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 1) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 2) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 3) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 16) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 17) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 18) * BM + loadc_a + l] = 0.0f; + buf_a[(loadr_a * 4 + 19) * BM + loadc_a + l] = 0.0f; + } + } + + for (int l = 0; l < BN; l += loadstride_b) { + if (ic*BN + loadc_b + l < ne11) { + int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b; + buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0; + buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1; + buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2; + buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3; + } else { + buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + pos_a += BK / LOAD_VEC_A; + pos_b += BK / LOAD_VEC_B; + + for (int i = 0; i < BK; i++) { + for (int j = 0; j < TM; j++) { + cache_a[j] = buf_a[(i) * BM + th_r * TM + j]; + } + + for (int j = 0; j < TN; j++) { + cache_b[j] = buf_b[(i) * BN + th_c * TN + j]; + } + + for (int cc = 0; cc < TN; cc++) { + for (int cr = 0; cr < TM; cr++) { + const int sums_idx = cc*TM + cr; + sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int dr = ir * BM + th_r * TM; + const int dc = ic * BN + th_c * TN; + + const int offsets = batch_idx * batch_stride_d; + + for (int cc = 0; cc < TN; cc++) { + for (int cr = 0; cr < TM; cr++) { + if (dr + cr < ne01 && dc + cc < ne11) { + dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr]; + } + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl new file mode 100644 index 00000000000..6d8c9e8f037 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32.cl @@ -0,0 +1,241 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define QK5_0 32 + +struct block_q5_0 { + half d; + uchar qh[4]; + uchar qs[QK5_0 / 2]; +}; + +inline float block_q5_0_dot_y( + global const struct block_q5_0 * qb_curr, + float sumy, + float16 yl, + int il, + global const float * yb +) { + float d = qb_curr->d; + + float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + + global const ushort * qs = ((global const ushort *)((global const uchar *) qb_curr + 6 + il)); + + acc.s0 += yl.s0 * (qs[0] & 0x000F); + acc.s0 += yl.s1 * (qs[0] & 0x0F00); + acc.s0 += yl.s8 * (qs[0] & 0x00F0); + acc.s3 += yl.s9 * (qs[0] & 0xF000); + + acc.s0 += yl.s2 * (qs[1] & 0x000F); + acc.s1 += yl.s3 * (qs[1] & 0x0F00); + acc.s2 += yl.sa * (qs[1] & 0x00F0); + acc.s3 += yl.sb * (qs[1] & 0xF000); + + acc.s0 += yl.s4 * (qs[2] & 0x000F); + acc.s1 += yl.s5 * (qs[2] & 0x0F00); + acc.s2 += yl.sc * (qs[2] & 0x00F0); + acc.s3 += yl.sd * (qs[2] & 0xF000); + + acc.s0 += yl.s6 * (qs[3] & 0x000F); + acc.s1 += yl.s7 * (qs[3] & 0x0F00); + acc.s2 += yl.se * (qs[3] & 0x00F0); + acc.s3 += yl.sf * (qs[3] & 0xF000); + + uint qh_val = *((global const uint *)((global const uchar *) qb_curr + 2)); + uchar qh_lo = (uchar)((qh_val >> il) & 0xFF); + uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF); + + float qh_sum = 0.0f; + qh_sum += yb[0] * (float)((qh_lo >> 0) & 1); + qh_sum += yb[1] * (float)((qh_lo >> 1) & 1); + qh_sum += yb[2] * (float)((qh_lo >> 2) & 1); + qh_sum += yb[3] * (float)((qh_lo >> 3) & 1); + qh_sum += yb[4] * (float)((qh_lo >> 4) & 1); + qh_sum += yb[5] * (float)((qh_lo >> 5) & 1); + qh_sum += yb[6] * (float)((qh_lo >> 6) & 1); + qh_sum += yb[7] * (float)((qh_lo >> 7) & 1); + qh_sum += yb[16] * (float)((qh_hi >> 0) & 1); + qh_sum += yb[17] * (float)((qh_hi >> 1) & 1); + qh_sum += yb[18] * (float)((qh_hi >> 2) & 1); + qh_sum += yb[19] * (float)((qh_hi >> 3) & 1); + qh_sum += yb[20] * (float)((qh_hi >> 4) & 1); + qh_sum += yb[21] * (float)((qh_hi >> 5) & 1); + qh_sum += yb[22] * (float)((qh_hi >> 6) & 1); + qh_sum += yb[23] * (float)((qh_hi >> 7) & 1); + + return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum - 16.0f * sumy); +} + +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 4 // each subgroup works on 4 rows +#define N_SIMDGROUP 1 // number of subgroups in a thread group +#define N_SIMDWIDTH 16 // assuming subgroup size is 16 +#elif defined (ADRENO_GPU) +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 64 +#endif + +inline void mul_vec_q_n_f32( + global void * src0, + global float * src1, + global float * dst, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + const ulong nb = ne00/QK5_0; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = get_group_id(2); + + int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; + + int i12 = im%ne12; + int i13 = im/ne12; + + ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + global struct block_q5_0 * x = (global struct block_q5_0 *) src0 + offset0; + global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; + + float16 yl; + float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f); + + int ix = get_sub_group_local_id()/2; + int il = 8*(get_sub_group_local_id()%2); + + global float * yb = y + ix * QK5_0 + il; + + for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { + float sumy = 0; + + sumy += yb[0]; + sumy += yb[1]; + sumy += yb[2]; + sumy += yb[3]; + sumy += yb[4]; + sumy += yb[5]; + sumy += yb[6]; + sumy += yb[7]; + + sumy += yb[16]; + sumy += yb[17]; + sumy += yb[18]; + sumy += yb[19]; + sumy += yb[20]; + sumy += yb[21]; + sumy += yb[22]; + sumy += yb[23]; + + + yl.s0 = yb[0]; + yl.s1 = yb[1]/256.f; + + yl.s2 = yb[2]; + yl.s3 = yb[3]/256.f; + + yl.s4 = yb[4]; + yl.s5 = yb[5]/256.f; + + yl.s6 = yb[6]; + yl.s7 = yb[7]/256.f; + + yl.s8 = yb[16]/16.f; + yl.s9 = yb[17]/4096.f; + + yl.sa = yb[18]/16.f; + yl.sb = yb[19]/4096.f; + + yl.sc = yb[20]/16.f; + yl.sd = yb[21]/4096.f; + + yl.se = yb[22]/16.f; + yl.sf = yb[23]/4096.f; + + sumf.s0 += block_q5_0_dot_y(x+ib+0*nb, sumy, yl, il, yb); + sumf.s1 += block_q5_0_dot_y(x+ib+1*nb, sumy, yl, il, yb); + sumf.s2 += block_q5_0_dot_y(x+ib+2*nb, sumy, yl, il, yb); + sumf.s3 += block_q5_0_dot_y(x+ib+3*nb, sumy, yl, il, yb); + + yb += QK5_0 * (N_SIMDWIDTH/2); + } + + float4 tot = (float4)( + sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1), + sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3) + ); + + if (get_sub_group_local_id() == 0) { + if (first_row + 0 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; + } + if (first_row + 1 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; + } + if (first_row + 2 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; + } + if (first_row + 3 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + } + } +} + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_q5_0_f32( + global void * src0, + ulong offset0, + global float * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + src0 = (global void*)((global char*)src0 + offset0); + src1 = (global float*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3); +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl new file mode 100644 index 00000000000..34ec133d398 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_0_f32_flat.cl @@ -0,0 +1,243 @@ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define QK5_0 32 + +inline float block_q5_0_dot_y_flat( + global const uchar * x, + global const uint * qh_ptr, + global const half * dh, + float sumy, + float16 yl, + int il, + global const float * yb +) { + float d = *dh; + global const ushort * qs = ((global const ushort *)(x + il)); + + float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + + acc.s0 += yl.s0 * (qs[0] & 0x000F); + acc.s0 += yl.s1 * (qs[0] & 0x0F00); + acc.s0 += yl.s8 * (qs[0] & 0x00F0); + acc.s3 += yl.s9 * (qs[0] & 0xF000); + + acc.s0 += yl.s2 * (qs[1] & 0x000F); + acc.s1 += yl.s3 * (qs[1] & 0x0F00); + acc.s2 += yl.sa * (qs[1] & 0x00F0); + acc.s3 += yl.sb * (qs[1] & 0xF000); + + acc.s0 += yl.s4 * (qs[2] & 0x000F); + acc.s1 += yl.s5 * (qs[2] & 0x0F00); + acc.s2 += yl.sc * (qs[2] & 0x00F0); + acc.s3 += yl.sd * (qs[2] & 0xF000); + + acc.s0 += yl.s6 * (qs[3] & 0x000F); + acc.s1 += yl.s7 * (qs[3] & 0x0F00); + acc.s2 += yl.se * (qs[3] & 0x00F0); + acc.s3 += yl.sf * (qs[3] & 0xF000); + + uint qh_val = *qh_ptr; + uchar qh_lo = (uchar)((qh_val >> il) & 0xFF); + uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF); + + float qh_sum = 0.0f; + qh_sum += yb[0] * (float)((qh_lo >> 0) & 1); + qh_sum += yb[1] * (float)((qh_lo >> 1) & 1); + qh_sum += yb[2] * (float)((qh_lo >> 2) & 1); + qh_sum += yb[3] * (float)((qh_lo >> 3) & 1); + qh_sum += yb[4] * (float)((qh_lo >> 4) & 1); + qh_sum += yb[5] * (float)((qh_lo >> 5) & 1); + qh_sum += yb[6] * (float)((qh_lo >> 6) & 1); + qh_sum += yb[7] * (float)((qh_lo >> 7) & 1); + qh_sum += yb[16] * (float)((qh_hi >> 0) & 1); + qh_sum += yb[17] * (float)((qh_hi >> 1) & 1); + qh_sum += yb[18] * (float)((qh_hi >> 2) & 1); + qh_sum += yb[19] * (float)((qh_hi >> 3) & 1); + qh_sum += yb[20] * (float)((qh_hi >> 4) & 1); + qh_sum += yb[21] * (float)((qh_hi >> 5) & 1); + qh_sum += yb[22] * (float)((qh_hi >> 6) & 1); + qh_sum += yb[23] * (float)((qh_hi >> 7) & 1); + + return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum - 16.0f * sumy); +} + +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 4 // each subgroup works on 4 rows +#define N_SIMDGROUP 1 // number of subgroups in a thread group +#define N_SIMDWIDTH 16 // assuming subgroup size is 16 +#elif defined (ADRENO_GPU) +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 64 +#endif + +inline void mul_vec_q_n_f32_flat( + global void * src0_qs, + global void * src0_qh, + global void * src0_d, + global float * src1, + global float * dst, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + const ulong nb = ne00/QK5_0; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = get_group_id(2); + + int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; + + int i12 = im%ne12; + int i13 = im/ne12; + + ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + ulong offset0_qs = offset0 * (QK5_0/2); + + global uchar * x = (global uchar *) src0_qs + offset0_qs; + global uint * qh = (global uint *) src0_qh + offset0; + global half * d = (global half *) src0_d + offset0; + global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; + + float16 yl; + float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f); + + int ix = get_sub_group_local_id()/2; + int il = 8*(get_sub_group_local_id()%2); + + global float * yb = y + ix * QK5_0 + il; + + for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { + float sumy = 0; + + sumy += yb[0]; + sumy += yb[1]; + sumy += yb[2]; + sumy += yb[3]; + sumy += yb[4]; + sumy += yb[5]; + sumy += yb[6]; + sumy += yb[7]; + + sumy += yb[16]; + sumy += yb[17]; + sumy += yb[18]; + sumy += yb[19]; + sumy += yb[20]; + sumy += yb[21]; + sumy += yb[22]; + sumy += yb[23]; + + + yl.s0 = yb[0]; + yl.s1 = yb[1]/256.f; + + yl.s2 = yb[2]; + yl.s3 = yb[3]/256.f; + + yl.s4 = yb[4]; + yl.s5 = yb[5]/256.f; + + yl.s6 = yb[6]; + yl.s7 = yb[7]/256.f; + + yl.s8 = yb[16]/16.f; + yl.s9 = yb[17]/4096.f; + + yl.sa = yb[18]/16.f; + yl.sb = yb[19]/4096.f; + + yl.sc = yb[20]/16.f; + yl.sd = yb[21]/4096.f; + + yl.se = yb[22]/16.f; + yl.sf = yb[23]/4096.f; + + sumf.s0 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 0*nb*(QK5_0/2), qh + ib + 0*nb, d + ib + 0*nb, sumy, yl, il, yb); + sumf.s1 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 1*nb*(QK5_0/2), qh + ib + 1*nb, d + ib + 1*nb, sumy, yl, il, yb); + sumf.s2 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 2*nb*(QK5_0/2), qh + ib + 2*nb, d + ib + 2*nb, sumy, yl, il, yb); + sumf.s3 += block_q5_0_dot_y_flat(x + ib*(QK5_0/2) + 3*nb*(QK5_0/2), qh + ib + 3*nb, d + ib + 3*nb, sumy, yl, il, yb); + + yb += QK5_0 * (N_SIMDWIDTH/2); + } + + float4 tot = (float4)( + sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1), + sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3) + ); + + if (get_sub_group_local_id() == 0) { + if (first_row + 0 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; + } + if (first_row + 1 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; + } + if (first_row + 2 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; + } + if (first_row + 3 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + } + } +} + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_q5_0_f32_flat( + global void * src0_qs, + global void * src0_qh, + global void * src0_d, + global float * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + src1 = (global float*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + mul_vec_q_n_f32_flat(src0_qs, src0_qh, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3); +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl new file mode 100644 index 00000000000..1480f675038 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32.cl @@ -0,0 +1,243 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define QK5_1 32 + +struct block_q5_1 { + half d; + half m; + uchar qh[4]; + uchar qs[QK5_1 / 2]; +}; + +inline float block_q5_1_dot_y( + global const struct block_q5_1 * qb_curr, + float sumy, + float16 yl, + int il, + global const float * yb +) { + float d = qb_curr->d; + float m = qb_curr->m; + + float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + + global const ushort * qs = ((global const ushort *)((global const uchar *) qb_curr + 8 + il)); + + acc.s0 += yl.s0 * (qs[0] & 0x000F); + acc.s0 += yl.s1 * (qs[0] & 0x0F00); + acc.s0 += yl.s8 * (qs[0] & 0x00F0); + acc.s3 += yl.s9 * (qs[0] & 0xF000); + + acc.s0 += yl.s2 * (qs[1] & 0x000F); + acc.s1 += yl.s3 * (qs[1] & 0x0F00); + acc.s2 += yl.sa * (qs[1] & 0x00F0); + acc.s3 += yl.sb * (qs[1] & 0xF000); + + acc.s0 += yl.s4 * (qs[2] & 0x000F); + acc.s1 += yl.s5 * (qs[2] & 0x0F00); + acc.s2 += yl.sc * (qs[2] & 0x00F0); + acc.s3 += yl.sd * (qs[2] & 0xF000); + + acc.s0 += yl.s6 * (qs[3] & 0x000F); + acc.s1 += yl.s7 * (qs[3] & 0x0F00); + acc.s2 += yl.se * (qs[3] & 0x00F0); + acc.s3 += yl.sf * (qs[3] & 0xF000); + + uint qh_val = *((global const uint *)((global const uchar *) qb_curr + 4)); + uchar qh_lo = (uchar)((qh_val >> il) & 0xFF); + uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF); + + float qh_sum = 0.0f; + qh_sum += yb[0] * (float)((qh_lo >> 0) & 1); + qh_sum += yb[1] * (float)((qh_lo >> 1) & 1); + qh_sum += yb[2] * (float)((qh_lo >> 2) & 1); + qh_sum += yb[3] * (float)((qh_lo >> 3) & 1); + qh_sum += yb[4] * (float)((qh_lo >> 4) & 1); + qh_sum += yb[5] * (float)((qh_lo >> 5) & 1); + qh_sum += yb[6] * (float)((qh_lo >> 6) & 1); + qh_sum += yb[7] * (float)((qh_lo >> 7) & 1); + qh_sum += yb[16] * (float)((qh_hi >> 0) & 1); + qh_sum += yb[17] * (float)((qh_hi >> 1) & 1); + qh_sum += yb[18] * (float)((qh_hi >> 2) & 1); + qh_sum += yb[19] * (float)((qh_hi >> 3) & 1); + qh_sum += yb[20] * (float)((qh_hi >> 4) & 1); + qh_sum += yb[21] * (float)((qh_hi >> 5) & 1); + qh_sum += yb[22] * (float)((qh_hi >> 6) & 1); + qh_sum += yb[23] * (float)((qh_hi >> 7) & 1); + + return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum) + sumy * m; +} + +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 4 // each subgroup works on 4 rows +#define N_SIMDGROUP 1 // number of subgroups in a thread group +#define N_SIMDWIDTH 16 // assuming subgroup size is 16 +#elif defined (ADRENO_GPU) +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 64 +#endif + +inline void mul_vec_q_n_f32( + global void * src0, + global float * src1, + global float * dst, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + const ulong nb = ne00/QK5_1; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = get_group_id(2); + + int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; + + int i12 = im%ne12; + int i13 = im/ne12; + + ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + global struct block_q5_1 * x = (global struct block_q5_1 *) src0 + offset0; + global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; + + float16 yl; + float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f); + + int ix = get_sub_group_local_id()/2; + int il = 8*(get_sub_group_local_id()%2); + + global float * yb = y + ix * QK5_1 + il; + + for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { + float sumy = 0; + + sumy += yb[0]; + sumy += yb[1]; + sumy += yb[2]; + sumy += yb[3]; + sumy += yb[4]; + sumy += yb[5]; + sumy += yb[6]; + sumy += yb[7]; + + sumy += yb[16]; + sumy += yb[17]; + sumy += yb[18]; + sumy += yb[19]; + sumy += yb[20]; + sumy += yb[21]; + sumy += yb[22]; + sumy += yb[23]; + + + yl.s0 = yb[0]; + yl.s1 = yb[1]/256.f; + + yl.s2 = yb[2]; + yl.s3 = yb[3]/256.f; + + yl.s4 = yb[4]; + yl.s5 = yb[5]/256.f; + + yl.s6 = yb[6]; + yl.s7 = yb[7]/256.f; + + yl.s8 = yb[16]/16.f; + yl.s9 = yb[17]/4096.f; + + yl.sa = yb[18]/16.f; + yl.sb = yb[19]/4096.f; + + yl.sc = yb[20]/16.f; + yl.sd = yb[21]/4096.f; + + yl.se = yb[22]/16.f; + yl.sf = yb[23]/4096.f; + + sumf.s0 += block_q5_1_dot_y(x+ib+0*nb, sumy, yl, il, yb); + sumf.s1 += block_q5_1_dot_y(x+ib+1*nb, sumy, yl, il, yb); + sumf.s2 += block_q5_1_dot_y(x+ib+2*nb, sumy, yl, il, yb); + sumf.s3 += block_q5_1_dot_y(x+ib+3*nb, sumy, yl, il, yb); + + yb += QK5_1 * (N_SIMDWIDTH/2); + } + + float4 tot = (float4)( + sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1), + sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3) + ); + + if (get_sub_group_local_id() == 0) { + if (first_row + 0 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; + } + if (first_row + 1 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; + } + if (first_row + 2 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; + } + if (first_row + 3 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + } + } +} + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_q5_1_f32( + global void * src0, + ulong offset0, + global float * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + src0 = (global void*)((global char*)src0 + offset0); + src1 = (global float*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3); +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl new file mode 100644 index 00000000000..57c2f140958 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q5_1_f32_flat.cl @@ -0,0 +1,247 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define QK5_1 32 + +inline float block_q5_1_dot_y_flat( + global const uchar * x, + global const uint * qh_ptr, + global const half * dh, + global const half * mh, + float sumy, + float16 yl, + int il, + global const float * yb +) { + float d = *dh; + float m = *mh; + global const ushort * qs = ((global const ushort *)(x + il)); + + float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + + acc.s0 += yl.s0 * (qs[0] & 0x000F); + acc.s0 += yl.s1 * (qs[0] & 0x0F00); + acc.s0 += yl.s8 * (qs[0] & 0x00F0); + acc.s3 += yl.s9 * (qs[0] & 0xF000); + + acc.s0 += yl.s2 * (qs[1] & 0x000F); + acc.s1 += yl.s3 * (qs[1] & 0x0F00); + acc.s2 += yl.sa * (qs[1] & 0x00F0); + acc.s3 += yl.sb * (qs[1] & 0xF000); + + acc.s0 += yl.s4 * (qs[2] & 0x000F); + acc.s1 += yl.s5 * (qs[2] & 0x0F00); + acc.s2 += yl.sc * (qs[2] & 0x00F0); + acc.s3 += yl.sd * (qs[2] & 0xF000); + + acc.s0 += yl.s6 * (qs[3] & 0x000F); + acc.s1 += yl.s7 * (qs[3] & 0x0F00); + acc.s2 += yl.se * (qs[3] & 0x00F0); + acc.s3 += yl.sf * (qs[3] & 0xF000); + + uint qh_val = *qh_ptr; + uchar qh_lo = (uchar)((qh_val >> il) & 0xFF); + uchar qh_hi = (uchar)((qh_val >> (il + 16)) & 0xFF); + + float qh_sum = 0.0f; + qh_sum += yb[0] * (float)((qh_lo >> 0) & 1); + qh_sum += yb[1] * (float)((qh_lo >> 1) & 1); + qh_sum += yb[2] * (float)((qh_lo >> 2) & 1); + qh_sum += yb[3] * (float)((qh_lo >> 3) & 1); + qh_sum += yb[4] * (float)((qh_lo >> 4) & 1); + qh_sum += yb[5] * (float)((qh_lo >> 5) & 1); + qh_sum += yb[6] * (float)((qh_lo >> 6) & 1); + qh_sum += yb[7] * (float)((qh_lo >> 7) & 1); + qh_sum += yb[16] * (float)((qh_hi >> 0) & 1); + qh_sum += yb[17] * (float)((qh_hi >> 1) & 1); + qh_sum += yb[18] * (float)((qh_hi >> 2) & 1); + qh_sum += yb[19] * (float)((qh_hi >> 3) & 1); + qh_sum += yb[20] * (float)((qh_hi >> 4) & 1); + qh_sum += yb[21] * (float)((qh_hi >> 5) & 1); + qh_sum += yb[22] * (float)((qh_hi >> 6) & 1); + qh_sum += yb[23] * (float)((qh_hi >> 7) & 1); + + return d * (acc.s0 + acc.s1 + acc.s2 + acc.s3 + 16.0f * qh_sum) + sumy * m; +} + +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 4 // each subgroup works on 4 rows +#define N_SIMDGROUP 1 // number of subgroups in a thread group +#define N_SIMDWIDTH 16 // assuming subgroup size is 16 +#elif defined (ADRENO_GPU) +#define N_DST 4 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 64 +#endif + +inline void mul_vec_q_n_f32_flat( + global void * src0_qs, + global void * src0_qh, + global void * src0_d, + global void * src0_m, + global float * src1, + global float * dst, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + const ulong nb = ne00/QK5_1; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = get_group_id(2); + + int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; + + int i12 = im%ne12; + int i13 = im/ne12; + + ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + ulong offset0_qs = offset0 * (QK5_1/2); + + global uchar * x = (global uchar *) src0_qs + offset0_qs; + global uint * qh = (global uint *) src0_qh + offset0; + global half * d = (global half *) src0_d + offset0; + global half * ms = (global half *) src0_m + offset0; + global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; + + float16 yl; + float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f); + + int ix = get_sub_group_local_id()/2; + int il = 8*(get_sub_group_local_id()%2); + + global float * yb = y + ix * QK5_1 + il; + + for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { + float sumy = 0; + + sumy += yb[0]; + sumy += yb[1]; + sumy += yb[2]; + sumy += yb[3]; + sumy += yb[4]; + sumy += yb[5]; + sumy += yb[6]; + sumy += yb[7]; + + sumy += yb[16]; + sumy += yb[17]; + sumy += yb[18]; + sumy += yb[19]; + sumy += yb[20]; + sumy += yb[21]; + sumy += yb[22]; + sumy += yb[23]; + + + yl.s0 = yb[0]; + yl.s1 = yb[1]/256.f; + + yl.s2 = yb[2]; + yl.s3 = yb[3]/256.f; + + yl.s4 = yb[4]; + yl.s5 = yb[5]/256.f; + + yl.s6 = yb[6]; + yl.s7 = yb[7]/256.f; + + yl.s8 = yb[16]/16.f; + yl.s9 = yb[17]/4096.f; + + yl.sa = yb[18]/16.f; + yl.sb = yb[19]/4096.f; + + yl.sc = yb[20]/16.f; + yl.sd = yb[21]/4096.f; + + yl.se = yb[22]/16.f; + yl.sf = yb[23]/4096.f; + + sumf.s0 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 0*nb*(QK5_1/2), qh + ib + 0*nb, d + ib + 0*nb, ms + ib + 0*nb, sumy, yl, il, yb); + sumf.s1 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 1*nb*(QK5_1/2), qh + ib + 1*nb, d + ib + 1*nb, ms + ib + 1*nb, sumy, yl, il, yb); + sumf.s2 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 2*nb*(QK5_1/2), qh + ib + 2*nb, d + ib + 2*nb, ms + ib + 2*nb, sumy, yl, il, yb); + sumf.s3 += block_q5_1_dot_y_flat(x + ib*(QK5_1/2) + 3*nb*(QK5_1/2), qh + ib + 3*nb, d + ib + 3*nb, ms + ib + 3*nb, sumy, yl, il, yb); + + yb += QK5_1 * (N_SIMDWIDTH/2); + } + + float4 tot = (float4)( + sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1), + sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3) + ); + + if (get_sub_group_local_id() == 0) { + if (first_row + 0 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; + } + if (first_row + 1 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; + } + if (first_row + 2 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; + } + if (first_row + 3 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + } + } +} + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_q5_1_f32_flat( + global void * src0_qs, + global void * src0_qh, + global void * src0_d, + global void * src0_m, + global float * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + src1 = (global float*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + mul_vec_q_n_f32_flat(src0_qs, src0_qh, src0_d, src0_m, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3); +} diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15443aa554a..15d231f70c0 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -13,6 +13,10 @@ #include // for qsort #include // for GGML_ASSERT +#ifdef GGML_USE_OPENMP +#include +#endif + #define GROUP_MAX_EPS 1e-15f #define GROUP_MAX_EPS_IQ3_XXS 1e-8f #define GROUP_MAX_EPS_IQ2_S 1e-8f @@ -3064,70 +3068,121 @@ void iq2xs_init_impl(enum ggml_type type) { } kmap_q2xs[index] = i; } - int8_t pos[8]; - int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + // The neighbour search runs in three passes: + // 1. Parallel: for each i, qsort and count its neighbours into n_per_i, + // and reduce the totals (num_neighbors, num_not_in_map). + // 2. Serial: prefix-sum n_per_i into offsets[], so each i has a + // pre-assigned slice of kneighbors_q2xs to write into. + // 3. Parallel: redo the qsort and write each i's neighbour list at + // offsets[i]. + int * n_per_i = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(n_per_i); int num_neighbors = 0, num_not_in_map = 0; - for (int i = 0; i < kmap_size; ++i) { - if (kmap_q2xs[i] >= 0) continue; - ++num_not_in_map; - for (int k = 0; k < 8; ++k) { - int l = (i >> 2*k) & 0x3; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); - int d2 = 0; - for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); - int n = 0; int d2 = dist2[0]; - int nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - ++n; - } - num_neighbors += n; +#ifdef GGML_USE_OPENMP + #pragma omp parallel reduction(+:num_neighbors,num_not_in_map) +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[8]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) { + n_per_i[i] = 0; + continue; + } + ++num_not_in_map; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + n_per_i[i] = n; + num_neighbors += n; + } + free(dist2); } //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq2_data[gindex].neighbours = kneighbors_q2xs; + + int * offsets = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(offsets); int counter = 0; for (int i = 0; i < kmap_size; ++i) { - if (kmap_q2xs[i] >= 0) continue; - for (int k = 0; k < 8; ++k) { - int l = (i >> 2*k) & 0x3; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); - int d2 = 0; - for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); - kmap_q2xs[i] = -(counter + 1); - int d2 = dist2[0]; - uint16_t * start = &kneighbors_q2xs[counter++]; - int n = 0, nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - kneighbors_q2xs[counter++] = dist2[2*j+1]; - ++n; - } - *start = n; - } - free(dist2); + if (kmap_q2xs[i] >= 0) { + offsets[i] = -1; + continue; + } + offsets[i] = counter; + counter += 1 + n_per_i[i]; + } + +#ifdef GGML_USE_OPENMP + #pragma omp parallel +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[8]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) continue; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + int local_counter = offsets[i]; + kmap_q2xs[i] = -(local_counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q2xs[local_counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q2xs[local_counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); + } + free(offsets); + free(n_per_i); } void iq2xs_free_impl(enum ggml_type type) { @@ -3663,70 +3718,115 @@ void iq3xs_init_impl(int grid_size) { } kmap_q3xs[index] = i; } - int8_t pos[4]; - int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + // See explanation of parallelism in iq2xs_init_impl + int * n_per_i = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(n_per_i); int num_neighbors = 0, num_not_in_map = 0; - for (int i = 0; i < kmap_size; ++i) { - if (kmap_q3xs[i] >= 0) continue; - ++num_not_in_map; - for (int k = 0; k < 4; ++k) { - int l = (i >> 3*k) & 0x7; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); - int d2 = 0; - for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); - int n = 0; int d2 = dist2[0]; - int nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - ++n; - } - num_neighbors += n; +#ifdef GGML_USE_OPENMP + #pragma omp parallel reduction(+:num_neighbors,num_not_in_map) +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[4]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) { + n_per_i[i] = 0; + continue; + } + ++num_not_in_map; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + n_per_i[i] = n; + num_neighbors += n; + } + free(dist2); } //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq3_data[gindex].neighbours = kneighbors_q3xs; + + int * offsets = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(offsets); int counter = 0; for (int i = 0; i < kmap_size; ++i) { - if (kmap_q3xs[i] >= 0) continue; - for (int k = 0; k < 4; ++k) { - int l = (i >> 3*k) & 0x7; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); - int d2 = 0; - for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); - kmap_q3xs[i] = -(counter + 1); - int d2 = dist2[0]; - uint16_t * start = &kneighbors_q3xs[counter++]; - int n = 0, nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - kneighbors_q3xs[counter++] = dist2[2*j+1]; - ++n; - } - *start = n; - } - free(dist2); + if (kmap_q3xs[i] >= 0) { + offsets[i] = -1; + continue; + } + offsets[i] = counter; + counter += 1 + n_per_i[i]; + } + +#ifdef GGML_USE_OPENMP + #pragma omp parallel +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[4]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + int local_counter = offsets[i]; + kmap_q3xs[i] = -(local_counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q3xs[local_counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q3xs[local_counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); + } + free(offsets); + free(n_per_i); } void iq3xs_free_impl(int grid_size) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 1cb8f563d85..d3805772183 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -199,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() { return &guid; } +struct ggml_backend_rpc_device_context { + std::string endpoint; + uint32_t device; + std::string name; + std::string description; + uint64_t last_graph_uid; +}; + struct ggml_backend_rpc_buffer_type_context { std::string endpoint; uint32_t device; @@ -211,7 +219,6 @@ struct ggml_backend_rpc_context { std::string endpoint; uint32_t device; std::string name; - uint64_t last_graph_uid; }; struct ggml_backend_rpc_buffer_context { @@ -691,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend); + ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context; GGML_ASSERT(cgraph->n_nodes > 0); - bool reuse = cgraph->uid != 0 && rpc_ctx->last_graph_uid == cgraph->uid; + bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid; if (reuse) { rpc_msg_graph_recompute_req request; request.device = rpc_ctx->device; @@ -701,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request)); RPC_STATUS_ASSERT(status); } else { - rpc_ctx->last_graph_uid = cgraph->uid; + rpc_dev_ctx->last_graph_uid = cgraph->uid; std::vector input; serialize_graph(rpc_ctx->device, cgraph, input); auto sock = get_socket(rpc_ctx->endpoint); @@ -770,7 +779,6 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) { /* .endpoint = */ endpoint, /* .device = */ device, /* .name = */ dev_name, - /* .last_graph_uid = */ 0, }; auto reg = ggml_backend_rpc_add_server(endpoint); ggml_backend_t backend = new ggml_backend { @@ -1757,15 +1765,6 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir } } -// device interface - -struct ggml_backend_rpc_device_context { - std::string endpoint; - uint32_t device; - std::string name; - std::string description; -}; - static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) { ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; @@ -1947,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) { std::string dev_name = "RPC" + std::to_string(dev_id); std::string dev_desc = std::string(endpoint); ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context { - /* .endpoint = */ endpoint, - /* .device = */ ind, - /* .name = */ dev_name, - /* .description = */ dev_desc + /* .endpoint = */ endpoint, + /* .device = */ ind, + /* .name = */ dev_name, + /* .description = */ dev_desc, + /* .last_graph_uid = */ 0, }; ggml_backend_dev_t dev = new ggml_backend_device { diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 96bc1c98bd9..d8bb3638dfd 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -45,6 +45,7 @@ namespace syclexp = sycl::ext::oneapi::experimental; #define GGML_COMMON_IMPL_SYCL #define SYCL_FLASH_ATTN //remove it to disable FLASH_ATTENTION in building. #define SYCL_FAST_FP16 //don't change. remove it will break fattn-tile.hpp building +#define GGML_SYCL_FA_ALL_QUANTS //define it to enable all quantization types in flash attention. undefine it to only support F16, Q4_0 and Q8_0 in flash attention. /* suppress warning spam */ #pragma clang diagnostic push @@ -224,6 +225,7 @@ struct sycl_device_info { int max_wg_per_cu; // max work groups per compute unit - refer to // cudaOccupancyMaxActiveBlocksPerMultiprocessor bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory size_t total_vram; sycl_hw_info hw_info; optimize_feature opt_feature; @@ -238,10 +240,14 @@ struct ggml_sycl_device_info { std::array default_tensor_split = {}; int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0}; + + bool ext_oneapi_level_zero = true; // sycl::backend::ext_oneapi_level_zero used by all enumerated GPU devices }; const ggml_sycl_device_info & ggml_sycl_info(); +static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128; + struct ggml_sycl_pool { virtual ~ggml_sycl_pool() = default; diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 576f19d79ae..65593402e7d 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -107,6 +107,19 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k, #endif } +template +static void dequantize_row_q3_K_sycl_reorder(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K_reorder(vx, y, item_ct1, nb); + }); +} + template static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k, dpct::queue_ptr stream) { @@ -652,7 +665,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { case GGML_TYPE_Q2_K: return dequantize_row_q2_K_sycl; case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_sycl; + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q3_K_sycl_reorder; + } else { + return dequantize_row_q3_K_sycl; + } case GGML_TYPE_Q4_K: if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { return dequantize_row_q4_K_sycl_reorder; @@ -730,7 +747,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { case GGML_TYPE_Q2_K: return dequantize_row_q2_K_sycl; case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_sycl; + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q3_K_sycl_reorder; + } else { + return dequantize_row_q3_K_sycl; + } case GGML_TYPE_Q4_K: if (dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index 2324bfacd22..ca8cd96c08c 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -20,6 +20,10 @@ typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs, const int iqs, dfloat2 &v); +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m); +#endif + static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib, const int iqs, dfloat2 &v) { const block_q4_0 * x = (const block_q4_0 *) vx; @@ -90,6 +94,474 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib, #endif // GGML_SYCL_F16 } +static __dpct_inline__ void dequantize_q4_K(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_q4_K * x = (const block_q4_K *) vx; + const sycl::half2 dm = x[ib].dm; + const float dall = dm[0]; + const float dmin = dm[1]; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int il = idx / 64; + const int in = idx % 64; + const int is = 2 * il + (in >= 32 ? 1 : 0); + const int off = in & 31; + const int qsi = 32 * il + off; + + uint8_t sc; + uint8_t m; + get_scale_min_k4(is, x[ib].scales, sc, m); + + const uint8_t q = x[ib].qs[qsi]; + const uint8_t qv = (in >= 32) ? (q >> 4) : (q & 0xF); + return sycl::fma((dfloat) qv, (dfloat) (dall * sc), (dfloat) (-dmin * m)); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("Q4_K dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_q2_K(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_q2_K * x = (const block_q2_K *) vx; + const float dall = x[ib].dm[0]; + const float dmin = x[ib].dm[1]; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int n = idx / 128; + const int r = idx % 128; + const int g = r / 32; + const int l = r % 32; + const int is = 8 * n + l / 16; + + const uint8_t q = x[ib].qs[32 * n + l]; + const uint8_t sc = x[ib].scales[is + 2 * g]; + const float d = dall * (sc & 0xF); + const float m = dmin * (sc >> 4); + + return sycl::fma((dfloat) ((q >> (2 * g)) & 3), (dfloat) d, (dfloat) (-m)); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("Q2_K dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_q3_K(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_q3_K * x = (const block_q3_K *) vx; + const float d_all = x[ib].d; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int n = idx / 128; + const int r = idx % 128; + const int j = r / 32; + const int l = r % 32; + + const int is0 = l / 16; + const int is = 8 * n + 2 * j + is0; + const int shift = 2 * j; + const uint8_t m = 1 << (4 * n + j); + + const int8_t us = is < 4 ? (x[ib].scales[is - 0] & 0xF) | (((x[ib].scales[is + 8] >> 0) & 3) << 4) : + is < 8 ? (x[ib].scales[is - 0] & 0xF) | (((x[ib].scales[is + 4] >> 2) & 3) << 4) : + is < 12 ? (x[ib].scales[is - 8] >> 4) | (((x[ib].scales[is + 0] >> 4) & 3) << 4) : + (x[ib].scales[is - 8] >> 4) | (((x[ib].scales[is - 4] >> 6) & 3) << 4); + + const float dl = d_all * (us - 32); + const uint8_t q = x[ib].qs[32 * n + l]; + const uint8_t h = x[ib].hmask[l]; + const int8_t qv = ((q >> shift) & 3) - ((h & m) ? 0 : 4); + + return (dfloat) (dl * qv); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("Q3_K dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_q5_K(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_q5_K * x = (const block_q5_K *) vx; + const float dall = x[ib].dm[0]; + const float dmin = x[ib].dm[1]; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int il = idx / 64; + const int in = idx % 64; + const int is = 2 * il + (in >= 32 ? 1 : 0); + const int ir = (in & 31) / 2; + const int iq = in & 1; + + const uint8_t q = x[ib].qs[32 * il + 2 * ir + iq]; + const uint8_t h = x[ib].qh[2 * ir + iq]; + const uint8_t qv = (in >= 32) ? (q >> 4) : (q & 0xF); + + uint8_t sc; + uint8_t m; + get_scale_min_k4(is, x[ib].scales, sc, m); + + const float d = dall * sc; + const float mn = dmin * m; + const uint8_t hm = 1 << (2 * il + (in >= 32 ? 1 : 0)); + + return sycl::fma((dfloat) (qv + ((h & hm) ? 16 : 0)), (dfloat) d, (dfloat) (-mn)); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("Q5_K dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_q6_K(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_q6_K * x = (const block_q6_K *) vx; + const float d = x[ib].d; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ip = idx / 128; + const int in = idx % 128; + const int il = in & 31; + const int ig = in / 32; + const int is = 8 * ip + il / 16; + + const uint8_t ql0 = x[ib].ql[64 * ip + il]; + const uint8_t ql1 = x[ib].ql[64 * ip + il + 32]; + const uint8_t qh = x[ib].qh[32 * ip + il]; + const int8_t * sc = x[ib].scales + is; + + uint8_t qv; + int8_t scale; + if (ig == 0) { + qv = (ql0 & 0xF) | (((qh >> 0) & 3) << 4); + scale = sc[0]; + } else if (ig == 1) { + qv = (ql1 & 0xF) | (((qh >> 2) & 3) << 4); + scale = sc[2]; + } else if (ig == 2) { + qv = (ql0 >> 4) | (((qh >> 4) & 3) << 4); + scale = sc[4]; + } else { + qv = (ql1 >> 4) | (((qh >> 6) & 3) << 4); + scale = sc[6]; + } + + return (dfloat) (d * scale * ((int8_t) qv - 32)); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("Q6_K dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_mxfp4(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_mxfp4 * x = (const block_mxfp4 *) vx; + const float d = ggml_sycl_e8m0_to_fp32(x[ib].e); + const uint8_t q = x[ib].qs[iqs]; + + v.x() = d * kvalues_mxfp4[q & 0xF] * 0.5f; + v.y() = d * kvalues_mxfp4[q >> 4] * 0.5f; +} + +static __dpct_inline__ void dequantize_q1_0(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_q1_0 * x = (const block_q1_0 *) vx; + const dfloat d = x[ib].d; + + const int bit_index_0 = iqs + 0; + const int bit_index_1 = iqs + 1; + + const int bit_0 = (x[ib].qs[bit_index_0 / 8] >> (bit_index_0 % 8)) & 1; + const int bit_1 = (x[ib].qs[bit_index_1 / 8] >> (bit_index_1 % 8)) & 1; + + v.x() = (2 * bit_0 - 1) * d; + v.y() = (2 * bit_1 - 1) * d; +} + +static __dpct_inline__ void dequantize_nvfp4(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_nvfp4 & xb = ((const block_nvfp4 *) vx)[ib]; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int sub = idx / QK_NVFP4_SUB; + const int j = idx % QK_NVFP4_SUB; + const int jh = j % (QK_NVFP4_SUB / 2); + + const float d = ggml_sycl_ue4m3_to_fp32(xb.d[sub]); + const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + jh]; + const uint8_t qv = (j < (QK_NVFP4_SUB / 2)) ? (q & 0x0F) : (q >> 4); + + return d * kvalues_mxfp4[qv]; + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +} + +static __dpct_inline__ void dequantize_iq2_xxs(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq2_xxs * x = (const block_iq2_xxs *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int il = r / 8; + const int j = r % 8; + + const uint16_t * q2 = x[ib].qs + 4 * ib8; + const uint8_t * aux8 = (const uint8_t *) q2; + const uint8_t * grid = (const uint8_t *) (iq2xxs_grid + aux8[il]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float) x[ib].d * (0.5f + (aux32 >> 28)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> (7 * il)) & 127]; + + return d * grid[j] * ((signs & kmask_iq2xs[j]) ? -1.f : 1.f); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ2_XXS dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_iq2_xs(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int il = r / 8; + const int j = r % 8; + + const uint16_t * q2 = x[ib].qs + 4 * ib8; + const uint8_t * grid = (const uint8_t *) (iq2xs_grid + (q2[il] & 511)); + const float d = (float) x[ib].d * (0.5f + ((x[ib].scales[ib8] >> (4 * (il / 2))) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + + return d * grid[j] * ((signs & kmask_iq2xs[j]) ? -1.f : 1.f); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ2_XS dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_iq2_s(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq2_s * x = (const block_iq2_s *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int il = r / 8; + const int j = r % 8; + + const uint16_t grid_id = x[ib].qs[4 * ib8 + il] | ((x[ib].qh[ib8] << (8 - 2 * il)) & 0x300); + const uint8_t * grid = (const uint8_t *) (iq2s_grid + grid_id); + const float d = (float) x[ib].d * (0.5f + ((x[ib].scales[ib8] >> (4 * (il / 2))) & 0xf)) * 0.25f; + const uint8_t signs = x[ib].qs[QK_K / 8 + 4 * ib8 + il]; + + return d * grid[j] * ((signs & kmask_iq2xs[j]) ? -1.f : 1.f); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ2_S dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_iq3_xxs(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq3_xxs * x = (const block_iq3_xxs *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int il = r / 8; + const int j = r % 8; + + const uint8_t * q3 = x[ib].qs + 8 * ib8; + const uint16_t * gas = (const uint16_t *) (x[ib].qs + QK_K / 4) + 2 * ib8; + const uint8_t * grid1 = (const uint8_t *) (iq3xxs_grid + q3[2 * il + 0]); + const uint8_t * grid2 = (const uint8_t *) (iq3xxs_grid + q3[2 * il + 1]); + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = (float) x[ib].d * (0.5f + (aux32 >> 28)) * 0.5f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> (7 * il)) & 127]; + + if (j < 4) { + return d * grid1[j] * ((signs & kmask_iq2xs[j + 0]) ? -1.f : 1.f); + } + return d * grid2[j - 4] * ((signs & kmask_iq2xs[j + 0]) ? -1.f : 1.f); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ3_XXS dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_iq3_s(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq3_s * x = (const block_iq3_s *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int il = r / 8; + const int j = r % 8; + + const uint8_t * qs = x[ib].qs + 8 * ib8; + const uint16_t grid1_id = qs[2 * il + 0] | ((x[ib].qh[ib8] << (8 - 2 * il)) & 256); + const uint16_t grid2_id = qs[2 * il + 1] | ((x[ib].qh[ib8] << (7 - 2 * il)) & 256); + const uint8_t * grid1 = (const uint8_t *) (iq3s_grid + grid1_id); + const uint8_t * grid2 = (const uint8_t *) (iq3s_grid + grid2_id); + const float d = (float) x[ib].d * (1 + 2 * ((x[ib].scales[ib8 / 2] >> (4 * (ib8 % 2))) & 0xf)); + const uint8_t signs = x[ib].signs[4 * ib8 + il]; + + if (j < 4) { + return d * grid1[j] * ((signs & kmask_iq2xs[j + 0]) ? -1.f : 1.f); + } + return d * grid2[j - 4] * ((signs & kmask_iq2xs[j + 0]) ? -1.f : 1.f); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ3_S dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_iq1_s(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq1_s * x = (const block_iq1_s *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int il = r / 8; + const int j = r % 8; + + const float delta = (x[ib].qh[ib8] & 0x8000) ? (-1.f - IQ1S_DELTA) : (-1.f + IQ1S_DELTA); + const float d = (float) x[ib].d * (2 * ((x[ib].qh[ib8] >> 12) & 7) + 1); + const uint16_t grid_id = x[ib].qs[4 * ib8 + il] | (((x[ib].qh[ib8] >> (3 * il)) & 7) << 8); + const uint32_t g = iq1s_grid_gpu[grid_id]; + const int8_t qv = (j < 4) ? ((g >> (8 * j)) & 0x0F) : ((g >> (8 * (j - 4) + 4)) & 0x0F); + + return d * (qv + delta); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ1_S dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_iq1_m(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq1_m * x = (const block_iq1_m *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int il = r / 8; + const int j = r % 8; + + const uint16_t * sc = (const uint16_t *) x[ib].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + const int ib16 = 2 * ib8 + il / 2; + const float d = (float) scale.f16 * (2 * ((sc[ib16 / 4] >> (3 * (ib16 % 4))) & 0x7) + 1); + + const uint8_t qh = x[ib].qh[2 * ib8 + il / 2]; + const float delta = (qh & (0x08 << (4 * (il % 2)))) ? (-1.f - IQ1M_DELTA) : (-1.f + IQ1M_DELTA); + + const uint16_t grid_id = x[ib].qs[4 * ib8 + il] | (((qh >> (4 * (il % 2))) & 7) << 8); + const uint32_t g = iq1s_grid_gpu[grid_id]; + const int8_t qv = (j < 4) ? ((g >> (8 * j)) & 0x0F) : ((g >> (8 * (j - 4) + 4)) & 0x0F); + + return d * (qv + delta); + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ1_M dequantize not supported for QK_K != 256"); +#endif +} + +static __dpct_inline__ void dequantize_iq4_nl(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_iq4_nl * x = (const block_iq4_nl *) vx; + const float d = (float) x[ib].d; + + auto dequantize_one = [&](const int idx) -> dfloat { + if (idx < 16) { + return d * kvalues_iq4nl[x[ib].qs[idx] & 0xF]; + } + return d * kvalues_iq4nl[x[ib].qs[idx - 16] >> 4]; + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +} + +static __dpct_inline__ void dequantize_iq4_xs(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { +#if QK_K == 256 + const block_iq4_xs * x = (const block_iq4_xs *) vx; + + auto dequantize_one = [&](const int idx) -> dfloat { + const int ib8 = idx / 32; + const int r = idx % 32; + const int byte_idx = (r < 16) ? r : (r - 16); + const uint8_t q = x[ib].qs[16 * ib8 + byte_idx]; + const uint8_t qv = (r < 16) ? (q & 0x0F) : (q >> 4); + + const float d = (float) x[ib].d * ((((x[ib].scales_l[ib8 / 2] >> (4 * (ib8 % 2))) & 0xf) | + (((x[ib].scales_h >> (2 * ib8)) & 3) << 4)) - 32); + return d * kvalues_iq4nl[qv]; + }; + + v.x() = dequantize_one(iqs + 0); + v.y() = dequantize_one(iqs + 1); +#else + GGML_ABORT("IQ4_XS dequantize not supported for QK_K != 256"); +#endif +} + static __dpct_inline__ void dequantize_q5_0(const void *vx, const int64_t ib, const int iqs, dfloat2 &v) { const block_q5_0 * x = (const block_q5_0 *) vx; @@ -390,6 +862,63 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri } +template +static void dequantize_block_q3_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> & item_ct1, int64_t n_blocks) { +#if QK_K == 256 + const int64_t i = item_ct1.get_group(2); + if (i >= n_blocks) { + return; + } + + const uint8_t * base = static_cast(vx); + const size_t qs_offset = i * (QK_K / 4); + const size_t hmask_offset = n_blocks * (QK_K / 4) + i * (QK_K / 8); + const size_t scales_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + i * 12; + const size_t d_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + n_blocks * 12 + + i * sizeof(ggml_half); + + const uint8_t * qs = base + qs_offset; + const uint8_t * hmask = base + hmask_offset; + const uint8_t * scales = base + scales_offset; + const float d_all = static_cast(*reinterpret_cast(base + d_offset)); + + const int64_t r = item_ct1.get_local_id(2) / 4; + const int64_t tid = r / 2; + const int64_t is0 = r % 2; + const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4); + const int64_t n = tid / 4; + const int64_t j = tid - 4 * n; + const int64_t is = 8 * n + 2 * j + is0; + const int shift = 2 * j; + uint8_t m = 1 << (4 * n + j); + + uint8_t us = is < 4 + ? (scales[is - 0] & 0xF) | (((scales[is + 8] >> 0) & 3) << 4) + : is < 8 + ? (scales[is - 0] & 0xF) | (((scales[is + 4] >> 2) & 3) << 4) + : is < 12 + ? (scales[is - 8] >> 4) | (((scales[is + 0] >> 4) & 3) << 4) + : (scales[is - 8] >> 4) | (((scales[is - 4] >> 6) & 3) << 4); + + const float dl = d_all * (us - 32); + + dst_t * y = yy + i * QK_K + 128 * n + 32 * j; + const uint8_t * q = qs + 32 * n; + const uint8_t * hm = hmask; + + for (int l = l0; l < l0 + 4; ++l) { + y[l] = dl * ((int8_t) ((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); + } +#else + GGML_UNUSED(vx); + GGML_UNUSED(yy); + GGML_UNUSED(item_ct1); + GGML_UNUSED(n_blocks); + GGML_ABORT("Q3_K reorder dequantize not supported for QK_K != 256"); +#endif +} + #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 5577bf73b28..d80b0a38219 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -3,6 +3,13 @@ #include "dequantize.hpp" #include "presets.hpp" +#if defined(__INTEL_LLVM_COMPILER) + #if __has_include() + #include + #define GGML_SYCL_DMMV_HAS_BF16 + #endif +#endif + static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ const sycl::half *x = (const sycl::half *)vx; @@ -11,6 +18,16 @@ static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat v.y() = x[ib + iqs + 1]; } +#ifdef GGML_SYCL_DMMV_HAS_BF16 +static void convert_bf16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const sycl::ext::oneapi::bfloat16 *x = (const sycl::ext::oneapi::bfloat16 *)vx; + + // automatic bfloat16 -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} +#endif + static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ const float * x = (const float *) vx; @@ -217,6 +234,28 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, } } +#ifdef GGML_SYCL_DMMV_HAS_BF16 +static void convert_mul_mat_vec_bf16_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + // The qk=1 kernel iterates with stride 2*GGML_SYCL_DMMV_X, so ncols must be a + // multiple of that โ€” not just GGML_SYCL_DMMV_X โ€” to avoid out-of-bounds reads. + GGML_ASSERT(ncols % (2*GGML_SYCL_DMMV_X) == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<1, 1, convert_bf16>(vx, y, dst, ncols, + nrows, item_ct1); + }); + } +} +#endif + /* DPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register @@ -462,6 +501,103 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, } } +static void dequantize_mul_mat_vec_q3_k_reorder(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + // SOA base pointers for the reordered layout: + // [qs: nb * (QK_K/4)] [hmask: nb * (QK_K/8)] [scales: nb * 12] [d: nb * sizeof(half)] + const int nb = nrows * num_blocks_per_row; + const uint8_t * qs_base = (const uint8_t *)vx; + const uint8_t * hmask_base = qs_base + (size_t)nb * (QK_K / 4); + const uint8_t * scales_base = hmask_base + (size_t)nb * (QK_K / 8); + const sycl::half * d_base = (const sycl::half *)(scales_base + (size_t)nb * 12); + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + const int bi = ib0 + i; + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = qs_base + bi * (QK_K / 4) + q_offset; + const uint8_t * h = hmask_base + bi * (QK_K / 8) + l0; + + const uint16_t * a = (const uint16_t *)(scales_base + bi * 12); + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = d_base[bi]; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + } +#else + GGML_UNUSED(vx); + GGML_UNUSED(yy); + GGML_UNUSED(ncols); + GGML_UNUSED(item_ct1); + GGML_ABORT("Q3_K reorder DMMV not supported for QK_K != 256"); +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + /* DPCT1110:6: The total declared local variable size in device function dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register @@ -1401,6 +1537,22 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, }); } +static void dequantize_mul_mat_vec_q3_K_sycl_reorder(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q3_k_reorder(vx, y, dst, ncols, nrows, item_ct1); + }); +} + static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, float *dst, const int ncols, const int nrows, @@ -1497,7 +1649,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec( bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 || + src0->type == GGML_TYPE_BF16; if (src1_convert_f16) { scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2, @@ -1541,7 +1694,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec( dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); break; case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && + ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + dequantize_mul_mat_vec_q3_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + } else { + dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + } break; case GGML_TYPE_Q4_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && @@ -1565,6 +1723,11 @@ void ggml_sycl_op_dequantize_mul_mat_vec( case GGML_TYPE_F16: convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; +#ifdef GGML_SYCL_DMMV_HAS_BF16 + case GGML_TYPE_BF16: + convert_mul_mat_vec_bf16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; +#endif default: printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-sycl/fattn-common.hpp b/ggml/src/ggml-sycl/fattn-common.hpp index 03f0c2623c8..c6cc13cfb00 100644 --- a/ggml/src/ggml-sycl/fattn-common.hpp +++ b/ggml/src/ggml-sycl/fattn-common.hpp @@ -1031,7 +1031,7 @@ void launch_fattn( auto KV_max_ptr_ct1 = KV_max.ptr; cgh.parallel_for(sycl::nd_range<3>(blocks_num_KV_max * block_dim_KV_max, block_dim_KV_max), - [=](sycl::nd_item<3> item_ct1) { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(warp_size)]] { GGML_UNUSED(item_ct1); flash_attn_mask_to_KV_max( mask_data_ct0, KV_max_ptr_ct1, iter_k, s31, s33, @@ -1149,7 +1149,7 @@ void launch_fattn( auto K_ne_ct6 = K->ne[2]; cgh.parallel_for(sycl::nd_range<3>(blocks_num_combine * block_dim_combine, block_dim_combine), - [=](sycl::nd_item<3> item_ct1) { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(warp_size)]] { GGML_UNUSED(item_ct1); flash_attn_stream_k_fixup(KQV_data_ct0, dst_tmp_meta_ptr_ct1, Q_ne_ct2, Q_ne_ct3, Q_ne_ct4, @@ -1169,7 +1169,7 @@ void launch_fattn( auto KQV_data_ct2 = (float *) KQV->data; cgh.parallel_for(sycl::nd_range<3>(blocks_num_combine * block_dim_combine, block_dim_combine), - [=](sycl::nd_item<3> item_ct1) { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(warp_size)]] { GGML_UNUSED(item_ct1); flash_attn_combine_results( dst_tmp_ptr_ct0, dst_tmp_meta_ptr_ct1, KQV_data_ct2, parallel_blocks, diff --git a/ggml/src/ggml-sycl/gated_delta_net.cpp b/ggml/src/ggml-sycl/gated_delta_net.cpp index ebc587524bf..9c2449aba0c 100644 --- a/ggml/src/ggml-sycl/gated_delta_net.cpp +++ b/ggml/src/ggml-sycl/gated_delta_net.cpp @@ -6,7 +6,7 @@ #include -template +template void gated_delta_net_sycl(const float * q, const float * k, const float * v, @@ -28,7 +28,8 @@ void gated_delta_net_sycl(const float * q, int64_t sb3, const sycl::uint3 neqk1_magic, const sycl::uint3 rq3_magic, - float scale) { + float scale, + int K) { auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); const uint32_t h_idx = item_ct1.get_group(2); const uint32_t sequence = item_ct1.get_group(1); @@ -43,9 +44,13 @@ void gated_delta_net_sycl(const float * q, float * attn_data = dst; float * state = dst + attn_score_elems; - const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v; - state += state_offset; - curr_state += state_offset; + // input state layout (D, K, n_seqs) โ€” seq stride is K * D = K * H * S_v * S_v. + // output state layout (per-slot D * n_seqs) โ€” same per-(seq,head) offset as before. + const int64_t state_in_offset = sequence * K * H * S_v * S_v + h_idx * S_v * S_v; + const int64_t state_out_offset = (sequence * H + h_idx) * S_v * S_v; + const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output + state += state_out_offset; + curr_state += state_in_offset + col * S_v; attn_data += (sequence * n_tokens * H + h_idx) * S_v; constexpr int warp_size = ggml_sycl_get_physical_warp_size() < S_v ? ggml_sycl_get_physical_warp_size() : S_v; @@ -55,9 +60,13 @@ void gated_delta_net_sycl(const float * q, #pragma unroll for (int r = 0; r < rows_per_lane; r++) { const int i = r * warp_size + lane; - s_shard[r] = curr_state[col * S_v + i]; + s_shard[r] = curr_state[i]; } + // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots + // are written; earlier slots are left untouched (caller-owned). + const int shift = (int) n_tokens - K; + for (int t = 0; t < n_tokens; t++) { const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1; const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1; @@ -131,17 +140,32 @@ void gated_delta_net_sycl(const float * q, } attn_data += S_v * H; - } + // Write state back to global memory + if constexpr (keep_rs_t) { + const int target_slot = t - shift; + if (target_slot >= 0 && target_slot < K) { + float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset; #pragma unroll - for (int r = 0; r < rows_per_lane; r++) { - const int i = r * warp_size + lane; - state[col * S_v + i] = s_shard[r]; + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + curr_state[col * S_v + i] = s_shard[r]; + } + } + } + } + + if constexpr (!keep_rs_t) { +#pragma unroll + for (int r = 0; r < rows_per_lane; r++) { + const int i = r * warp_size + lane; + state[col * S_v + i] = s_shard[r]; + } } } -template +template static void launch_gated_delta_net(const float * q_d, const float * k_d, const float * v_d, @@ -165,6 +189,7 @@ static void launch_gated_delta_net(const float * q_d, int64_t neqk1, int64_t rq3, float scale, + int K, dpct::queue_ptr stream) { //TODO: Add chunked kernel for even faster pre-fill const int warp_size = ggml_sycl_info().devices[ggml_sycl_get_device()].warp_size; @@ -182,9 +207,9 @@ static void launch_gated_delta_net(const float * q_d, constexpr int sv = 16; stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - gated_delta_net_sycl(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, + gated_delta_net_sycl(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, - sb3, neqk1_magic, rq3_magic, scale); + sb3, neqk1_magic, rq3_magic, scale, K); }); } break; @@ -193,9 +218,9 @@ static void launch_gated_delta_net(const float * q_d, constexpr int sv = 32; stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - gated_delta_net_sycl(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, + gated_delta_net_sycl(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, - sb3, neqk1_magic, rq3_magic, scale); + sb3, neqk1_magic, rq3_magic, scale, K); }); } break; @@ -204,9 +229,9 @@ static void launch_gated_delta_net(const float * q_d, constexpr int sv = 64; stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - gated_delta_net_sycl( + gated_delta_net_sycl( q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, - sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); }); } break; @@ -216,9 +241,9 @@ static void launch_gated_delta_net(const float * q_d, constexpr int sv = 128; stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - gated_delta_net_sycl( + gated_delta_net_sycl( q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, - sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale); + sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); }); } break; @@ -290,14 +315,30 @@ void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor * dpct::queue_ptr stream = ctx.stream(); + // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count. + const int K = (int) src_state->ne[1]; + const bool keep_rs = K > 1; + if (kda) { - launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, - S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1, rq3, scale, stream); + if (keep_rs) { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } else { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } } else { - launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, - S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, - sb1, sb2, sb3, neqk1, rq3, scale, stream); + if (keep_rs) { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } else { + launch_gated_delta_net(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, + S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, + sb1, sb2, sb3, neqk1, rq3, scale, K, stream); + } } } diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index ca457454775..298f247f84e 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -129,11 +129,11 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr GGML_UNUSED(ctx); } -template +template static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const src0_t *src0_dd, const int32_t *src1_dd, - float *dst_dd, queue_ptr stream) { + dst_t *dst_dd, queue_ptr stream) { GGML_TENSOR_BINARY_OP_LOCALS @@ -170,7 +170,7 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_I32 ); GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); @@ -191,6 +191,66 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); break; + case GGML_TYPE_I32: + get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const int32_t *)dst->src[0]->data, + src1_i32, (int32_t *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q1_0: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_MXFP4: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_NVFP4: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ2_XXS: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ2_XS: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ2_S: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ3_XXS: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ1_S: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ1_M: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ3_S: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ4_NL: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_IQ4_XS: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q2_K: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q3_K: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; case GGML_TYPE_Q4_0: get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); @@ -199,6 +259,10 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); break; + case GGML_TYPE_Q4_K: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; case GGML_TYPE_Q5_0: get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); @@ -207,6 +271,14 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); break; + case GGML_TYPE_Q5_K: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q6_K: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; case GGML_TYPE_Q8_0: get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, src1_i32, (float *)dst->data, ctx.stream()); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f5d10b56de0..96138f57ebe 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,11 @@ #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC # include #endif +#if SYCL_EXT_ONEAPI_VIRTUAL_MEM +# include +# include +# define GGML_SYCL_USE_VMM +#endif #include #include "ggml.h" @@ -70,8 +76,10 @@ int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; int g_ggml_sycl_disable_graph = 0; int g_ggml_sycl_disable_dnn = 0; +int g_ggml_sycl_enable_vmm = 1; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; +int g_ggml_sycl_use_async_mem_op_requested = 1; int g_ggml_sycl_enable_level_zero = 0; int g_ggml_sycl_enable_flash_attention = 1; @@ -95,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() { // GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); // #endif for (int i = 0; i < info.device_count; ++i) { - info.devices[i].vmm = 0; dpct::device_info prop; - sycl::device device = dpct::dev_mgr::instance().get_device(i); + auto & device = dpct::dev_mgr::instance().get_device(i); SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( prop, device))); +#if !defined(GGML_SYCL_USE_VMM) + info.devices[i].vmm = 0; +#else + info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem); + if (info.devices[i].vmm) { + // NB: SYCL's get_mem_granularity always returns the _minimum_ granularity, + // but the L0 API requires a larger page size for allocs above 2 MiB and + // rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic]. + // Here we clamp it to 2 MiB for simplicity, but other devices may require + // calling zeVirtualMemQueryPageSize or yet unexposed public API. + const size_t physical_page = 2ull << 20; // 2 MiB + info.devices[i].vmm_granularity = std::max( + sycl::ext::oneapi::experimental::get_mem_granularity( + device, sycl::context(device)), + physical_page); + } +#endif + info.default_tensor_split[i] = total_vram; total_vram += prop.get_global_mem_size(); @@ -116,6 +141,12 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units(); info.devices[i].hw_info = get_device_hw_info(&device); + // Only check GPU devices; CPU devices use OpenCL and would otherwise + // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set. + if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) { + GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i); + info.ext_oneapi_level_zero = false; + } } for (int id = 0; id < info.device_count; ++id) { @@ -227,28 +258,13 @@ static void ggml_check_sycl() try { g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); + g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1); + g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero); #else g_ggml_sycl_enable_level_zero = 0; #endif - if (g_ggml_sycl_enable_level_zero) { - // Verify all GPU devices use the Level Zero backend before enabling L0 APIs. - // Only check GPU devices; CPU devices use OpenCL and would otherwise - // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set. - for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) { - auto & q = dpct::dev_mgr::instance().get_device(i).default_queue(); - if (!q.get_device().is_gpu()) { - continue; - } - if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) { - GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i); - g_ggml_sycl_enable_level_zero = 0; - break; - } - } - } #ifdef SYCL_FLASH_ATTN g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1); @@ -284,6 +300,11 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n"); #endif +#if defined(GGML_SYCL_USE_VMM) + GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n"); +#else + GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n"); +#endif GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); @@ -302,8 +323,15 @@ static void ggml_check_sycl() try { GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn); #else GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n"); +#endif +#if defined(GGML_SYCL_USE_VMM) + GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm); +#else + GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n"); #endif GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); + g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1); + GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested); #ifdef SYCL_FLASH_ATTN GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention); @@ -319,11 +347,11 @@ static void ggml_check_sycl() try { fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif */ - // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be - // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in - // other places. + // Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder + // staging path while preserving queue ordering semantics. Graph support still depends on the extension being + // available, but it no longer needs to control the non-graph fast path. #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC - g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph; + g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph; if (g_ggml_sycl_use_async_mem_op) { for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) { if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) { @@ -761,7 +789,7 @@ catch (sycl::exception const &exc) { } static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; + return SYCL_BUFFER_ALIGNMENT; GGML_UNUSED(buft); } @@ -1184,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg } static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; + return SYCL_BUFFER_ALIGNMENT; GGML_UNUSED(buft); } @@ -1469,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { } }; +// pool with virtual memory management +#if defined(GGML_SYCL_USE_VMM) +struct ggml_sycl_pool_vmm : public ggml_sycl_pool { + static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + + int device; + sycl::context ctx; + sycl::device dev; + + uintptr_t pool_addr = 0; + size_t pool_used = 0; + size_t pool_size = 0; + size_t granularity; + + // physical_mem owns the commits (unlike cuMemMap) + struct mapping { + sycl::ext::oneapi::experimental::physical_mem phys; + void * map_ptr; + }; + std::vector mappings; + + explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) : + device(device_), + ctx(qptr_->get_context()), + dev(qptr_->get_device()), + granularity(ggml_sycl_info().devices[device_].vmm_granularity) { + } + + ~ggml_sycl_pool_vmm() { + if (pool_addr == 0) { + return; + } + + // Per spec, unmap must (a) match the exact (ptr, size) of an earlier + // physical_mem::map() call and (b) precede destruction of the + // physical_mem objects (their dtors won't unmap). + for (auto & m : mappings) { + SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap( + m.map_ptr, m.phys.size(), ctx))); + } + SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem( + pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx))); + } + + void * alloc(size_t size, size_t * actual_size) override { + // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types + size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT); + + size_t avail = pool_size - pool_used; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = GGML_PAD(size - avail, granularity); + + GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE); + + // allocate more physical memory + std::optional phys; + SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size))); + + // reserve virtual address space (if not already reserved) + if (pool_addr == 0) { + SYCL_CHECK(CHECK_TRY_ERROR( + pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem( + SYCL_POOL_VMM_MAX_SIZE, ctx))); + } + + // map at the end of the pool + void * map_ptr = nullptr; + SYCL_CHECK(CHECK_TRY_ERROR( + map_ptr = phys->map(pool_addr + pool_size, reserve_size, + sycl::ext::oneapi::experimental::address_access_mode::read_write))); + + // stash these so we could unmap this exact range in dtor + mappings.push_back({ + std::move(*phys), + map_ptr, + }); + + // add to the pool + pool_size += reserve_size; + +#ifdef DEBUG_SYCL_MALLOC + GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + device, (unsigned long long) (pool_size/1024/1024), + (unsigned long long) (reserve_size/1024/1024)); +#endif + } + + GGML_ASSERT(pool_addr != 0); + + void * ptr = reinterpret_cast(pool_addr + pool_used); + *actual_size = size; + pool_used += size; + +#ifdef DEBUG_SYCL_MALLOC + GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr); +#endif + + return ptr; + } + + void free(void * ptr, size_t size) override { +#ifdef DEBUG_SYCL_MALLOC + GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr); +#endif + + pool_used -= size; + + // all deallocations must be in reverse order of the allocations + GGML_ASSERT(ptr == reinterpret_cast(pool_addr + pool_used)); + } +}; +#endif // defined(GGML_SYCL_USE_VMM) + struct ggml_sycl_pool_host : public ggml_sycl_pool { queue_ptr qptr; int device; @@ -1549,20 +1692,19 @@ std::unique_ptr ggml_backend_sycl_context::new_pool_for_host(que } std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { - // TBD: NO VMM support - // if (ggml_sycl_info().devices[device].vmm) { - // return std::unique_ptr(new ggml_sycl_pool_vmm(device)); - // } - return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); +#if defined(GGML_SYCL_USE_VMM) + if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) { + return std::unique_ptr(new ggml_sycl_pool_vmm(qptr, device)); + } +#endif // defined(GGML_SYCL_USE_VMM) + return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); } + std::unique_ptr ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) { return std::unique_ptr(new ggml_sycl_fattn_kv_buffers(qptr, device)); } -// TBD pool with virtual memory management -// struct ggml_sycl_pool_vmm : public ggml_sycl_pool - /// kernels typedef void (*ggml_sycl_op_mul_mat_t)( ggml_backend_sycl_context & ctx, @@ -2385,21 +2527,25 @@ inline void ggml_sycl_op_mul_mat_sycl( const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get(); + { + const int64_t gemm_flops = (int64_t)row_diff * src1_ncols * ne10; + const bool use_mkl_direct = gemm_flops < 256 * 256 * 256; #if GGML_SYCL_DNNL - if (!g_ggml_sycl_disable_dnn) { - DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i, - DnnlGemmWrapper::to_dt(), src1_ddf1_i, DnnlGemmWrapper::to_dt(), - dst_dd_i, DnnlGemmWrapper::to_dt(), stream); - } - else + if (!g_ggml_sycl_disable_dnn && !use_mkl_direct) { + DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i, + DnnlGemmWrapper::to_dt(), src1_ddf1_i, DnnlGemmWrapper::to_dt(), + dst_dd_i, DnnlGemmWrapper::to_dt(), stream); + } + else #endif - { - const float alpha = 1.0f; - const float beta = 0.0f; - SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( - *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, - src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, - dpct::get_value(&beta, *stream), dst_dd_i, ldc))); + { + const float alpha = 1.0f; + const float beta = 0.0f; + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( + *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, + src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, + dpct::get_value(&beta, *stream), dst_dd_i, ldc))); + } } } GGML_UNUSED(dst); @@ -3403,6 +3549,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; + case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -3426,6 +3573,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: + case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -3448,6 +3596,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) { case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_F16: + case GGML_TYPE_BF16: return true; default: return false; @@ -3644,6 +3793,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d return true; } +static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) { + GGML_ASSERT(size % sizeof(block_q3_K) == 0); + GGML_ASSERT(offset % sizeof(block_q3_K) == 0); + + const int nblocks = size / sizeof(block_q3_K); + + sycl_reorder_temp_buffer tmp(stream, size); + if (!tmp) { + GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size); + return false; + } + uint8_t * tmp_buf = static_cast(tmp.ptr); + + sycl::event copy_event; + SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size))); + if (!g_ggml_sycl_use_async_mem_op) { + copy_event.wait(); + } + + auto * qs_ptr = data_device; + auto * hmask_ptr = qs_ptr + (QK_K / 4) * nblocks; + auto * scales_ptr = hmask_ptr + (QK_K / 8) * nblocks; + sycl::half * d_ptr = (sycl::half *) (scales_ptr + 12 * nblocks); + + auto reorder_event = stream->parallel_for(nblocks, [=](auto i) { + const block_q3_K * x = (const block_q3_K *) tmp_buf; + const int ib = i; + + for (int j = 0; j < QK_K / 4; ++j) { + qs_ptr[ib * (QK_K / 4) + j] = x[ib].qs[j]; + } + + for (int j = 0; j < QK_K / 8; ++j) { + hmask_ptr[ib * (QK_K / 8) + j] = x[ib].hmask[j]; + } + + for (int j = 0; j < 12; ++j) { + scales_ptr[ib * 12 + j] = x[ib].scales[j]; + } + + d_ptr[ib] = x[ib].d; + }); + if (!g_ggml_sycl_use_async_mem_op) { + reorder_event.wait_and_throw(); + } + return true; +} + static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) { GGML_ASSERT(size % sizeof(block_q5_K) == 0); GGML_ASSERT(offset % sizeof(block_q5_K) == 0); @@ -3756,6 +3953,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream); case GGML_TYPE_Q8_0: return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream); + case GGML_TYPE_Q3_K: + return reorder_qw_q3_k(data_device, size, 0, stream); case GGML_TYPE_Q4_K: return reorder_qw_q4_k(data_device, size, 0, stream); case GGML_TYPE_Q5_K: @@ -3811,8 +4010,13 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be + // a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only + // need ne[0] % DMMV_X == 0. + const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ? + 2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X; return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && - src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1; + src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1; } static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3916,35 +4120,17 @@ struct mmid_row_mapping { __dpct_inline__ static void k_copy_src1_to_contiguous( const char *__restrict__ src1_original, char *__restrict__ src1_contiguous, - int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping, - const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0, + const mmid_row_mapping *__restrict__ row_mapping, int64_t ne11, int64_t ne10, size_t nb11, size_t nb12, - const sycl::nd_item<3> &item_ct1, int &src1_row) { - int32_t iid1 = item_ct1.get_group(2); - int32_t id = item_ct1.get_group(1); - - const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0); + const sycl::nd_item<3> &item_ct1) { + const int32_t src1_row = item_ct1.get_group(2); - if (row_id_i != i02) { - return; - } + const int32_t iid1 = row_mapping[src1_row].i2; + const int32_t id = row_mapping[src1_row].i1; const int64_t i11 = id % ne11; const int64_t i12 = iid1; - if (item_ct1.get_local_id(2) == 0) { - src1_row = - dpct::atomic_fetch_add( - cur_src1_row, 1); - row_mapping[src1_row] = {id, iid1}; - } - /* - DPCT1065:194: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better - performance if there is no access to global memory. - */ - item_ct1.barrier(); - const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12); float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11); @@ -4019,6 +4205,47 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused( src1_row_stride, stream); } +// counting sort of the routed rows by expert id (row_id_i, as chosen by the router): +// builds a projection of a memory layout where each expert's slice is contiguous +static void mmid_counting_sort_rows( + const ggml_tensor * ids, const char * ids_host, + int64_t n_ids, int64_t n_as, int64_t n_routed_rows, + std::vector & expert_counts, + std::vector & expert_row_offsets, + std::vector & routed_row_src) { + + // frequencies: how many routed rows each expert "owns" + expert_counts.assign(n_as, 0); + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]); + GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); + expert_counts[row_id_i]++; + } + } + + // where each expert's slice starts (row indices) and the previous ends + expert_row_offsets.assign(n_as + 1, 0); + for (int64_t i02 = 0; i02 < n_as; i02++) { + expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02]; + } + + std::vector expert_row_next = expert_row_offsets; + routed_row_src.resize(n_routed_rows); + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]); + GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); + + // find and validate the next free row for a given expert (row_id_i) + const int64_t routed_row = expert_row_next[row_id_i]++; + GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]); + GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]); + routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1}; + } + } +} + static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, ggml_tensor *dst) try { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3); @@ -4097,99 +4324,91 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, src1_row.data = src1_contiguous.get(); dst_row.data = dst_contiguous.get(); - for (int64_t i02 = 0; i02 < n_as; i02++) { - int64_t num_src1_rows = 0; - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + // how many "owned" routed rows to pass to each expert + std::vector expert_row_counts; + // where each expert's slice starts and the previous ends (row indices, right-exclusive) + std::vector expert_row_offsets; + // the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous + std::vector routed_row_src; - GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); + mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows, + expert_row_counts, expert_row_offsets, routed_row_src); - if (row_id_i != i02) { - continue; - } + ggml_sycl_pool_alloc dev_row_mapping(ctx.pool(), n_routed_rows); + SYCL_CHECK(CHECK_TRY_ERROR( + stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping)))); - num_src1_rows++; - } - } + const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device]; + assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0); + + { + sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size)); + sycl::range<3> grid_dims(1, 1, n_routed_rows); + stream->submit([&](sycl::handler &cgh) { + char *__restrict src1_contiguous_get = + src1_contiguous.get(); + mmid_row_mapping *__restrict dev_row_mapping_get = + dev_row_mapping.get(); + + cgh.parallel_for( + sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_copy_src1_to_contiguous( + src1_original, src1_contiguous_get, + dev_row_mapping_get, + ne11, ne10, nb11, nb12, + item_ct1); + }); + }); + } + + for (int64_t i02 = 0; i02 < n_as; i02++) { + const int64_t num_src1_rows = expert_row_counts[i02]; if (num_src1_rows == 0) { continue; } - - ggml_sycl_pool_alloc dev_cur_src1_row(ctx.pool(), 1); - ggml_sycl_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); - SYCL_CHECK(CHECK_TRY_ERROR( - stream->memset(dev_cur_src1_row.get(), 0, sizeof(int)))); - - const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device]; - assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0); - - { - sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size)); - sycl::range<3> grid_dims(1, n_ids, ids->ne[1]); - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor src1_row_acc(cgh); - - char *__restrict src1_contiguous_get = - src1_contiguous.get(); - int *__restrict dev_cur_src1_row_get = - dev_cur_src1_row.get(); - mmid_row_mapping *__restrict dev_row_mapping_get = - dev_row_mapping.get(); - size_t ids_nb_ct6 = ids->nb[1]; - size_t ids_nb_ct7 = ids->nb[0]; - - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_copy_src1_to_contiguous( - src1_original, src1_contiguous_get, - dev_cur_src1_row_get, - dev_row_mapping_get, ids_dev, i02, - ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12, - item_ct1, src1_row_acc); - }); - }); - } + const int64_t expert_row_offset = expert_row_offsets[i02]; src0_row.data = src0_original + i02*nb02; GGML_ASSERT(nb11 == sizeof(float)*ne10); GGML_ASSERT(nb1 == sizeof(float)*ne0); + src1_row.data = src1_contiguous.get() + expert_row_offset*nb11; src1_row.ne[1] = num_src1_rows; src1_row.nb[1] = nb11; src1_row.nb[2] = num_src1_rows*nb11; src1_row.nb[3] = num_src1_rows*nb11; + dst_row.data = dst_contiguous.get() + expert_row_offset*nb1; dst_row.ne[1] = num_src1_rows; dst_row.nb[1] = nb1; dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1; ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + } - { - sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size)); - sycl::range<3> grid_dims(1, 1, num_src1_rows); - stream->submit([&](sycl::handler &cgh) { - const char *__restrict dst_contiguous_get = - dst_contiguous.get(); - const mmid_row_mapping *__restrict dev_row_mapping_get = - dev_row_mapping.get(); - - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_copy_dst_from_contiguous(dst_original, - dst_contiguous_get, - dev_row_mapping_get, - ne0, nb1, nb2, item_ct1); - }); - }); - } + { + sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size)); + sycl::range<3> grid_dims(1, 1, n_routed_rows); + stream->submit([&](sycl::handler &cgh) { + const char *__restrict dst_contiguous_get = + dst_contiguous.get(); + const mmid_row_mapping *__restrict dev_row_mapping_get = + dev_row_mapping.get(); + + cgh.parallel_for( + sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_copy_dst_from_contiguous(dst_original, + dst_contiguous_get, + dev_row_mapping_get, + ne0, nb1, nb2, item_ct1); + }); + }); } } } @@ -5082,13 +5301,31 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { + case GGML_TYPE_I32: case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_F32: + case GGML_TYPE_Q1_0: + case GGML_TYPE_MXFP4: + case GGML_TYPE_NVFP4: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_0: return true; default: diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 49998f13ba8..abd1e49a70e 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -770,6 +770,26 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, } } +static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, + const int nrows, dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + + // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; + + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); + }); +} + static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -1153,7 +1173,15 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && + ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q3_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, + stream); + } else { + GGML_SYCL_DEBUG("Calling mul_mat_vec_q3_K_q8_1_sycl\n"); + mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_Q4_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp index 806028ef3a3..95287f17510 100644 --- a/ggml/src/ggml-sycl/quants.hpp +++ b/ggml/src/ggml-sycl/quants.hpp @@ -58,6 +58,31 @@ template <> struct block_q_t { static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } }; +template <> struct block_q_t { + struct traits { + static constexpr uint32_t qk = QK_K; + static constexpr uint32_t qi = QI3_K; + static constexpr uint32_t qr = QR3_K; + static constexpr uint32_t vdr_mmvq = 1; + }; + + // Reordered layout: [qs (QK_K/4 per block)] [hmask (QK_K/8 per block)] [scales] [d] + static constexpr std::pair get_block_offset(const int block_index, const int n_blocks) { + auto qs_offset = block_index * (QK_K / 4); + auto hmask_offset = n_blocks * (QK_K / 4) + block_index * (QK_K / 8); + return { qs_offset, hmask_offset }; + } + + static constexpr std::pair get_d_offset(int nrows, int ncols, const int block_index) { + auto nblocks = (nrows * (ncols / QK_K)); + auto total_qs_bytes = nblocks * (QK_K / 4) + nblocks * (QK_K / 8); + return { total_qs_bytes + block_index * 12, + total_qs_bytes + nblocks * 12 + block_index * sizeof(ggml_half) }; + } + + static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } +}; + template <> struct block_q_t { struct traits { static constexpr uint32_t qk = QK_K; diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index d7770047424..4b58b09ab2c 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -85,6 +85,32 @@ static __dpct_inline__ int get_int_from_uint8_aligned( (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment } +static __dpct_inline__ int byte_sub_4(const int a, const int b) { + const uint32_t ua = static_cast(a); + const uint32_t ub = static_cast(b); + return static_cast(((ua | 0x80808080u) - ub) ^ 0x80808080u); +} + +static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq_scalar( + const int vl, const int vh, const int u0, const int u1, const int8_t sc0, + const int8_t sc1, const float d, const float d80, const float d81) { + static_assert(QR6_K == 2, "q6_K MMVQ scalar fast path assumes QR6_K == 2"); + + const int vil0 = (vl >> 0) & 0x0F0F0F0F; + const int vih0 = ((vh >> 0) << 4) & 0x30303030; + const int vi0 = byte_sub_4(vil0 | vih0, 0x20202020); + + const int vil1 = (vl >> 4) & 0x0F0F0F0F; + const int vih1 = ((vh >> 4) << 4) & 0x30303030; + const int vi1 = byte_sub_4(vil1 | vih1, 0x20202020); + + const float sumf = + d80 * (dpct::dp4a(vi0, u0, 0) * sc0) + + d81 * (dpct::dp4a(vi1, u1, 0) * sc1); + + return d * sumf; +} + static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, const uint8_t *values, int &val1, int &val2) { @@ -279,24 +305,8 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, const int *__restrict__ u, const int8_t *__restrict__ scales, const float &d, const float *__restrict__ d8) { - - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - const int sc = scales[4*i]; - - const int vil = (vl >> (4*i)) & 0x0F0F0F0F; - - const int vih = ((vh >> (4*i)) << 4) & 0x30303030; - - const int vi = dpct::vectorized_binary( - (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32 - - sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d*sumf; + return vec_dot_q6_K_q8_1_impl_mmvq_scalar( + vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]); } // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called @@ -384,6 +394,41 @@ template <> struct reorder_vec_dot_q_sycl { } }; +template <> struct reorder_vec_dot_q_sycl { + static constexpr ggml_type gtype = GGML_TYPE_Q3_K; + + using q3_k_block = ggml_sycl_reordered::block_q_t; + using q3_k_traits = typename q3_k_block::traits; + + __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair ibx_offset, + const std::pair d_offset, const int8_t * q8_1_quant_ptr, + const sycl::half2 * q8_1_ds, const int & iqs) { + const uint8_t * base = static_cast(vbq); + const uint8_t * qs = base + ibx_offset.first; + const uint8_t * hmask = base + ibx_offset.second; + const uint8_t * scales = base + d_offset.first; + const ggml_half d = *reinterpret_cast(base + d_offset.second); + + const int bq8_offset = QR3_K * (iqs / (QI3_K / 2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1 / 2); + + const int vl = get_int_from_uint8(qs, iqs); + const int vh = ~get_int_from_uint8(hmask, iqs % (QI3_K / 2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int8_t * quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1; + u[i] = get_int_from_int8_aligned(quant_base_ptr, iqs % QI8_1); + d8[i] = (*(q8_1_ds + bq8_offset + i))[0]; + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, scales, scale_offset, static_cast(d), d8); + } +}; + static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales, const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { @@ -542,23 +587,8 @@ template <> struct reorder_vec_dot_q_sycl { __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u, const int8_t * __restrict__ scales, const float d, const float * __restrict__ d8) { - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - const int sc = scales[4 * i]; - - const int vil = (vl >> (4 * i)) & 0x0F0F0F0F; - - const int vih = ((vh >> (4 * i)) << 4) & 0x30303030; - - const int vi = dpct::vectorized_binary((vil | vih), 0x20202020, - dpct::sub_sat()); // vi = (vil | vih) - 32 - - sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d * sumf; + return vec_dot_q6_K_q8_1_impl_mmvq_scalar( + vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]); } __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair ibx_offset, @@ -579,16 +609,15 @@ template <> struct reorder_vec_dot_q_sycl { const int8_t * scs = scales + scale_offset; - int u[QR6_K]; - float d8[QR6_K]; + const int u0 = get_int_from_int8_aligned( + q8_1_quant_ptr + bq8_offset * QK8_1, iqs % QI8_1); + const int u1 = get_int_from_int8_aligned( + q8_1_quant_ptr + (bq8_offset + 2) * QK8_1, iqs % QI8_1); + const float d80 = (*(q8_1_ds + bq8_offset + 0))[0]; + const float d81 = (*(q8_1_ds + bq8_offset + 2))[0]; -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1); - const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i); - d8[i] = ds_values[0]; - } - return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8); + return vec_dot_q6_K_q8_1_impl_mmvq_scalar( + vl, vh, u0, u1, scs[0], scs[4], *d, d80, d81); } }; #define VDR_Q4_0_Q8_1_MMVQ 2 @@ -1167,16 +1196,15 @@ vec_dot_q6_K_q8_1(const void *__restrict__ vbq, const int8_t * scales = bq6_K->scales + scale_offset; - int u[QR6_K]; - float d8[QR6_K]; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + 2 * i].ds[0]; - } + const int u0 = get_int_from_int8_aligned( + bq8_1[bq8_offset + 0].qs, iqs % QI8_1); + const int u1 = get_int_from_int8_aligned( + bq8_1[bq8_offset + 2].qs, iqs % QI8_1); + const float d80 = bq8_1[bq8_offset + 0].ds[0]; + const float d81 = bq8_1[bq8_offset + 2].ds[0]; - return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); + return vec_dot_q6_K_q8_1_impl_mmvq_scalar( + vl, vh, u0, u1, scales[0], scales[4], bq6_K->d, d80, d81); } diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 715a263a6d0..2d9e85794ad 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -8,6 +8,11 @@ endif() find_package(Vulkan COMPONENTS glslc REQUIRED) +if (DEFINED ENV{VULKAN_SDK}) + list(APPEND CMAKE_PREFIX_PATH "$ENV{VULKAN_SDK}") +endif() +find_package(SPIRV-Headers CONFIG REQUIRED) + if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") # Parallel build object files add_definitions(/MP) @@ -74,6 +79,12 @@ if (Vulkan_FOUND) "GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT" ) + test_shader_extension_support( + "GL_NV_cooperative_matrix_decode_vector" + "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp" + "GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT" + ) + test_shader_extension_support( "GL_EXT_integer_dot_product" "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/integer_dot.comp" diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 8c4cf9ef1db..e7d04634b8a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -21,6 +21,19 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); #include +// Fallback definitions for VK_NV_cooperative_matrix_decode_vector in case the +// installed Vulkan headers predate the extension. +#ifndef VK_NV_cooperative_matrix_decode_vector +#define VK_NV_cooperative_matrix_decode_vector 1 +#define VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME "VK_NV_cooperative_matrix_decode_vector" +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV ((VkStructureType)1000689000) +typedef struct VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV { + VkStructureType sType; + void* pNext; + VkBool32 cooperativeMatrixDecodeVector; +} VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV; +#endif + // SPIR-V Headers: different SDK installations expose different include paths. // LunarG Vulkan SDK on Windows typically provides . // Linux packages, MSYS2 and MinGW often use the Khronos layout . @@ -49,9 +62,10 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); #include #include #include -#include +#include #include #include +#include #include #if defined(_MSC_VER) @@ -146,8 +160,9 @@ struct vk_pipeline_struct { uint32_t align; // true if fields have been set by ggml_vk_create_pipeline bool initialized {}; - // set to true to request the pipeline is compiled - std::atomic needed {}; + // true while a compile is in flight, used to dedupe concurrent claims. + // Protected by device->compile_mutex. + bool compile_pending {}; // set to true when the shader has been compiled std::atomic compiled {}; // number of registers used, extracted from pipeline executable properties @@ -399,6 +414,7 @@ enum vk_conv_shapes { CONV_SHAPE_128x128, CONV_SHAPE_64x32, CONV_SHAPE_32x256, + CONV_SHAPE_64x128, CONV_SHAPE_COUNT, }; @@ -413,6 +429,7 @@ vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = { { 128, 128, 16 }, // CONV_SHAPE_128x128 { 64, 32, 32 }, // CONV_SHAPE_64x32 { 32, 256, 16 }, // CONV_SHAPE_32x256 + { 64, 128, 16 }, // CONV_SHAPE_64x128 }; enum dmmv_wg_sizes { @@ -448,14 +465,16 @@ struct vk_fa_pipeline_state { }; struct vk_conv2d_pipeline_state { - vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH) - : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH) {} + vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH, uint32_t aligned) + : s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH), aligned(aligned) {} uint32_t s0, s1, p0, p1, d0, d1, KW, KH; + // when set, shader can skip K/CRS/NPQ bounds checks and address clamps + uint32_t aligned; bool operator<(const vk_conv2d_pipeline_state &b) const { - return std::tie(s0, s1, p0, p1, d0, d1, KW, KH) < - std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH); + return std::tie(s0, s1, p0, p1, d0, d1, KW, KH, aligned) < + std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH, b.aligned); } }; @@ -500,6 +519,12 @@ static constexpr std::initializer_list topk_moe_late_softmax { GGM GGML_OP_GET_ROWS, GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE }; +// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder +// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion. +static constexpr std::initializer_list snake_pattern { GGML_OP_MUL, GGML_OP_SIN, + GGML_OP_SQR, GGML_OP_MUL, + GGML_OP_ADD }; + //node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ] //node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] //node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] @@ -596,6 +621,14 @@ static constexpr std::initializer_list> rms_norm_mul_rope_vie struct vk_device_struct { std::recursive_mutex mutex; + mutable std::shared_mutex pinned_memory_mutex; + + // Guards compile_pending, all_pipelines, and the dynamic pipeline maps + // (flash_attn, fa_mask_opt, solve_tri, conv2d, etc). The actual compile + // runs with no lock held, so different pipelines can compile in parallel. + // Lock order is device->mutex -> compile_mutex, never the reverse. + std::mutex compile_mutex; + std::condition_variable compile_cv; vk::PhysicalDevice physical_device; vk::PhysicalDeviceProperties properties; @@ -669,6 +702,8 @@ struct vk_device_struct { uint32_t coopmat_int_k; bool coopmat2; + bool coopmat2_bf16_support {}; + bool coopmat2_decode_vector; bool pipeline_executable_properties_support {}; @@ -759,9 +794,10 @@ struct vk_device_struct { vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_roll_f32; - vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32; - vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32; - vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32; + vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32; + vk_pipeline pipeline_repeat_i16; + vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_bf16_f32, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32; + vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_bf16_f32, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32; vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32; @@ -836,6 +872,7 @@ struct vk_device_struct { vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines]; vk_pipeline pipeline_topk_f32[num_topk_pipelines]; vk_pipeline pipeline_sum_rows_f32; + vk_pipeline pipeline_fwht_f32[4]; vk_pipeline pipeline_cumsum_f32; vk_pipeline pipeline_cumsum_small_f32; vk_pipeline pipeline_cumsum_multipass1_f32; @@ -847,6 +884,9 @@ struct vk_device_struct { vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_conv_transpose_1d_f32; + vk_pipeline pipeline_snake_f32; + vk_pipeline pipeline_snake_f16; + vk_pipeline pipeline_snake_bf16; vk_pipeline pipeline_pool2d_f32; vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; @@ -855,6 +895,8 @@ struct vk_device_struct { vk_pipeline pipeline_ssm_scan_f32_d128; vk_pipeline pipeline_ssm_scan_f32_d256; vk_pipeline pipeline_ssm_conv_f32; + vk_pipeline pipeline_ssm_conv_silu_f32; + vk_pipeline pipeline_ssm_conv_bias_silu_f32; vk_pipeline pipeline_opt_step_adamw_f32; vk_pipeline pipeline_opt_step_sgd_f32; std::map pipeline_conv2d_f32[CONV_SHAPE_COUNT]; @@ -1121,6 +1163,13 @@ struct vk_op_push_constants { float param4; }; +struct vk_op_fwht_push_constants { + uint32_t n_rows; + uint32_t src_offset; + uint32_t dst_offset; + float scale; +}; + struct vk_op_count_experts_push_constants { uint32_t ne00; uint32_t ne01; @@ -1353,6 +1402,8 @@ struct vk_op_rope_push_constants { uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t a_offset; + uint32_t d_offset; }; static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128"); @@ -1472,6 +1523,11 @@ struct vk_op_conv_transpose_1d_push_constants { int32_t s0; }; +struct vk_op_snake_push_constants { + uint32_t ne0; + uint32_t ne1; +}; + struct vk_op_pool2d_push_constants { uint32_t IW; uint32_t IH; uint32_t OW; uint32_t OH; @@ -1506,6 +1562,7 @@ struct vk_op_gated_delta_net_push_constants { uint32_t sb1, sb2, sb3; uint32_t neq1, rq3; float scale; + uint32_t K; }; struct vk_op_ssm_scan_push_constants { @@ -1681,7 +1738,7 @@ struct ggml_vk_garbage_collector { }; static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx); -static void ggml_vk_load_shaders(vk_device& device); +static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested = nullptr); static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx); static bool vk_memory_logger_enabled = false; @@ -2018,6 +2075,15 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk GGML_UNUSED(src3); } +template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_fwht_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + p.src_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + p.dst_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + struct ggml_backend_vk_buffer_context { vk_device_ref device; vk_buffer dev_buffer; @@ -2058,9 +2124,9 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); std::string type = device ? "device" : "host"; auto it = allocations.find(buf->buffer); - total_device -= device ? it->second : 0; - total_host -= device ? 0 : it->second; if (it != allocations.end()) { + total_device -= device ? it->second : 0; + total_host -= device ? 0 : it->second; VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); allocations.erase(it); } else { @@ -2139,10 +2205,135 @@ static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) { ctx->device->device.resetFences({ ctx->fence }); } -// variables to track number of compiles in progress -static uint32_t compile_count = 0; -static std::mutex compile_count_mutex; -static std::condition_variable compile_count_cond; +static constexpr uint32_t kSpvOpCooperativeMatrixLoadTensorNV = 5367; +static constexpr uint32_t kSpvCapabilityCooperativeMatrixDecodeVectorNV = 5447; +static constexpr uint32_t kSpvTensorAddressingDecodeVectorFuncBit = 0x4; + +// Remove SPV_NV_cooperative_matrix_decode_vector usage from a SPIR-V module so it +// can be loaded on drivers that only support SPV_NV_cooperative_matrix2. Drops the +// OpExtension declaration, the CooperativeMatrixDecodeVectorNV OpCapability, and the +// DecodeVectorFunc operand from any OpCooperativeMatrixLoadTensorNV instruction. +// Returns true when the input used the extension (and `out` was populated with a +// stripped copy); returns false otherwise without touching `out`. +static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count, std::vector & out) { + static const char kDecodeVectorExt[] = "SPV_NV_cooperative_matrix_decode_vector"; + + if (word_count < 5) { + return false; + } + + bool uses_decode_vector = false; + for (size_t pos = 5; pos < word_count; ) { + uint32_t word = code[pos]; + uint32_t wc = word >> spv::WordCountShift; + uint32_t op = word & spv::OpCodeMask; + GGML_ASSERT(wc > 0 && pos + wc <= word_count); + if (op == spv::OpExtension && wc >= 2) { + const char * s = reinterpret_cast(&code[pos + 1]); + if (strcmp(s, kDecodeVectorExt) == 0) { + uses_decode_vector = true; + break; + } + } + pos += wc; + } + + if (!uses_decode_vector) { + return false; + } + + VK_LOG_DEBUG("ggml_vk_strip_decode_vector: stripping SPV_NV_cooperative_matrix_decode_vector"); + + // Bulk-copy unchanged runs and only break the run when an instruction needs to + // be dropped or patched. Use reserve + insert/push_back so the destination buffer + // is touched exactly once (no zero-initialization pass from resize()). + out.clear(); + out.reserve(word_count); + + size_t run_start = 0; + auto flush_run = [&](size_t up_to) { + if (up_to > run_start) { + out.insert(out.end(), code + run_start, code + up_to); + } + }; + + for (size_t pos = 5; pos < word_count; ) { + uint32_t word = code[pos]; + uint32_t wc = word >> spv::WordCountShift; + uint32_t op = word & spv::OpCodeMask; + GGML_ASSERT(wc > 0 && pos + wc <= word_count); + + if (op == spv::OpExtension && wc >= 2) { + const char * s = reinterpret_cast(&code[pos + 1]); + if (strcmp(s, kDecodeVectorExt) == 0) { + flush_run(pos); + pos += wc; + run_start = pos; + continue; + } + } + + if (op == spv::OpCapability && wc == 2 && code[pos + 1] == kSpvCapabilityCooperativeMatrixDecodeVectorNV) { + flush_run(pos); + pos += wc; + run_start = pos; + continue; + } + + if (op == kSpvOpCooperativeMatrixLoadTensorNV) { + // [opcode/wc][ResultType][Result][Pointer][Object][TensorLayout][MemOperand mask][mem extras...][TA mask][ta extras...] + GGML_ASSERT(wc >= 8); + + uint32_t mem_mask = code[pos + 6]; + size_t cur = pos + 7; + // Each of these MemoryAccess bits (when set) carries one trailing operand. + cur += (mem_mask & 0x2) ? 1 : 0; // Aligned + cur += (mem_mask & 0x8) ? 1 : 0; // MakePointerAvailable + cur += (mem_mask & 0x10) ? 1 : 0; // MakePointerVisible + cur += (mem_mask & 0x10000) ? 1 : 0; // AliasScopeINTELMask + cur += (mem_mask & 0x20000) ? 1 : 0; // NoAliasINTELMask + GGML_ASSERT(cur < pos + wc); + + uint32_t ta_mask = code[cur]; + if ((ta_mask & kSpvTensorAddressingDecodeVectorFuncBit) == 0) { + pos += wc; + continue; // leave instruction inside the current unchanged run + } + + flush_run(pos); + + // Append unchanged prefix of the instruction (header through the mem-extras). + size_t inst_start = out.size(); + size_t pre_n = cur - pos; + out.insert(out.end(), code + pos, code + pos + pre_n); + + // Emit TA mask with the DecodeVectorFunc bit cleared. + out.push_back(ta_mask & ~kSpvTensorAddressingDecodeVectorFuncBit); + + // TA extras: TensorView (0x1) and DecodeFunc (0x2) are kept verbatim; + // DecodeVectorFunc (0x4) is dropped along with its trailing id operand. + size_t keep_ta_extras = ((ta_mask & 0x1) ? 1 : 0) + ((ta_mask & 0x2) ? 1 : 0); + if (keep_ta_extras) { + out.insert(out.end(), code + cur + 1, code + cur + 1 + keep_ta_extras); + } + + GGML_ASSERT(wc == pre_n + 1 + keep_ta_extras + 1); + + // Patch the instruction header with the new (one-shorter) word count. + uint32_t new_wc = wc - 1; + out[inst_start] = (new_wc << spv::WordCountShift) | op; + + pos += wc; + run_start = pos; + continue; + } + + pos += wc; + } + + flush_run(word_count); + return true; +} static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, std::array wg_denoms, std::vector specialization_constants, @@ -2215,6 +2406,18 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data()); } +#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) + if (device->coopmat2 && !device->coopmat2_decode_vector) { + const uint32_t * src = spirv.empty() ? reinterpret_cast(spv_data) : spirv.data(); + size_t src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size(); + std::vector stripped; + if (ggml_vk_strip_decode_vector(src, src_n, stripped)) { + spirv = std::move(stripped); + shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data()); + } + } +#endif + pipeline->shader_module = device->device.createShaderModule(shader_module_create_info); vk::PushConstantRange pcr( @@ -2296,7 +2499,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin std::cerr << "ggml_vulkan: " << e.what() << std::endl; throw e; } - pipeline->compiled = true; if (vk_instance.debug_utils_support) { vk::DebugUtilsObjectNameInfoEXT duoni; @@ -2345,14 +2547,13 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin } } - device->all_pipelines.push_back(pipeline); - { - std::lock_guard guard(compile_count_mutex); - assert(compile_count > 0); - compile_count--; + std::lock_guard guard(device->compile_mutex); + device->all_pipelines.push_back(pipeline); + pipeline->compiled = true; + pipeline->compile_pending = false; } - compile_count_cond.notify_all(); + device->compile_cv.notify_all(); } static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { @@ -2368,8 +2569,7 @@ static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")"); ctx->pipeline_descriptor_set_requirements += n; if (!pipeline->compiled) { - pipeline->needed = true; - ggml_vk_load_shaders(ctx->device); + ggml_vk_load_shaders(ctx->device, pipeline); } ggml_pipeline_allocate_descriptor_sets(ctx); } @@ -2943,7 +3143,7 @@ struct vk_fa_tuning_params { }; static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type); -static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc); +static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16); static vk_fa_tuning_params get_fa_tuning_params_scalar(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc) { @@ -3083,6 +3283,13 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_ FaCodePath path = device->coopmat2 ? FA_COOPMAT2 : device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR; + if (path == FA_COOPMAT2 && k_type == GGML_TYPE_BF16 && !device->coopmat2_bf16_support) { + path = FA_COOPMAT1; + } + if (path == FA_COOPMAT1 && k_type == GGML_TYPE_BF16 && !device->coopmat_bf16_support) { + path = FA_SCALAR; + } + if (path == FA_COOPMAT1 && device->architecture == vk_device_architecture::NVIDIA_TURING) { // Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090 path = FA_SCALAR; @@ -3092,7 +3299,7 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_ bool shape_ok = (f32acc && device->coopmat_support_16x16x16_f32acc) || (!f32acc && device->coopmat_support_16x16x16_f16acc); const vk_fa_tuning_params params = get_fa_tuning_params_coopmat1(device, hsk, hsv, n_rows, n_kv, k_type, v_type, f32acc); - bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc); + bool shmem_ok = ggml_vk_flash_attn_coopmat_shmem_support(device, params, hsk, hsv, f32acc, k_type); if (!shape_ok || !shmem_ok) { path = FA_SCALAR; @@ -3138,8 +3345,8 @@ static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const static std::vector get_fa_spec_constants(const vk_fa_pipeline_state& state) { const auto fa_block_bytes = [](ggml_type t) -> uint32_t { - // decodeBufF32 uses a block of vec4s for a better memory access pattern. - return t == GGML_TYPE_F32 ? 16u : (uint32_t) ggml_type_size(t); + if (t == GGML_TYPE_F32) return 16u; + return (uint32_t) ggml_type_size(t); }; return { /* 0 WorkGroupSize */ state.workgroup_size, @@ -3361,10 +3568,26 @@ static bool ggml_vk_fa_scalar_uses_mmq(const vk_device& device, ggml_type k_type #endif } -static void ggml_vk_load_shaders(vk_device& device) { +// load_shaders walks the pipeline list under compile_mutex and either claims +// the requested pipeline for compilation or, if another thread is already +// compiling it, drops the lock and waits on compile_cv. Compiles themselves +// run unlocked. +struct CompileTask { + vk_pipeline pipeline; + size_t spv_size; + const void * spv_data; + std::string entrypoint; + uint32_t parameter_count; + std::array wg_denoms; + std::vector specialization_constants; + bool disable_robustness; + bool require_full_subgroups; + uint32_t required_subgroup_size; +}; + +static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); - std::lock_guard guard(device->mutex); // some shaders have a minimum subgroup size const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u); const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u); @@ -3394,6 +3617,15 @@ static void ggml_vk_load_shaders(vk_device& device) { l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms; uint32_t l_align, m_align, s_align; + + vk_pipeline wait_pipeline; + CompileTask claimed_task {}; + bool has_claimed_task = false; + + // The rest of the walk reads and writes shared device state, so hold the + // lock until we're done deciding what to compile. + std::unique_lock compile_lock(device->compile_mutex); + if (device->coopmat2) { // spec constants and tile sizes for non-quant matmul/matmul_id l_warptile = { 256, 128, 256, 64, 1 }; @@ -3579,7 +3811,6 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_matmul_id_bf16 = std::make_shared(); } - std::vector> compiles; auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { @@ -3613,23 +3844,33 @@ static void ggml_vk_load_shaders(vk_device& device) { #endif } - if (!pipeline->needed || pipeline->compiled) { + // We only care about the pipeline this call asked for; the rest + // (including the 64-bit indexing variant) are handled by their + // own request_descriptor_sets / load_shaders calls. + if (pipeline.get() != requested.get()) { continue; } - // TODO: We're no longer benefitting from the async compiles (shaders are - // compiled individually, as needed) and this complexity can be removed. - { - // wait until fewer than N compiles are in progress - uint32_t N = std::max(1u, std::thread::hardware_concurrency()); - std::unique_lock guard(compile_count_mutex); - while (compile_count >= N) { - compile_count_cond.wait(guard); - } - compile_count++; + + if (pipeline->compiled) { + continue; } - compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint, - parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size)); + wait_pipeline = pipeline; + + if (!pipeline->compile_pending) { + pipeline->compile_pending = true; + claimed_task.pipeline = pipeline; + claimed_task.spv_size = spv_size; + claimed_task.spv_data = spv_data; + claimed_task.entrypoint = entrypoint; + claimed_task.parameter_count = parameter_count; + claimed_task.wg_denoms = wg_denoms; + claimed_task.specialization_constants = specialization_constants; + claimed_task.disable_robustness = disable_robustness; + claimed_task.require_full_subgroups = require_full_subgroups; + claimed_task.required_subgroup_size = required_subgroup_size; + has_claimed_task = true; + } } }; @@ -3653,10 +3894,16 @@ static void ggml_vk_load_shaders(vk_device& device) { const uint32_t fa_sgs = fa.first.subgroup_size; const bool fa_ds = fa.first.subgroup_size == 0; + const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; const bool use_mmq = ggml_vk_fa_scalar_uses_mmq(device, fa.first.k_type); const void * spv_data = nullptr; size_t spv_size = 0; - if (use_mmq) { + const char *name = nullptr; + if (bf16_kv) { + spv_data = flash_attn_f32_f16_fp32_data; + spv_size = flash_attn_f32_f16_fp32_len; + name = aligned ? "flash_attn_f32_bf16_aligned" : "flash_attn_f32_bf16"; + } else if (use_mmq) { #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) if (device->fp16) { if (f32acc) { spv_data = flash_attn_f32_f16_int8_data; spv_size = flash_attn_f32_f16_int8_len; } @@ -3666,6 +3913,7 @@ static void ggml_vk_load_shaders(vk_device& device) { spv_size = flash_attn_f32_f16_fp32_int8_len; } #endif + name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16"; } else { if (device->fp16) { if (f32acc) { spv_data = flash_attn_f32_f16_data; spv_size = flash_attn_f32_f16_len; } @@ -3674,8 +3922,8 @@ static void ggml_vk_load_shaders(vk_device& device) { spv_data = flash_attn_f32_f16_fp32_data; spv_size = flash_attn_f32_f16_fp32_len; } + name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16"; } - const char *name = aligned ? "flash_attn_f32_f16_aligned" : "flash_attn_f32_f16"; ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, @@ -3693,11 +3941,25 @@ static void ggml_vk_load_shaders(vk_device& device) { const uint32_t fa_sgs = fa.first.subgroup_size; const bool fa_ds = fa.first.subgroup_size == 0; + const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; + const void * spv_data; size_t spv_size; - if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data; spv_size = flash_attn_f32_f16_cm1_len; } - else { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; } - const char *name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1"; + const char *name; + if (bf16_kv) { +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (!device->coopmat_bf16_support) continue; + spv_data = flash_attn_f32_f16_bf16_cm1_data; + spv_size = flash_attn_f32_f16_bf16_cm1_len; + name = aligned ? "flash_attn_f32_bf16_aligned_cm1" : "flash_attn_f32_bf16_cm1"; +#else + continue; +#endif + } else { + if (f32acc) { spv_data = flash_attn_f32_f16_cm1_data; spv_size = flash_attn_f32_f16_cm1_len; } + else { spv_data = flash_attn_f32_f16_f16acc_cm1_data; spv_size = flash_attn_f32_f16_f16acc_cm1_len; } + name = aligned ? "flash_attn_f32_f16_aligned_cm1" : "flash_attn_f32_f16_cm1"; + } ggml_vk_create_pipeline(device, fa.second, name, spv_size, spv_data, "main", 7, sizeof(vk_flash_attn_push_constants), {Br, 1, 1}, get_fa_spec_constants(fa.first), aligned ? Bc : 1, true, @@ -3715,10 +3977,20 @@ static void ggml_vk_load_shaders(vk_device& device) { const bool aligned = fa.first.aligned; const bool f32acc = fa.first.f32acc; + const bool bf16_kv = fa.first.k_type == GGML_TYPE_BF16; const void * spv_data; size_t spv_size; const char * name; - if (aligned) { + if (bf16_kv) { +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (!device->coopmat2_bf16_support) continue; + spv_data = flash_attn_f32_f16_bf16_cm2_data; + spv_size = flash_attn_f32_f16_bf16_cm2_len; + name = aligned ? "flash_attn_f32_bf16_aligned_cm2" : "flash_attn_f32_bf16_cm2"; +#else + continue; +#endif + } else if (aligned) { if (f32acc) { spv_data = flash_attn_f32_f16_cm2_data; spv_size = flash_attn_f32_f16_cm2_len; name = "flash_attn_f32_f16_aligned_f32acc_cm2"; } else { spv_data = flash_attn_f32_f16_f16acc_cm2_data; spv_size = flash_attn_f32_f16_f16acc_cm2_len; name = "flash_attn_f32_f16_aligned_f16acc_cm2"; } } else { @@ -4568,6 +4840,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f32, "cpy_f16_f32", cpy_f16_f32_len, cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_cpy_bf16_f32,"cpy_bf16_f32",cpy_bf16_f32_len,cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_i32_f32, "cpy_i32_f32", cpy_i32_f32_len, cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_i32, "cpy_f32_i32", cpy_f32_i32_len, cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -4576,6 +4849,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f32, "contig_cpy_f16_f32", contig_cpy_f16_f32_len, contig_cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_bf16_f32,"contig_cpy_bf16_f32",contig_cpy_bf16_f32_len,contig_cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -4684,9 +4958,11 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_repeat_i32, "repeat_i32", repeat_i32_len, repeat_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + #define CREATE_UNARY(name) \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \ ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); @@ -4799,6 +5075,16 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + // Intel Arc B390 was observed segfaulting with this shader. + if (device->subgroup_basic && device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL) { + int idx = 0; + for (uint32_t n : {64, 128, 256, 512}) { + if (device->subgroup_size <= n) { + ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_f32", fwht_f32_len, fwht_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { device->subgroup_size, n }, 1, true, true, device->subgroup_size); + } + ++idx; + } + } const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4; ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size); @@ -4839,6 +5125,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); @@ -4900,7 +5190,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true); } - ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 3, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 0}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_silu_f32, "ssm_conv_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 1}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_bias_silu_f32, "ssm_conv_bias_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 1, 1}, 1); ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); @@ -4908,7 +5200,8 @@ static void ggml_vk_load_shaders(vk_device& device) { // conv2d, conv_transpose_2d for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) { - uint32_t conv2d_WG_SIZE = 256; + // smaller WG for the small-tile fallback gives more concurrent WGs per SM + uint32_t conv2d_WG_SIZE = (s == CONV_SHAPE_64x32) ? 128 : 256; uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices. uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8; uint32_t conv2d_SHMEM_PAD = 4; @@ -4947,18 +5240,77 @@ static void ggml_vk_load_shaders(vk_device& device) { conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used. } - uint32_t conv2d_shmem_req = - (conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float); - if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) { + // cm1 is used only when cm2 is unavailable; capped at 64x128 (due to shared memory size). + // Requires 16x16x16 f16-acc since that's the fragment shape hard-coded in the shader. + // Subgroup size must be 32 or 64 (to keep WG_SIZE sane) and we need + // subgroup_size_control to force the driver to actually use it. + bool conv2d_use_cm1 = false; +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + conv2d_use_cm1 = !device->coopmat2 && + device->coopmat_support && device->coopmat_support_16x16x16_f16acc && + device->subgroup_size_control && + (device->subgroup_size == 32 || device->subgroup_size == 64) && + s != CONV_SHAPE_128x128; +#endif + + const uint32_t conv2d_cm1_shmem_pad = 8; + + auto shmem_req = [&](uint32_t pad, bool csh_store, bool fp16_shmem) { + const uint32_t elem_size = fp16_shmem ? (uint32_t)sizeof(uint16_t) : (uint32_t)sizeof(float); + const uint32_t csh_elems = csh_store ? conv2d_BS.K * conv2d_BS.NPQ : 0u; + return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size; + }; + + // coopmat1 needs to store the output through shared memory, so check up front + // whether it'll fit and disable it before applying coopmat1 parameters. + if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) { + conv2d_use_cm1 = false; + } + + uint32_t conv2d_WM = 16, conv2d_WN = 16; // cm1 subgroup tile, ignored otherwise + if (conv2d_use_cm1) { + conv2d_SHMEM_PAD = conv2d_cm1_shmem_pad; + // 16x16x16 fragments; pick WM/WN to keep WG_SIZE at 256 + // (i.e. 8 subgroups for sg=32, 4 subgroups for sg=64). + const bool sg64 = (device->subgroup_size == 64); + switch (s) { + case CONV_SHAPE_64x32: conv2d_WM = sg64 ? 32 : 16; conv2d_WN = 16; break; + case CONV_SHAPE_64x128: conv2d_WM = 32; conv2d_WN = sg64 ? 64 : 32; break; + case CONV_SHAPE_32x256: conv2d_WM = sg64 ? 16 : 32; conv2d_WN = sg64 ? 128 : 32; break; + default: break; + } + const uint32_t warps_M = conv2d_BS.K / conv2d_WM; + const uint32_t warps_N = conv2d_BS.NPQ / conv2d_WN; + conv2d_WG_SIZE = warps_M * warps_N * device->subgroup_size; + } + + // stage cm2 accumulator through shmem for coalesced global stores; + // skipped on 128x128 where the extra Csh footprint hurts occupancy. + // cm1 always uses the staged path. + uint32_t conv2d_csh_store = (device->coopmat2 && s != CONV_SHAPE_128x128) ? 1u : 0u; + if (conv2d_use_cm1) { + conv2d_csh_store = 1; + } + + // shmem is fp16 on cm2/cm1 (matches Csh), fp32 on scalar + const bool conv2d_use_fp16_shmem = device->coopmat2 || conv2d_use_cm1; + + // shrink CRS if the non-cm1 config still doesn't fit + if (device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_SHMEM_PAD, conv2d_csh_store, conv2d_use_fp16_shmem)) { + GGML_ASSERT(!conv2d_use_cm1); conv2d_BS.CRS = 8; if (use_collectives) { conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS); } + conv2d_csh_store = 0; } std::array wg_denoms = { conv2d_BS.K, 1, 1 }; std::vector spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD }; + // cm1 needs a fixed subgroup width to match the WG_SIZE we computed + const uint32_t conv2d_required_subgroup_size = conv2d_use_cm1 ? device->subgroup_size : 0; + #define CREATE_CONV(name, type_suffix, spv_suffix) \ for (auto &c : device->pipeline_##name##type_suffix[s]) { \ const vk_conv2d_pipeline_state &state = c.first; \ @@ -4971,10 +5323,14 @@ static void ggml_vk_load_shaders(vk_device& device) { spec_constants_cpy.push_back(state.d1); \ spec_constants_cpy.push_back(state.KW); \ spec_constants_cpy.push_back(state.KH); \ + spec_constants_cpy.push_back(state.aligned); \ + spec_constants_cpy.push_back(conv2d_csh_store); \ + spec_constants_cpy.push_back(conv2d_WM); \ + spec_constants_cpy.push_back(conv2d_WN); \ ggml_vk_create_pipeline( \ device, c.second, #name #type_suffix, \ name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \ - sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \ + sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives || conv2d_required_subgroup_size, conv2d_required_subgroup_size); \ } #define CREATE_CONVS(spv_suffix) \ CREATE_CONV(conv2d, _f32, spv_suffix) \ @@ -4985,6 +5341,11 @@ static void ggml_vk_load_shaders(vk_device& device) { if (device->coopmat2) { CREATE_CONVS(_cm2) } else +#endif +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (conv2d_use_cm1) { + CREATE_CONVS(_cm1) + } else #endif if (conv2d_UNROLL) { CREATE_CONVS(_unroll) @@ -5006,8 +5367,25 @@ static void ggml_vk_load_shaders(vk_device& device) { } } - for (auto &c : compiles) { - c.wait(); + // Drop compile_mutex so other threads can walk while we compile. + compile_lock.unlock(); + + // Compile what we claimed; create_pipeline_func reacquires compile_mutex + // at the end to flip compile_pending/compiled and notify waiters. + if (has_claimed_task) { + auto & task = claimed_task; + ggml_vk_create_pipeline_func(device, task.pipeline, task.spv_size, task.spv_data, + task.entrypoint, task.parameter_count, task.wg_denoms, + task.specialization_constants, task.disable_robustness, + task.require_full_subgroups, task.required_subgroup_size); + } + + // Another thread may be compiling the pipeline we need; block on it here. + if (wait_pipeline) { + std::unique_lock wait_lock(device->compile_mutex); + device->compile_cv.wait(wait_lock, [&] { + return wait_pipeline->compiled.load(); + }); } } @@ -5057,6 +5435,7 @@ static vk_device ggml_vk_get_device(size_t idx) { bool amd_shader_core_properties2 = false; bool pipeline_robustness = false; bool coopmat2_support = false; + bool coopmat2_decode_vector_support = false; bool pipeline_executable_properties_support = false; device->coopmat_support = false; device->integer_dot_product = false; @@ -5091,6 +5470,9 @@ static vk_device ggml_vk_get_device(size_t idx) { !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; #endif + } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) { + coopmat2_decode_vector_support = true; #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { @@ -5368,6 +5750,14 @@ static vk_device ggml_vk_get_device(size_t idx) { } #endif + VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; + coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; + if (coopmat2_decode_vector_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + device_extensions.push_back(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME); + } + #if defined(VK_KHR_shader_bfloat16) VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {}; bfloat16_features.pNext = nullptr; @@ -5487,46 +5877,73 @@ static vk_device ggml_vk_get_device(size_t idx) { found_fp16_256 = false, found_fp32_128 = false, found_fp32_256 = false; + bool found_bf16_128 = false, + found_bf16_256 = false; // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128 // with 32x16x16 and 256 with 32x32x16. for (auto &prop : flexible_dimensions) { if (prop.saturatingAccumulation == VK_FALSE && - prop.scope == VK_SCOPE_WORKGROUP_KHR && - prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && - prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { - - if (prop.workgroupInvocations == 128 && - prop.MGranularity <= 32 && - prop.NGranularity <= 16 && - prop.KGranularity <= 16) { - if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { - found_fp16_128 = true; + prop.scope == VK_SCOPE_WORKGROUP_KHR) { + + if (prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + + if (prop.workgroupInvocations == 128 && + prop.MGranularity <= 32 && + prop.NGranularity <= 16 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_128 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_128 = true; + } } - if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { - found_fp32_128 = true; + if (prop.workgroupInvocations == 256 && + prop.MGranularity <= 32 && + prop.NGranularity <= 32 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_256 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_256 = true; + } } } - if (prop.workgroupInvocations == 256 && - prop.MGranularity <= 32 && - prop.NGranularity <= 32 && - prop.KGranularity <= 16) { - if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { - found_fp16_256 = true; + +#if defined(VK_KHR_shader_bfloat16) && defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) + if (prop.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR && + prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + + if (prop.workgroupInvocations == 128 && + prop.MGranularity <= 32 && + prop.NGranularity <= 16 && + prop.KGranularity <= 16) { + found_bf16_128 = true; } - if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && - prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { - found_fp32_256 = true; + if (prop.workgroupInvocations == 256 && + prop.MGranularity <= 32 && + prop.NGranularity <= 32 && + prop.KGranularity <= 16) { + found_bf16_256 = true; } } +#endif } } if (found_fp16_128 && found_fp16_256 && found_fp32_128 && found_fp32_256 && coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) { device->coopmat2 = true; + device->coopmat2_bf16_support = found_bf16_128 && found_bf16_256; + device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; } } #endif @@ -5742,8 +6159,12 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_load_shaders(device); - // Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled - const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !allow_graphics_queue; + // Prefer a dedicated transfer queue on AMD dGPUs (non-GCN) when graphics queue use is disabled. + const bool prefers_transfer_queue = + device->vendor_id == VK_VENDOR_ID_AMD && + device->architecture != AMD_GCN && + !device->uma && + !allow_graphics_queue; if (!device->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; @@ -5809,6 +6230,7 @@ static void ggml_vk_print_gpu_info(size_t idx) { bool fp16_compute = false; bool coopmat_support = false; bool coopmat2_support = false; + bool coopmat2_decode_vector_support = false; bool integer_dot_product = false; bool bfloat16_support = false; @@ -5827,6 +6249,9 @@ static void ggml_vk_print_gpu_info(size_t idx) { !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; #endif + } else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) { + coopmat2_decode_vector_support = true; #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { @@ -5911,6 +6336,13 @@ static void ggml_vk_print_gpu_info(size_t idx) { } #endif + VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {}; + coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV; + if (coopmat2_decode_vector_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features; + } + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); fp16 = fp16 && vk12_features.shaderFloat16; @@ -5935,7 +6367,14 @@ static void ggml_vk_print_gpu_info(size_t idx) { #endif && ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture); - std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none"; + coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector; +#if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) + coopmat2_decode_vector_support = false; +#endif + + std::string matrix_cores = coopmat2_support ? (coopmat2_decode_vector_support ? "NV_coopmat2v" : "NV_coopmat2") + : coopmat_support ? "KHR_coopmat" + : "none"; std::string device_name = props2.properties.deviceName.data(); GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n", @@ -6625,7 +7064,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) { return nullptr; } - std::lock_guard guard(device->mutex); + std::lock_guard guard(device->pinned_memory_mutex); device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); return buf->ptr; @@ -6636,7 +7075,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { return; } VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")"); - std::lock_guard guard(device->mutex); + std::lock_guard guard(device->pinned_memory_mutex); vk_buffer buf; size_t index; @@ -6660,7 +7099,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { } static void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { - std::lock_guard guard(device->mutex); + std::shared_lock guard(device->pinned_memory_mutex); buf = nullptr; buf_offset = 0; for (size_t i = 0; i < device->pinned_memory.size(); i++) { @@ -6781,13 +7220,6 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx->s->buffer->buf.dispatch(wg0, wg1, wg2); } -static void ggml_vk_end_submission(vk_submission& s, std::vector wait_semaphores, std::vector signal_semaphores) { - s.buffer->buf.end(); - - s.wait_semaphores = std::move(wait_semaphores); - s.signal_semaphores = std::move(signal_semaphores); -} - static void ggml_vk_ctx_end(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")"); if (ctx->s == nullptr) { @@ -6940,7 +7372,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; for (uint64_t i0 = 0; i0 < ne0; i0++) { - slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 }); + slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 }); } } } @@ -7538,6 +7970,13 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_cpy_f32_bf16; } } + if (src->type == GGML_TYPE_BF16 && to == GGML_TYPE_F32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_bf16_f32; + } else { + return ctx->device->pipeline_cpy_bf16_f32; + } + } if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_I32) { if (contig) { return ctx->device->pipeline_contig_cpy_f32_i32; @@ -7951,8 +8390,10 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ return false; } - // General performance issue with q3_k and q6_k due to 2-byte alignment - if (src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { + // q6_k only has 2-byte alignment which makes it somewhat problematic, + // using MMVQ is only a win on Intel. + bool mmvq_q6 = device->vendor_id == VK_VENDOR_ID_INTEL; + if (src0_type == GGML_TYPE_Q6_K && !mmvq_q6) { return false; } @@ -7964,7 +8405,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ // Quantization overhead is not worth it for small k switch (device->vendor_id) { case VK_VENDOR_ID_NVIDIA: - if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) { + if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) { return true; } @@ -7991,9 +8432,16 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ return true; } case VK_VENDOR_ID_INTEL: + if (device->architecture == vk_device_architecture::INTEL_XE2) { + if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { + return true; + } + } + if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) { - // Intel Windows proprietary driver MMVQ performance is worse than fp16, see - // https://github.com/ggml-org/llama.cpp/issues/17628 + // Intel Windows proprietary driver MMVQ performance for !Q2/Q3/Q6 is worse than fp16, + // see https://github.com/ggml-org/llama.cpp/issues/17628 and + // https://github.com/ggml-org/llama.cpp/pull/23056 return false; } @@ -8441,6 +8889,68 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); } +static int ggml_vk_fwht_pipeline_idx(int64_t n) { + switch (n) { + case 64: return 0; + case 128: return 1; + case 256: return 2; + case 512: return 3; + default: return -1; + } +} + +static bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst) { + if (ctx->num_additional_fused_ops != 0) { + return false; + } + + if (ggml_get_op_params_i32(dst, 1) != GGML_HINT_SRC0_IS_HADAMARD) { + return false; + } + + const int idx = ggml_vk_fwht_pipeline_idx(src1->ne[0]); + if (idx < 0 || ctx->device->pipeline_fwht_f32[idx] == nullptr) { + return false; + } + + if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + return false; + } + + if (!ggml_is_contiguous(src1)) { + return false; + } + GGML_ASSERT(ggml_is_contiguous(dst)); + + return true; +} + +static void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst) { + const int idx = ggml_vk_fwht_pipeline_idx(src->ne[0]); + vk_pipeline pipeline = ctx->device->pipeline_fwht_f32[idx]; + + const uint32_t rows_per_workgroup = 4; + const uint32_t n_rows = (uint32_t)ggml_nrows(src); + const uint32_t max_workgroups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0]; + + const uint32_t total_workgroups = CEIL_DIV(n_rows, rows_per_workgroup); + const uint32_t workgroups_x = std::min(total_workgroups, max_workgroups_x); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + const vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src, true); + const vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true); + + vk_op_fwht_push_constants pc = { + n_rows, + 0, + 0, + 1.0f / std::sqrt((float)src->ne[0]), + }; + init_pushconst_tensor_offsets(ctx, pc, src, nullptr, nullptr, nullptr, dst); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, { workgroups_x, 1, 1 }); +} + static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; @@ -8474,6 +8984,8 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c m_offset += cur_M_size; } + } else if (ggml_vk_can_use_fwht(ctx, src1, dst)) { + ggml_vk_fwht(ctx, subctx, src1, dst); } else if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 && // detect 0213 permutation, and batch size of 1 src0->nb[0] <= src0->nb[2] && @@ -9057,7 +9569,8 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con const uint32_t Br = params.block_rows; const uint32_t Bc = params.block_cols; - const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); + // BF16 uses the fp32 shader (FLOAT_TYPE=float) + const uint32_t float_type_size = (device->fp16 && k_type != GGML_TYPE_BF16) ? sizeof(ggml_fp16_t) : sizeof(float); const bool mmq = ggml_vk_fa_scalar_uses_mmq(device, k_type); @@ -9098,7 +9611,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con return supported; } -static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc) { +static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type) { // Needs to be kept up to date on shader changes const uint32_t Br = params.block_rows; const uint32_t Bc = params.block_cols; @@ -9128,8 +9641,10 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co const uint32_t vsh_stride = MatBc / 4 * row_split; const uint32_t ksh = ((kvshstride >= vsh_stride) ? (Bc * kvshstride) : (Bc * vsh_stride)) * f16vec4; + // BF16 PVMat accumulator is f32 (no bf16 accumulator support), so pvsh is vec4 (16 bytes) + const uint32_t pvsh_elem_size = (k_type == GGML_TYPE_BF16) ? 16u : f16vec4; const uint32_t osh_stride = params.row_split * MatBr / 4; - const uint32_t pvsh = MatBc * osh_stride * f16vec4; + const uint32_t pvsh = MatBc * osh_stride * pvsh_elem_size; const uint32_t slope = Br * acctype; @@ -9198,7 +9713,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx uint32_t workgroups_y = (uint32_t)neq2; uint32_t workgroups_z = (uint32_t)neq3; - const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32; + const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16; // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga. // For coopmat2 FA, we always use the small size (which is still pretty large for gqa). @@ -9259,7 +9774,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_pipeline pipeline = nullptr; { - std::lock_guard guard(ctx->device->mutex); + std::lock_guard guard(ctx->device->compile_mutex); auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16; auto it = pipelines.find(fa_pipeline_state); if (it != pipelines.end()) { @@ -9323,13 +9838,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_pipeline pipeline_fa_mask_opt = nullptr; if (use_mask_opt) { - std::lock_guard guard(ctx->device->mutex); - auto &pipelines = ctx->device->pipeline_fa_mask_opt; - auto it = pipelines.find({Br, Bc}); - if (it != pipelines.end()) { - pipeline_fa_mask_opt = it->second; - } else { - pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared(); + { + std::lock_guard guard(ctx->device->compile_mutex); + auto &pipelines = ctx->device->pipeline_fa_mask_opt; + auto it = pipelines.find({Br, Bc}); + if (it != pipelines.end()) { + pipeline_fa_mask_opt = it->second; + } else { + pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared(); + } } assert(pipeline_fa_mask_opt); ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1); @@ -9440,10 +9957,23 @@ static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, u // so small convolutions will still choose a smaller tile. const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32; - if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) { + // 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile. + bool allow_128x128 = true; +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) { + allow_128x128 = false; + } +#endif + + if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) { return CONV_SHAPE_128x128; } else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) { return CONV_SHAPE_32x256; + } else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) { + return CONV_SHAPE_64x128; + } else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) { + // cm1 fallback for large K when 128x128 isn't available + return CONV_SHAPE_64x128; } else { return CONV_SHAPE_64x32; } @@ -9615,7 +10145,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return nullptr; case GGML_OP_REPEAT: if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { - return ctx->device->pipeline_repeat_f32; + return ctx->device->pipeline_repeat_i32; + } + if (ggml_type_size(src0->type) == 2 && ggml_type_size(dst->type) == 2) { + return ctx->device->pipeline_repeat_i16; } return nullptr; case GGML_OP_REPEAT_BACK: @@ -9847,7 +10380,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const vk_pipeline pipeline = nullptr; { - std::lock_guard guard(ctx->device->mutex); + std::lock_guard guard(ctx->device->compile_mutex); auto it = ctx->device->pipeline_solve_tri_f32.find(solve_tri_pipeline_state); if (it != ctx->device->pipeline_solve_tri_f32.end()) { pipeline = it->second; @@ -9936,7 +10469,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return nullptr; case GGML_OP_SSM_CONV: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_ssm_conv_f32; + switch (ctx->num_additional_fused_ops) { + case 0: return ctx->device->pipeline_ssm_conv_f32; + case 1: return ctx->device->pipeline_ssm_conv_silu_f32; + case 2: return ctx->device->pipeline_ssm_conv_bias_silu_f32; + default: return nullptr; + } } return nullptr; case GGML_OP_OPT_STEP_ADAMW: @@ -9970,7 +10508,18 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0; uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1; uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1; - vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH); + + // tile-aligned shapes let the shader skip bounds checks + const uint32_t Cin = (uint32_t)src1->ne[2]; + const uint32_t CRS = Cin * KW * KH; + const uint32_t BS_K = vk_conv_block_sizes[shape].K; + const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS; + const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ; + const uint32_t aligned = ((K % BS_K == 0) && + (CRS % BS_CRS == 0) && + (NPQ % BS_NPQ == 0)) ? 1u : 0u; + + vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned); std::map *pipelines = nullptr; if (op == GGML_OP_CONV_2D) { @@ -9990,7 +10539,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const vk_pipeline pipeline = nullptr; { - std::lock_guard guard(ctx->device->mutex); + std::lock_guard guard(ctx->device->compile_mutex); auto it = pipelines->find(conv2d_pipeline_state); if (it != pipelines->end()) { pipeline = it->second; @@ -10117,6 +10666,15 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk GGML_UNUSED(src3); } +template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_rope_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + p.a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + p.d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_UNUSED(src1); + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + template static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; @@ -10767,6 +11325,7 @@ static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& s const ggml_tensor * src_q = dst->src[0]; const ggml_tensor * src_v = dst->src[2]; const ggml_tensor * src_beta = dst->src[4]; + const ggml_tensor * src_state = dst->src[5]; GGML_ASSERT(dst->buffer != nullptr); @@ -10775,6 +11334,9 @@ static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& s const uint32_t n_tokens = (uint32_t)src_v->ne[2]; const uint32_t n_seqs = (uint32_t)src_v->ne[3]; + // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count. + const uint32_t K = (uint32_t)src_state->ne[1]; + const uint32_t s_off = S_v * H * n_tokens * n_seqs; vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); @@ -10808,7 +11370,8 @@ static void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& s sv1, sv2, sv3, sb1, sb2, sb3, neq1, rq3, - scale + scale, + K }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, @@ -10872,11 +11435,28 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, pc, elements); } -static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; +static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { + ggml_tensor * conv = cgraph->nodes[node_idx]; + const ggml_tensor * src0 = conv->src[0]; + const ggml_tensor * src1 = conv->src[1]; + + // Pick the destination tensor (last node in the fused chain) and the optional bias. + // Fusion modes: 0 = ssm_conv, 1 = ssm_conv+silu, 2 = ssm_conv+add(bias)+silu. + ggml_tensor * dst = conv; + const ggml_tensor * bias = nullptr; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SSM_CONV, { + if (ctx->num_additional_fused_ops == 1) { + dst = cgraph->nodes[node_idx + 1]; // silu + } else if (ctx->num_additional_fused_ops == 2) { + ggml_tensor * add = cgraph->nodes[node_idx + 1]; + bias = (add->src[0] == conv) ? add->src[1] : add->src[0]; + dst = cgraph->nodes[node_idx + 2]; // silu + } + + // The shader always declares 4 bindings; bind src0 as a dummy when bias isn't fused. + const ggml_tensor * src2 = bias ? bias : src0; + + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SSM_CONV, { (uint32_t)src0->nb[1], (uint32_t)src0->nb[2], (uint32_t)src1->nb[1], (uint32_t)dst->nb[0], (uint32_t)dst->nb[1], (uint32_t)dst->nb[2], @@ -11239,6 +11819,7 @@ static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor * (uint32_t)src0->ne[2], nb01, nb02, nb03, nb11, nb12, nb13, + 0, 0, // a_offset, d_offset filled in by init_pushconst_tensor_offsets }; return rope; @@ -11334,6 +11915,11 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, GGML_ASSERT(buf[i] != nullptr); } + // a_offset is unused (the fused path reads from shared memory), but the rope/set_rows dst can be misaligned. + // Round the binding offset down to the storage buffer alignment; the in-element shift goes in pc.rope.d_offset. + pc.rope.d_offset = get_misalign_bytes(ctx, tensors[5]) / ggml_type_size(tensors[5]->type); + offset[5] &= ~(size_t(ctx->device->properties.limits.minStorageBufferOffsetAlignment) - 1); + std::array elements; elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] }; @@ -12053,6 +12639,45 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p)); } +// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b. +// Match the naive mul -> sin -> sqr -> mul -> add chain and run the +// dedicated kernel directly. The pattern is validated by +// ggml_vk_can_fuse_snake before this call. +static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { + const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; + const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; + const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; + ggml_tensor * add = cgraph->nodes[node_idx + 4]; + + // x carries the full activation shape, a is the broadcast operand + const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; + const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; + + // mul1 reads sqr and inv_b in either operand order + const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; + + vk_pipeline pipeline = nullptr; + switch (x->type) { + case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break; + case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break; + case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break; + default: GGML_ABORT("unsupported type"); + } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); + vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a); + vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add); + + vk_op_snake_push_constants pc{}; + pc.ne0 = static_cast(x->ne[0]); + pc.ne1 = static_cast(x->ne[1]); + + std::array elements = { pc.ne0, pc.ne1, 1 }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements); +} + static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { uint32_t op = static_cast(dst->op_params[0]); const int32_t k1 = dst->op_params[1]; @@ -13261,7 +13886,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_MUL: - ggml_vk_mul(ctx, compute_ctx, src0, src1, node); + if (ctx->num_additional_fused_ops) { + ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx); + } else { + ggml_vk_mul(ctx, compute_ctx, src0, src1, node); + } break; case GGML_OP_DIV: @@ -13551,7 +14180,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_SSM_CONV: - ggml_vk_ssm_conv(ctx, compute_ctx, node); + ggml_vk_ssm_conv(ctx, compute_ctx, cgraph, node_idx); break; @@ -13939,12 +14568,6 @@ static const char * ggml_backend_vk_host_buffer_type_name(ggml_backend_buffer_ty UNUSED(buft); } -static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffer) { - return GGML_VK_NAME "_Host"; - - UNUSED(buffer); -} - static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(vk_instance.devices[0], buffer->context); @@ -14448,6 +15071,62 @@ static bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct g return true; } +// Match SSM_CONV + UNARY(SILU) or SSM_CONV + ADD + UNARY(SILU). num_extra is 1 or 2. +static bool ggml_vk_can_fuse_ssm_conv(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, + int node_idx, int num_extra) { + const ggml_tensor * conv = cgraph->nodes[node_idx]; + if (conv->op != GGML_OP_SSM_CONV) { + return false; + } + + const ggml_tensor * silu = nullptr; + const ggml_tensor * bias = nullptr; + + if (num_extra == 1) { + if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_UNARY })) { + return false; + } + silu = cgraph->nodes[node_idx + 1]; + } else if (num_extra == 2) { + if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_ADD, GGML_OP_UNARY })) { + return false; + } + const ggml_tensor * add = cgraph->nodes[node_idx + 1]; + silu = cgraph->nodes[node_idx + 2]; + bias = (add->src[0] == conv) ? add->src[1] : add->src[0]; + + if (bias->type != GGML_TYPE_F32 || !ggml_is_contiguous(bias)) { + return false; + } + // bias must be channel-wise (one element per channel of the conv output) + if (ggml_nelements(bias) != conv->ne[0] || bias->ne[0] != conv->ne[0]) { + return false; + } + if (add->type != GGML_TYPE_F32) { + return false; + } + // The shader doesn't apply per-tensor offsets, so reject misaligned bias. + if (get_misalign_bytes(ctx, bias) != 0) { + return false; + } + } else { + return false; + } + + if (ggml_get_unary_op(silu) != GGML_UNARY_OP_SILU) { + return false; + } + if (conv->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) { + return false; + } + // The shader writes to the fused dst using its own strides, but the push constants don't + // carry a per-tensor offset, so the binding must be naturally aligned. + if (get_misalign_bytes(ctx, silu) != 0) { + return false; + } + return true; +} + static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, topk_moe_mode mode) { @@ -14578,6 +15257,65 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const return true; } +// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add. +// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that +// the broadcast operands a and inv_b share a [1, C] layout. +static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { + GGML_UNUSED(ctx); + if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) { + return false; + } + + const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; + const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1]; + const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; + const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; + const ggml_tensor * add = cgraph->nodes[node_idx + 4]; + + const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; + const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; + + const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; + const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0]; + + if (x_in_add != x) { + return false; + } + if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) { + return false; + } + // Shader bindings: data_a is A_TYPE so it follows x's precision, while + // data_b and data_c are hardcoded float, so the broadcast operands must + // be F32 regardless of x's type. + if (a->type != GGML_TYPE_F32) return false; + if (inv_b->type != GGML_TYPE_F32) return false; + // Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline). + if (mul0->type != x->type) return false; + if (sin_node->type != x->type) return false; + if (sqr->type != x->type) return false; + if (mul1->type != x->type) return false; + if (add->type != x->type) return false; + if (!ggml_are_same_shape(a, inv_b)) { + return false; + } + if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) { + return false; + } + // Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b + // must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader. + if (x->ne[2] != 1 || x->ne[3] != 1) return false; + if (add->ne[2] != 1 || add->ne[3] != 1) return false; + if (a->ne[2] != 1 || a->ne[3] != 1) return false; + if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false; + // Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1], + // so every operand must be contiguous. + if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) || + !ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) { + return false; + } + return true; +} + // Check whether the tensors overlap in memory. // Fusions can potentially overwrite src tensors in ways that are not prevented // by ggml-alloc. If the fusion src is being applied in a way that's elementwise @@ -14864,6 +15602,19 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // they are overwritten, and one workgroup per row. So close enough. op_srcs_fused_elementwise[0] = true; op_srcs_fused_elementwise[1] = true; + } else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 2)) { + ctx->num_additional_fused_ops = 2; + fusion_string = "SSM_CONV_BIAS_SILU"; + // ssm_conv reads multiple input tokens per output, so it's not elementwise w.r.t. its srcs. + // The downstream add and silu are elementwise on the conv output. + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; + op_srcs_fused_elementwise[2] = true; + } else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 1)) { + ctx->num_additional_fused_ops = 1; + fusion_string = "SSM_CONV_SILU"; + op_srcs_fused_elementwise[0] = false; + op_srcs_fused_elementwise[1] = true; } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) { @@ -14872,6 +15623,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg op_srcs_fused_elementwise[0] = false; op_srcs_fused_elementwise[1] = false; op_srcs_fused_elementwise[2] = false; + } else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) { + ctx->num_additional_fused_ops = 4; + fusion_string = "SNAKE"; + // elementwise=true: snake.comp is safe under exact aliasing because each + // thread reads data_x[idx] into a register before writing data_d[idx] + // with a data dependency on that register. The overlap check still + // rejects partial overlaps (different base or size). + std::fill_n(op_srcs_fused_elementwise, 5, true); } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) && ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) && ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) { @@ -15162,6 +15921,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * if (keep_pattern(topk_moe_late_softmax)) { continue; } + if (keep_pattern(snake_pattern)) { + continue; + } // First, grab the next unused node. current_set.push_back(first_unused); @@ -15184,7 +15946,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * if (match_pattern(topk_moe_early_softmax_norm, j) || match_pattern(topk_moe_sigmoid_norm_bias, j) || match_pattern(topk_moe_early_softmax, j) || - match_pattern(topk_moe_late_softmax, j)) { + match_pattern(topk_moe_late_softmax, j) || + match_pattern(snake_pattern, j)) { continue; } bool ok = true; @@ -15195,7 +15958,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT && graph->nodes[j]->op == GGML_OP_ADD) && !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_ADD_ID) && !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_MUL) && - !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_ADD && graph->nodes[j]->op == GGML_OP_ADD)) { + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_ADD && graph->nodes[j]->op == GGML_OP_ADD) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_ADD) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_UNARY)) { ok = false; break; } @@ -15278,6 +16043,19 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * } } } + // SSM_CONV + ADD + UNARY: pull the consuming UNARY forward + if (j > 0 && + graph->nodes[j]->op == GGML_OP_ADD && + graph->nodes[j-1]->op == GGML_OP_SSM_CONV) { + for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { + if (graph->nodes[k]->op == GGML_OP_UNARY && + graph->nodes[k]->src[0] == graph->nodes[j]) { + current_set.push_back(k); + used[k] = true; + break; + } + } + } } } // Second pass grabs view nodes. @@ -15742,6 +16520,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm switch (t) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_0: @@ -15757,6 +16536,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (!fa_kv_ok(op->src[1]->type) || !fa_kv_ok(op->src[2]->type)) { return false; } + if ((op->src[1]->type == GGML_TYPE_BF16) != (op->src[2]->type == GGML_TYPE_BF16)) { + return false; + } if (!coopmat2 && !(device->subgroup_shuffle && device->subgroup_vote)) { // scalar/coopmat1 FA uses subgroupShuffle/subgroupAll return false; @@ -15842,6 +16624,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (src1_type == GGML_TYPE_F32) { switch (src0_type) { case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -15878,7 +16661,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return false; } case GGML_OP_REPEAT: - return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float); + return ggml_type_size(op->type) == ggml_type_size(op->src[0]->type) && + (ggml_type_size(op->type) == sizeof(float) || ggml_type_size(op->type) == 2); case GGML_OP_REPEAT_BACK: return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ROPE: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index e1f613fb4f6..10a9ea21025 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -11,6 +11,10 @@ if (GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) message(STATUS "Enabling coopmat2 glslc support") endif() +if (GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) + add_compile_definitions(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT) + message(STATUS "Enabling coopmat2 decode_vector glslc support") +endif() if (GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) message(STATUS "Enabling dot glslc support") diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp index ca1a3ac25bd..b3b182fb084 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp @@ -19,7 +19,9 @@ void main() { if (idx + (num_iter-1)*num_threads < p.ne) { [[unroll]] for (uint i = 0; i < num_iter; ++i) { -#if defined(DATA_D_BF16) +#if defined(DATA_A_BF16) + data_d[get_doffset() + idx] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + idx]))); +#elif defined(DATA_D_BF16) float f = float(data_a[get_aoffset() + idx]); data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f)); #elif !defined(OPTIMIZATION_ERROR_WORKAROUND) @@ -35,7 +37,9 @@ void main() { continue; } -#if defined(DATA_D_BF16) +#if defined(DATA_A_BF16) + data_d[get_doffset() + idx] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + idx]))); +#elif defined(DATA_D_BF16) float f = float(data_a[get_aoffset() + idx]); data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f)); #elif !defined(OPTIMIZATION_ERROR_WORKAROUND) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 875c012cd3b..1428ef68d81 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -7,6 +7,13 @@ #extension GL_KHR_memory_scope_semantics : enable #endif +#ifdef COOPMAT +#extension GL_KHR_cooperative_matrix : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_KHR_memory_scope_semantics : enable +#endif + #ifdef USE_COLLECTIVES # extension GL_KHR_shader_subgroup_shuffle : enable #endif @@ -77,6 +84,39 @@ layout(constant_id = 12) const uint d1 = 1; // Kernel spatial sizes layout(constant_id = 13) const uint KW = 1; layout(constant_id = 14) const uint KH = 1; +// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned) +layout(constant_id = 15) const uint aligned = 0; +// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this. +layout(constant_id = 16) const uint csh_store = 0; + +#ifdef COOPMAT +// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of +// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN == +// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size. +layout(constant_id = 17) const uint WM = 32; +layout(constant_id = 18) const uint WN = 32; +const uint TM = 16; +const uint TN = 16; +const uint TK = 16; +const uint cms_per_row = WM / TM; +const uint cms_per_col = WN / TN; +const uint warps_M = BS_K / WM; +const uint warps_N = BS_NPQ / WN; +#endif + +// without padding, H_idx/W_idx are in bounds by construction (non-TRANSPOSE only) +#ifdef TRANSPOSE +const bool hw_in_bounds = false; +#else +const bool hw_in_bounds = (p0 == 0) && (p1 == 0); +#endif + +// TRANSPOSE stride alignment is trivially satisfied for stride 1 +#ifdef TRANSPOSE +const bool stride_in_bounds = (s0 == 1) && (s1 == 1); +#else +const bool stride_in_bounds = true; +#endif uint32_t tid = gl_LocalInvocationID.x; const uint32_t WG_SIZE = gl_WorkGroupSize.x; @@ -94,7 +134,7 @@ uint32_t n_elems_out = K * NPQ; // Number of blocktiles per input uint32_t NB_CRS = splitWork(CRS, BS_CRS); -#ifdef COOPMAT2 +#if defined(COOPMAT2) || defined(COOPMAT) #define SHMEM_TYPE float16_t #else #define SHMEM_TYPE float @@ -112,6 +152,17 @@ const uint32_t Bsh_len = BS_CRS * Bsh_stride; shared SHMEM_TYPE Ash[Ash_len]; // K x CRS shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ +#if defined(COOPMAT2) || defined(COOPMAT) +// stage matC through shmem so global stores are row-major (NPQ-contiguous) +const uint32_t Csh_stride = BS_NPQ; +#ifdef COOPMAT +const uint32_t Csh_len = BS_K * Csh_stride; +#else +const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1; +#endif +shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ +#endif + // Threadtile sizes const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K; @@ -161,7 +212,7 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW; uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW; uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3; - if (K_idx < K && NPQ_idx < NPQ) { + if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) { dst_data[dst_idx] = D_TYPE(elem); } return elem; @@ -176,6 +227,13 @@ void main() { #ifdef COOPMAT2 coopmat matC; matC = coopmat(0.0); +#elif defined(COOPMAT) + coopmat sums[cms_per_row * cms_per_col]; + [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) { + sums[i] = coopmat(0.0); + } + const uint warp_r = gl_SubgroupID / warps_N; + const uint warp_c = gl_SubgroupID % warps_N; #else float regC[TS_K][TS_NPQ]; for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { @@ -228,12 +286,15 @@ void main() { uint32_t B_lx = Ac; uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/ #ifdef TRANSPOSE - uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1); + uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03; #else - uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1); + uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03; #endif + if (aligned == 0) { + knl_idx = min(knl_idx, K * CRS - 1); + } float val = knl_data[knl_idx]; - if (K_idx >= K || CRS_idx_a >= CRS) { + if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) { val = 0.0; } Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val); @@ -282,15 +343,27 @@ void main() { uint32_t H_idx = OH_idx * s1 + KH_idx_b * d1 - p1; uint32_t W_idx = OW_idx * s0 + KW_idx_b * d0 - p0; #endif - uint32_t src_idx = - min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1); + uint32_t src_idx = W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13; + // skip clamp when address can't go OOB + if (aligned == 0 || !hw_in_bounds || !stride_in_bounds) { + src_idx = min(max(src_idx, 0), p.Cin * p.N * p.W * p.H - 1); + } float val = src_data[src_idx]; - if (CRS_idx_b >= CRS || NPQ_idx >= NPQ - || H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case) + bool oob = false; + if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) { + oob = true; + } + // also catches lower-bound underflow (idx wraps to 0x80000000+) + if (!hw_in_bounds && (H_idx >= p.H || W_idx >= p.W)) { + oob = true; + } #ifdef TRANSPOSE - || (H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0) + if (!stride_in_bounds && + ((H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0))) { + oob = true; + } #endif - ) { + if (oob) { val = 0.0; } Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); @@ -303,6 +376,23 @@ void main() { coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor); coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor); matC = coopMatMulAdd(matA, matB, matC); +#elif defined(COOPMAT) + // each subgroup multiplies its grid of fragments per TK-sized CRS chunk + [[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) { + coopmat cache_a[cms_per_row]; + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK; + coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor); + } + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + coopmat cache_b; + const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN; + coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor); + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]); + } + } + } #else if (T_y * TS_K < K) { UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) { @@ -325,8 +415,51 @@ void main() { barrier(); } /* Save C* */ +#if defined(COOPMAT2) || defined(COOPMAT) + // stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores +#ifdef COOPMAT + const bool use_staged_store = true; +#else + const bool use_staged_store = (csh_store != 0); +#endif + if (use_staged_store) { +#ifdef COOPMAT + // cm1: each subgroup stores its fragment grid into its Csh slot + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN; + coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor); + } + } +#else + coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor); +#endif + barrier(); + + // cooperative shmem->global: WG threads spread across BS_NPQ (the + // contiguous direction of dst), each iter covers store_rows_per_iter K-rows + const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ; + const uint32_t store_iters = BS_K / store_rows_per_iter; + const uint32_t k_thread_offset = tid / BS_NPQ; + const uint32_t npq_thread = tid % BS_NPQ; + [[unroll]] for (uint32_t i = 0; i < store_iters; i++) { + uint32_t k_local = i * store_rows_per_iter + k_thread_offset; + uint32_t K_idx = B_idx_K * BS_K + k_local; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread; + uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); + uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); + uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW; + uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3; + if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) { + dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]); + } + } + } #ifdef COOPMAT2 - coopMatPerElementNV(matC, matC, perElemOpStore); + else { + coopMatPerElementNV(matC, matC, perElemOpStore); + } +#endif #else if (T_y * TS_K < K) { for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { @@ -337,7 +470,7 @@ void main() { uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW; uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW; uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3; - if (K_idx < K && NPQ_idx < NPQ) { + if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) { dst_data[dst_idx] = regC[T_ly][T_lx]; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp index 9f8bfd3c182..d55e13253a8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp @@ -12,7 +12,9 @@ void main() { return; } -#if defined(DATA_D_BF16) +#if defined(DATA_A_BF16) + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + src0_idx(idx)]))); +#elif defined(DATA_D_BF16) float f = float(data_a[get_aoffset() + src0_idx(idx)]); data_d[get_doffset() + dst_idx(idx)] = D_TYPE(fp32_to_bf16(f)); #elif !defined(OPTIMIZATION_ERROR_WORKAROUND) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl index 88d07d2dfd5..e67299fdeca 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl @@ -5,21 +5,60 @@ #include "types.glsl" #if defined(DATA_A_F32) +FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) { + return data_a[a_offset + ib]; +} vec2 dequantize(uint ib, uint iqs, uint a_offset) { return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]); } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + return vec4(data_a[a_offset + ib ], data_a[a_offset + ib + 1], + data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]); +} +vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) { + return vec4(data_a[a_offset + ib ], data_a[a_offset + ib + 1], + data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]); +} + #endif #if defined(DATA_A_F16) +FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) { + return data_a[a_offset + ib]; +} vec2 dequantize(uint ib, uint iqs, uint a_offset) { return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]); } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + return vec4(data_a[a_offset + ib ], data_a[a_offset + ib + 1], + data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]); +} +vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) { + const vec2 a = data_a_packed32[(a_offset + ib)/2]; + const vec2 b = data_a_packed32[(a_offset + ib)/2 + 1]; + return vec4(a, b); +} #endif #if defined(DATA_A_BF16) +FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) { + return bf16_to_fp32(data_a[a_offset + ib]); +} vec2 dequantize(uint ib, uint iqs, uint a_offset) { return vec2(bf16_to_fp32(data_a[a_offset + ib]), bf16_to_fp32(data_a[a_offset + ib + 1])); } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + return vec4(bf16_to_fp32(data_a[a_offset + ib ]), bf16_to_fp32(data_a[a_offset + ib + 1]), + bf16_to_fp32(data_a[a_offset + ib + 2]), bf16_to_fp32(data_a[a_offset + ib + 3])); +} +vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) { + const uint a = data_a_packed32[(a_offset + ib)/2]; + const uint b = data_a_packed32[(a_offset + ib)/2 + 1]; + return vec4(uintBitsToFloat((a & 0x0000ffff) << 16), + uintBitsToFloat( a & 0xffff0000), + uintBitsToFloat((b & 0x0000ffff) << 16), + uintBitsToFloat( b & 0xffff0000)); +} #endif #if defined(DATA_A_Q4_0) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl index c582aba87dc..7171cbfa559 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.glsl @@ -1,4 +1,12 @@ +// Each format defines a scalar dequantFunc plus a V=4 dequantFunc_v +// passed as the optional vector decoder to coopMatLoadTensorNV via +// GL_NV_cooperative_matrix_decode_vector. When the driver doesn't support +// the extension, ggml-vulkan.cpp strips it from the compiled SPIR-V. +#ifdef GL_NV_cooperative_matrix_decode_vector +#extension GL_NV_cooperative_matrix_decode_vector : enable +#endif + #include "types.glsl" layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufF32 { @@ -25,6 +33,19 @@ float16_t dequantFuncQ1_0(const in decodeBufQ1_0 bl, const in uint blockCoords[2 return bit != 0u ? d : -d; } +f16vec4 dequantFuncQ1_0_v(const in decodeBufQ1_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const float16_t md = -d; + const uint idx = coordInBlock[1]; + const uint qs_nib = uint(bl.block.qs[idx >> 3]) >> (idx & 0x4u); + return f16vec4( + (qs_nib & 1u) != 0u ? d : md, + (qs_nib & 2u) != 0u ? d : md, + (qs_nib & 4u) != 0u ? d : md, + (qs_nib & 8u) != 0u ? d : md); +} + layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 { block_q4_0_packed16 block; }; @@ -42,10 +63,28 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ4_0_v(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; // 0 or 4 + const uint qs_i = (idx & 0xE) >> 1; // even, in {0,2,4,6} + const uint qsw = uint32_t(bl.block.qs[qs_i ]) + | (uint32_t(bl.block.qs[qs_i + 1u]) << 16); + // shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte. + const uint q4 = (qsw >> shift) & 0x0F0F0F0Fu; + const u8vec4 q = unpack8(q4); + return f16vec4((vec4(q) - vec4(8.0)) * vec4(float(d))); +} + layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 { block_q4_1 block; }; +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1_packed32 { + block_q4_1_packed32 block; +}; + float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { const float16_t d = bl.block.d; @@ -60,10 +99,27 @@ float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ4_1_v(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ4_1_packed32 bl32 = decodeBufQ4_1_packed32(bl); + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; // 0 or 4 + const uint qs_w = (idx & 0xC) >> 2; // iqs / 4 in [0,4) + const uint qsw = uint32_t(bl32.block.qs[qs_w]); + const u8vec4 q = unpack8((qsw >> shift) & 0x0F0F0F0Fu); + return f16vec4(vec4(q) * vec4(float(d)) + vec4(float(m))); +} + layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 { block_q5_0 block; }; +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0_packed16 { + block_q5_0_packed16 block; +}; + float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { const float16_t d = bl.block.d; @@ -82,10 +138,32 @@ float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ5_0_v(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ5_0_packed16 bl16 = decodeBufQ5_0_packed16(bl); + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; // 0 or 4 + const uint qs_i = (idx & 0xC) >> 1; // packed16 word index, in {0,2,4,6} + const uint qsw = uint32_t(bl16.block.qs[qs_i ]) + | (uint32_t(bl16.block.qs[qs_i + 1u]) << 16); + const u8vec4 ql = unpack8((qsw >> shift) & 0x0F0F0F0Fu); + + const uint uint_qh = uint(bl16.block.qh[1]) << 16 | uint(bl16.block.qh[0]); + const uint qh_pack = uint_qh >> idx; // bits 0..3 = element idx..idx+3 high bits + const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u; + + return f16vec4((vec4(ql) + vec4(qh_high) - vec4(16.0)) * vec4(float(d))); +} + layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 { block_q5_1 block; }; +layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1_packed32 { + block_q5_1_packed32 block; +}; + float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { const float16_t d = bl.block.d; @@ -105,6 +183,23 @@ float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ5_1_v(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ5_1_packed32 bl32 = decodeBufQ5_1_packed32(bl); + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; // 0 or 4 + const uint qs_w = (idx & 0xC) >> 2; // iqs / 4 in [0,4) + const uint qsw = uint32_t(bl32.block.qs[qs_w]); + const u8vec4 ql = unpack8((qsw >> shift) & 0x0F0F0F0Fu); + + const uint qh_pack = bl.block.qh >> idx; // bits 0..3 = element idx..idx+3 high bits + const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u; + + return f16vec4((vec4(ql) + vec4(qh_high)) * vec4(float(d)) + vec4(float(m))); +} + layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 { block_q8_0_packed16 block; }; @@ -121,6 +216,17 @@ float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ8_0_v(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint base = idx >> 1u; + const uint w = uint(uint16_t(bl.block.qs[base])) + | (uint(uint16_t(bl.block.qs[base + 1u])) << 16u); + const i8vec4 qi = unpack8(int32_t(w)); + return f16vec4(vec4(qi) * vec4(float(d))); +} + layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K { block_q2_K block; }; @@ -129,6 +235,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2 block_q2_K_packed16 block; }; +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K_packed32 { + block_q2_K_packed32 block; +}; + float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl); @@ -147,10 +257,36 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ2_K_v(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ2_K_packed32 bl32 = decodeBufQ2_K_packed32(bl); + const f16vec2 dm = bl.block.dm; + const uint idx = coordInBlock[1]; + + const uint scalesi = idx >> 4; // 0..15 + const uint qsshift = (idx & 0x60) >> 4; // 0,2,4,6 + + // qs_i (packed16) = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1) is even for idx % 4 == 0, + // so qs_w (packed32) = qs_i / 2 = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2). + const uint qs_w = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2); + const uint qsw = uint32_t(bl32.block.qs[qs_w]); + const uint qs4 = (qsw >> qsshift) & 0x03030303u; + const u8vec4 qi = unpack8(qs4); + + const uint scales = bl.block.scales[scalesi]; + const float16_t d_sub = dm.x * float16_t(scales & 0xF); + const float16_t m_sub = dm.y * float16_t(scales >> 4); + return f16vec4(vec4(qi) * vec4(float(d_sub)) - vec4(float(m_sub))); +} + layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K { block_q3_K block; }; +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K_packed16 { + block_q3_K_packed16 block; +}; + float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { const uint idx = coordInBlock[1]; @@ -179,6 +315,47 @@ float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ3_K_v(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ3_K_packed16 bl16 = decodeBufQ3_K_packed16(bl); + const uint idx = coordInBlock[1]; + + const uint n = idx >> 7; // 0,1 + const uint is = idx >> 4; // 0..15 + const uint halfsplit = (idx & 0x60) >> 5; // 0,1,2,3 + const uint qsshift = halfsplit << 1; // 0,2,4,6 + const uint hbit = (n << 2) + halfsplit; // 0..7 (bit position in hmask byte) + + uint32_t scaleidx0 = (is < 8) ? is : (is - 8); + uint32_t scaleidx0shift = (is < 8) ? 0u : 4u; + uint32_t scaleidx1 = is + 8 - (is / 4) * 4; + uint32_t scaleidx1shift = (is / 4) * 2; + + const int8_t us = int8_t( + ((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) | + (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4)); + const float16_t dl = bl.block.d * float16_t(int(us) - 32); + + // For idx % 4 == 0: (idx & 0x1F) == (idx & 0x1C) is a multiple of 4. + const uint qsi = (n << 5) + (idx & 0x1Cu); + const uint hmi = (idx & 0x1Cu); + + // Two adjacent uint16 packed16 reads, combined into a uint32 in registers. + // After this: byte j of qsw / hmw holds the data for element idx+j. + const uint qsw = uint32_t(bl16.block.qs[qsi >> 1]) + | (uint32_t(bl16.block.qs[(qsi >> 1) + 1u]) << 16); + const uint hmw = uint32_t(bl16.block.hmask[hmi >> 1]) + | (uint32_t(bl16.block.hmask[(hmi >> 1) + 1u]) << 16); + + // qsshift in {0,2,4,6} and hbit in {0..7}: per-byte masks isolate the wanted bits + // with no inter-byte leakage. + const uint ql4 = (qsw >> qsshift) & 0x03030303u; + const uint qh4 = (hmw >> hbit) & 0x01010101u; + + const ivec4 q = ivec4(unpack8(ql4 | (qh4 << 2))) - ivec4(4); + return f16vec4(vec4(q) * vec4(float(dl))); +} + layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K { block_q4_K block; }; @@ -187,6 +364,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4 block_q4_K_packed16 block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed32 { + block_q4_K_packed32 block; +}; + layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128 { block_q4_K_packed128 block; }; @@ -334,6 +515,55 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2 return float16_t(ret); } +f16vec4 dequantFuncQ4_K_v(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ4_K_packed32 bl32 = decodeBufQ4_K_packed32(bl); + decodeBufQ4_K_packed128 bl128 = decodeBufQ4_K_packed128(bl); + const uint idx = coordInBlock[1]; + + const uint is = idx >> 5; // 0..7 + +#if defined(IS_MUL_MM2) && defined(DATA_A_Q4_K) + vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)]; + float d = v.x; + float m = v.y; +#else + uvec4 v = bl128.block.q4k[0]; + const vec2 loadd = vec2(unpackFloat2x16(v.x)); + + uint32_t sc; + uint32_t mbyte; + + uint32_t scale0 = v.y; + uint32_t scale4 = v.z; + uint32_t scale8 = v.w; + + uint32_t sc_lo = scale0; + uint32_t mb_lo = scale4; + uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2); + uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2); + + sc = is < 4 ? sc_lo : sc_hi; + mbyte = is < 4 ? mb_lo : mb_hi; + sc = sc >> (8 * (is & 3)); + mbyte = mbyte >> (8 * (is & 3)); + sc &= 0x3F; + mbyte &= 0x3F; + + const float d = loadd.x * float(sc); + const float m = loadd.y * float(mbyte); +#endif + + // idx in [0,256); vector decode uses idx a multiple of 4. packed32 word index: + // (qs_i >> 1) == (idx >> 6) * 8 + ((idx & 0x1E) >> 2). sh is 0 or 4 only, so a + // single (w >> sh) & 0x0F0F0F0F isolates all four nibbles without inter-byte leakage. + const uint sh = (idx & 0x20u) >> 3u; + const uint w = uint32_t(bl32.block.qs[(idx >> 6) * 8u + ((idx & 0x1Eu) >> 2)]); + const u8vec4 q = unpack8((w >> sh) & 0x0F0F0F0Fu); + + return f16vec4(vec4(d) * vec4(q) - vec4(m)); +} + layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K { block_q5_K block; }; @@ -346,6 +576,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5 block_q5_K_packed128 block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed32 { + block_q5_K_packed32 block; +}; + float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl); @@ -399,6 +633,58 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2 return float16_t(ret); } +f16vec4 dequantFuncQ5_K_v(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ5_K_packed32 bl32 = decodeBufQ5_K_packed32(bl); + decodeBufQ5_K_packed128 bl128 = decodeBufQ5_K_packed128(bl); + const uint idx = coordInBlock[1]; + const uint is = idx >> 5; + +#if defined(IS_MUL_MM2) && defined(DATA_A_Q5_K) + vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)]; + float d = v.x; + float m = v.y; +#else + uvec4 v = bl128.block.q5k[0]; + + const f16vec2 loadd = unpackFloat2x16(v.x); + + uint32_t sc; + uint32_t mbyte; + + uint32_t scale0 = v.y; + uint32_t scale4 = v.z; + uint32_t scale8 = v.w; + + uint32_t sc_lo = scale0; + uint32_t mb_lo = scale4; + uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2); + uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2); + + sc = is < 4 ? sc_lo : sc_hi; + mbyte = is < 4 ? mb_lo : mb_hi; + sc = sc >> (8 * (is & 3)); + mbyte = mbyte >> (8 * (is & 3)); + sc &= 0x3F; + mbyte &= 0x3F; + + const float16_t d = loadd.x * float16_t(sc); + const float16_t m = loadd.y * float16_t(mbyte); +#endif + + // sh is 0 or 4; mask 0x0F0F0F0F covers the four nibbles regardless (no inter-byte leakage). + const uint sh = (idx & 0x20u) >> 3u; + const uint qs_w = (idx >> 6) * 8u + ((idx & 0x1Eu) >> 2); + const uint qh_w = (idx & 0x1Eu) >> 2; + + const uint ql4 = (uint32_t(bl32.block.qs[qs_w]) >> sh) & 0x0F0F0F0Fu; + // qh stores bit `is` per element across 4 consecutive bytes; one shift+mask handles all 4. + const uint qh4 = ((uint32_t(bl32.block.qh[qh_w]) >> is) & 0x01010101u) << 4u; + + const u8vec4 qi = unpack8(ql4 | qh4); + return f16vec4(vec4(qi) * vec4(d) - vec4(m)); +} + layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K { block_q6_K block; }; @@ -431,6 +717,35 @@ float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2 return ret; } +f16vec4 dequantFuncQ6_K_v(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl); + const uint idx = coordInBlock[1]; + + const uint b = (idx & 0x40) >> 6; + const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6 + const uint is = idx >> 4; + const uint sh = b * 4; // 0 or 4 + + const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]); + + const uint ql_i = ((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1); + const uint qh_i = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1); + + // Two adjacent uint16 packed16 reads, combined into a uint32 in registers. + // After this: byte j of qlw / qhw holds the data for element idx+j. + const uint qlw = uint32_t(bl16.block.ql[ql_i ]) | (uint32_t(bl16.block.ql[ql_i + 1]) << 16); + const uint qhw = uint32_t(bl16.block.qh[qh_i ]) | (uint32_t(bl16.block.qh[qh_i + 1]) << 16); + + // sh in {0,4} and qhshift in {0,2,4,6}: per-byte masks 0x0F / 0x03 keep only the + // wanted bits with no inter-byte leakage; place qh's 2 bits at nibble high position. + const uint ql4 = (qlw >> sh) & 0x0F0F0F0Fu; + const uint qh4 = ((qhw >> qhshift) & 0x03030303u) << 4u; + + const ivec4 qi = ivec4(unpack8(ql4 | qh4)); + return f16vec4((vec4(qi) - vec4(32.0f)) * vec4(float(dscale))); +} + #if defined(DATA_A_IQ1_S) layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ1_S { block_iq1_s block; @@ -453,6 +768,29 @@ float16_t dequantFuncIQ1_S(const in decodeBufIQ1_S bl, const in uint blockCoords float16_t ret = float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * int(idx % 8), 2)) + float16_t(delta)); return ret; } + +f16vec4 dequantFuncIQ1_S_v(const in decodeBufIQ1_S bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + + const uint ib32 = idx >> 5; + const uint ib8 = idx >> 3; + const int i8b = int(idx & 4); // 0 or 4 + + const uint qh = bl.block.qh[ib32]; + const uint qs = bl.block.qs[ib8]; + const float dl = float(d) * float(2 * bitfieldExtract(qh, 12, 3) + 1); + const float delta = ((qh & 0x8000u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA; + const uint grid = iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]; + + const ivec4 q = ivec4( + bitfieldExtract(int(grid), 2 * (i8b + 0), 2), + bitfieldExtract(int(grid), 2 * (i8b + 1), 2), + bitfieldExtract(int(grid), 2 * (i8b + 2), 2), + bitfieldExtract(int(grid), 2 * (i8b + 3), 2)); + return f16vec4((vec4(q) + vec4(delta)) * dl); +} #endif #if defined(DATA_A_IQ1_M) @@ -485,6 +823,33 @@ float16_t dequantFuncIQ1_M(const in decodeBufIQ1_M bl, const in uint blockCoords float16_t ret = d * float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * i8, 2)) + float16_t(delta)); return ret; } + +f16vec4 dequantFuncIQ1_M_v(const in decodeBufIQ1_M bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufIQ1_M_packed64 bl64 = decodeBufIQ1_M_packed64(bl); + const uint idx = coordInBlock[1]; + + uvec2 scales = unpack32(bl64.block.scales); + const float16_t d = uint16BitsToHalf(uint16_t(((scales.x & 0xF000) >> 12) | ((scales.x & 0xF0000000) >> 24) | ((scales.y & 0xF000) >> 4) | ((scales.y & 0xF0000000) >> 16))); + + const uint ib8 = idx >> 3; + const uint ib16 = idx >> 4; + const int i8b = int(idx & 4); // 0 or 4 -- i8 base for the V=4 group + + const uint sc = bl.block.scales[ib8 / 8]; + const uint qs = bl.block.qs[ib8]; + const uint qh = bl.block.qh[ib16] >> (4 * (ib8 & 1)); + const float dl = 2.0 * float(bitfieldExtract(sc, 3 * int(ib16 & 3), 3)) + 1.0; + const float delta = ((qh & 8u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA; + const uint grid = iq1s_grid[qs | ((qh & 7u) << 8)]; + + const ivec4 q = ivec4( + bitfieldExtract(int(grid), 2 * (i8b + 0), 2), + bitfieldExtract(int(grid), 2 * (i8b + 1), 2), + bitfieldExtract(int(grid), 2 * (i8b + 2), 2), + bitfieldExtract(int(grid), 2 * (i8b + 3), 2)); + return f16vec4((vec4(q) + vec4(delta)) * (float(d) * dl)); +} #endif #if defined(DATA_A_IQ2_XXS) @@ -520,6 +885,33 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf); return float16_t(ret[idx & 1]); } + +f16vec4 dequantFuncIQ2_XXS_v(const in decodeBufIQ2_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufIQ2_XXS_packed16 bl16 = decodeBufIQ2_XXS_packed16(bl); + const uint idx = coordInBlock[1]; + + const uint ib32 = idx >> 5; + const uint ib8 = (idx & 0x18) >> 3; + const uint iqs = 8 * ib32 + ib8; + + const uint qs = bl.block.qs[iqs]; + const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3])); + const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28)); + + uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7); + sign |= bitCount(sign) << 7; + const uint sb = sign >> (idx & 7u); + + const uint g2 = iq2xxs_grid[qs][(idx & 4) >> 2]; + const u8vec4 g = unpack8(g2); + + return f16vec4( + dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0), + dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0), + dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0), + dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0)); +} #endif #if defined(DATA_A_IQ2_XS) @@ -548,6 +940,31 @@ float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoor vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf); return float16_t(ret[idx & 1]); } + +f16vec4 dequantFuncIQ2_XS_v(const in decodeBufIQ2_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + + const uint is = idx >> 5; + const uint sshift = (idx & 0x10) >> 2; + const uint iqs = idx >> 3; + + const uint16_t qs = bl.block.qs[iqs]; + const float dscale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[is] >> sshift) & 0xF)); + + uint sign = uint(qs >> 9); + sign |= bitCount(sign) << 7; + const uint sb = sign >> (idx & 7u); + + const uint g2 = iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2]; + const u8vec4 g = unpack8(g2); + + return f16vec4( + dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0), + dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0), + dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0), + dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0)); +} #endif #if defined(DATA_A_IQ2_S) @@ -576,6 +993,32 @@ float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords const vec2 v = db * vec2(sign01) * vec2(unpack8(g2)); return float16_t(v[idx & 1]); } + +f16vec4 dequantFuncIQ2_S_v(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + + const uint ib32 = idx >> 5; + const uint ib8 = idx >> 3; + const uint qhshift = 2 * (ib8 % 4); + + const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 2)) & 0xf; + const uint qs = bl.block.qs[ib8]; + const uint qh = bl.block.qh[ib32]; + const uint sb = uint(bl.block.qs[QUANT_K / 8 + ib8]) >> (idx & 0x6u); + + const float d = float(bl.block.d); + const float db = d * 0.25 * (0.5 + scale); + + const uint g2 = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2]; + const u8vec4 g = unpack8(g2); + + return f16vec4( + db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0), + db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0), + db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0), + db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0)); +} #endif #if defined(DATA_A_IQ3_XXS) @@ -609,6 +1052,32 @@ float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCo const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); return float16_t(v[idx & 1]); } + +f16vec4 dequantFuncIQ3_XXS_v(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl); + const uint idx = coordInBlock[1]; + + const uint iqs = idx >> 2; + const uint is = QUANT_K / 4 + ((idx & 0xE0) >> 3); + + const float d = float(bl.block.d); + const uint qs = bl.block.qs[iqs]; + const uint signs = pack32(u16vec2(bl16.block.qs[is/2+0], bl16.block.qs[is/2+1])); + const float db = d * 0.5 * (0.5 + (signs >> 28)); + + const uint sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); + const uint sb = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6u); + + const uint grid = iq3xxs_grid[qs]; + const u8vec4 g = unpack8(grid); + + return f16vec4( + db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0), + db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0), + db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0), + db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0)); +} #endif #if defined(DATA_A_IQ3_S) @@ -635,6 +1104,30 @@ float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords return float16_t(v[idx & 1]); } + +f16vec4 dequantFuncIQ3_S_v(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + + const uint iqs = idx >> 2; + const uint iqh = idx >> 5; + + const float d = float(bl.block.d); + const uint qs = bl.block.qs[iqs]; + const uint qh = bl.block.qh[iqh]; + const uint sb = uint(bl.block.signs[iqs / 2]) >> (idx & 0x6u); + const uint scale = bl.block.scales[iqs / 16]; + const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); + + const uint grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)]; + const u8vec4 g = unpack8(grid); + + return f16vec4( + db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0), + db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0), + db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0), + db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0)); +} #endif #if defined(DATA_A_IQ4_XS) @@ -642,6 +1135,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4 block_iq4_xs block; }; +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufIQ4_XS_packed32 { + block_iq4_xs_packed32 block; +}; + float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { const float16_t d = bl.block.d; @@ -657,6 +1154,30 @@ float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoor float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]); return ret; } + +f16vec4 dequantFuncIQ4_XS_v(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufIQ4_XS_packed32 bl32 = decodeBufIQ4_XS_packed32(bl); + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + + const uint ib32 = idx >> 5; // 0..7 + const uint sl = (bl32.block.scales_l >> (4 * ib32)) & 0xF; + const uint sh = (uint(bl32.block.scales_h) >> (2 * ib32)) & 0x3; + const uint qshift = (idx & 0x10) >> 2; // {0, 4} + const uint qs_w = 4 * ib32 + ((idx & 0xC) >> 2); // iqs / 4, in [0,32) + + const float16_t dl = d * float16_t(int(sl | (sh << 4)) - 32); + + const uint qsw = bl32.block.qs[qs_w]; + const u8vec4 qv = unpack8((qsw >> qshift) & 0x0F0F0F0Fu); + const vec4 ret = vec4( + float(kvalues_iq4nl[qv.x]), + float(kvalues_iq4nl[qv.y]), + float(kvalues_iq4nl[qv.z]), + float(kvalues_iq4nl[qv.w])) * float(dl); + return f16vec4(ret); +} #endif #if defined(DATA_A_IQ4_NL) @@ -664,6 +1185,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4 block_iq4_nl block; }; +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL_packed16 { + block_iq4_nl_packed16 block; +}; + float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { const float16_t d = bl.block.d; @@ -676,6 +1201,24 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor float16_t ret = float16_t(kvalues_iq4nl[qs]) * d; return ret; } + +f16vec4 dequantFuncIQ4_NL_v(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufIQ4_NL_packed16 bl16 = decodeBufIQ4_NL_packed16(bl); + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; // 0 or 4 + const uint qs_i = (idx & 0xC) >> 1; // packed16 word index, in {0,2,4,6} + const uint qsw = uint32_t(bl16.block.qs[qs_i ]) + | (uint32_t(bl16.block.qs[qs_i + 1u]) << 16); + // shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte. + const u8vec4 q = unpack8((qsw >> shift) & 0x0F0F0F0Fu); + return f16vec4( + float(d) * float(kvalues_iq4nl[q.x]), + float(d) * float(kvalues_iq4nl[q.y]), + float(d) * float(kvalues_iq4nl[q.z]), + float(d) * float(kvalues_iq4nl[q.w])); +} #endif #if defined(DATA_A_MXFP4) @@ -695,6 +1238,26 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5); return ret; } + +f16vec4 dequantFuncMXFP4_v(const in decodeBufMXFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float d = e8m0_to_fp32(bl.block.e); + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + const uint shift = (idx & 0x10) >> 2; + uvec4 qv = uvec4( + uint(bl.block.qs[iqs]), + uint(bl.block.qs[iqs + 1u]), + uint(bl.block.qs[iqs + 2u]), + uint(bl.block.qs[iqs + 3u])); + qv = (qv >> shift) & 0xFu; + const vec4 ret = vec4( + float(kvalues_mxfp4[qv.x]), + float(kvalues_mxfp4[qv.y]), + float(kvalues_mxfp4[qv.z]), + float(kvalues_mxfp4[qv.w])) * d * 0.5f; + return f16vec4(ret); +} #endif #if defined(DATA_A_NVFP4) @@ -702,6 +1265,10 @@ layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVF block_nvfp4 block; }; +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVFP4_packed32 { + block_nvfp4_packed32 block; +}; + float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { const uint idx = coordInBlock[1]; @@ -713,56 +1280,97 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords qs = (qs >> shift) & 0xF; return float16_t(kvalues_mxfp4[qs] * d * 0.5); } + +f16vec4 dequantFuncNVFP4_v(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + decodeBufNVFP4_packed32 bl32 = decodeBufNVFP4_packed32(bl); + const uint idx = coordInBlock[1]; + const uint sub = idx >> 4; + const uint qs_w = ((idx & 0x30) >> 3) + ((idx & 0x4u) >> 2); // iqs / 4, in [0,8) + const uint shift = (idx & 0x8) >> 1; + const float d = ue4m3_to_fp32(bl.block.d[sub]); + + const uint qsw = uint32_t(bl32.block.qs[qs_w]); + const u8vec4 qv = unpack8((qsw >> shift) & 0x0F0F0F0Fu); + const vec4 ret = vec4( + float(kvalues_mxfp4[qv.x]), + float(kvalues_mxfp4[qv.y]), + float(kvalues_mxfp4[qv.z]), + float(kvalues_mxfp4[qv.w])) * d * 0.5f; + return f16vec4(ret); +} #endif #if defined(DATA_A_Q1_0) #define dequantFuncA dequantFuncQ1_0 +#define dequantFuncA_v dequantFuncQ1_0_v #elif defined(DATA_A_Q4_0) #define dequantFuncA dequantFuncQ4_0 +#define dequantFuncA_v dequantFuncQ4_0_v #elif defined(DATA_A_Q4_1) #define dequantFuncA dequantFuncQ4_1 +#define dequantFuncA_v dequantFuncQ4_1_v #elif defined(DATA_A_Q5_0) #define dequantFuncA dequantFuncQ5_0 +#define dequantFuncA_v dequantFuncQ5_0_v #elif defined(DATA_A_Q5_1) #define dequantFuncA dequantFuncQ5_1 +#define dequantFuncA_v dequantFuncQ5_1_v #elif defined(DATA_A_Q8_0) #define dequantFuncA dequantFuncQ8_0 +#define dequantFuncA_v dequantFuncQ8_0_v #elif defined(DATA_A_Q2_K) #define dequantFuncA dequantFuncQ2_K +#define dequantFuncA_v dequantFuncQ2_K_v #elif defined(DATA_A_Q3_K) #define dequantFuncA dequantFuncQ3_K +#define dequantFuncA_v dequantFuncQ3_K_v #elif defined(DATA_A_Q4_K) #define dequantFuncA dequantFuncQ4_K +#define dequantFuncA_v dequantFuncQ4_K_v #define fetch_scales fetch_scalesQ4_K #define store_scales store_scalesQ4_K #elif defined(DATA_A_Q5_K) #define dequantFuncA dequantFuncQ5_K +#define dequantFuncA_v dequantFuncQ5_K_v #define fetch_scales fetch_scalesQ5_K #define store_scales store_scalesQ4_K #elif defined(DATA_A_Q6_K) #define dequantFuncA dequantFuncQ6_K +#define dequantFuncA_v dequantFuncQ6_K_v #elif defined(DATA_A_IQ1_S) #define dequantFuncA dequantFuncIQ1_S +#define dequantFuncA_v dequantFuncIQ1_S_v #elif defined(DATA_A_IQ1_M) #define dequantFuncA dequantFuncIQ1_M +#define dequantFuncA_v dequantFuncIQ1_M_v #elif defined(DATA_A_IQ2_XXS) #define dequantFuncA dequantFuncIQ2_XXS +#define dequantFuncA_v dequantFuncIQ2_XXS_v #elif defined(DATA_A_IQ2_XS) #define dequantFuncA dequantFuncIQ2_XS +#define dequantFuncA_v dequantFuncIQ2_XS_v #elif defined(DATA_A_IQ2_S) #define dequantFuncA dequantFuncIQ2_S +#define dequantFuncA_v dequantFuncIQ2_S_v #elif defined(DATA_A_IQ3_XXS) #define dequantFuncA dequantFuncIQ3_XXS +#define dequantFuncA_v dequantFuncIQ3_XXS_v #elif defined(DATA_A_IQ3_S) #define dequantFuncA dequantFuncIQ3_S +#define dequantFuncA_v dequantFuncIQ3_S_v #elif defined(DATA_A_IQ4_XS) #define dequantFuncA dequantFuncIQ4_XS +#define dequantFuncA_v dequantFuncIQ4_XS_v #elif defined(DATA_A_IQ4_NL) #define dequantFuncA dequantFuncIQ4_NL +#define dequantFuncA_v dequantFuncIQ4_NL_v #elif defined(DATA_A_MXFP4) #define dequantFuncA dequantFuncMXFP4 +#define dequantFuncA_v dequantFuncMXFP4_v #elif defined(DATA_A_NVFP4) #define dequantFuncA dequantFuncNVFP4 +#define dequantFuncA_v dequantFuncNVFP4_v #elif defined(DATA_A_F32) #define dequantFuncA dequantFuncF32 #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp new file mode 100644 index 00000000000..65e9c678401 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp @@ -0,0 +1,7 @@ +#version 460 + +#extension GL_NV_cooperative_matrix_decode_vector : require + +void main() +{ +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl index 9a7957da97b..66dcf610219 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl @@ -97,8 +97,17 @@ layout (binding = 6) readonly buffer MO {uint32_t data_mask_opt[];}; #define FA_TYPE_Q5_0 6u #define FA_TYPE_Q5_1 7u #define FA_TYPE_Q8_0 8u +#define FA_TYPE_BF16 30u #define FA_TYPE_Q1_0 41u +#if defined(BFLOAT16) +#define O_TYPE float +#define O_TYPEV4 vec4 +#else +#define O_TYPE FLOAT_TYPE +#define O_TYPEV4 FLOAT_TYPEV4 +#endif + // Number of matrix elements per buffer block, derived from the K/V type spec // constant. F32 is treated as a vec4 "block" of 4 floats. F16 uses block size 1 // and bypasses the dequant path entirely. Quants follow their ggml block sizes. @@ -111,6 +120,7 @@ uint fa_block_elems(uint ty) { case FA_TYPE_Q5_0: return uint(QUANT_K_Q5_0); case FA_TYPE_Q5_1: return uint(QUANT_K_Q5_1); case FA_TYPE_Q8_0: return uint(QUANT_K_Q8_0); + case FA_TYPE_BF16: return 1u; case FA_TYPE_Q1_0: return uint(QUANT_K_Q1_0); // cm2-only, harmless elsewhere default: return 1u; } @@ -248,7 +258,7 @@ const float FATTN_KQ_MAX_OFFSET = 3.0f*0.6931f; // Store the output when doing grouped query attention. // Rows index by Q's dimension 2, and the first N rows are valid. -void gqaStore(const in uint32_t r, const in uint32_t c, const in FLOAT_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) +void gqaStore(const in uint32_t r, const in uint32_t c, const in O_TYPEV4 elems, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) { uint32_t offset = (iq2 + r) * HSV / 4 + c; data_ov4[o_offset + offset] = D_TYPEV4(elems); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index bffcc095be3..23ae3833e52 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -6,6 +6,10 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#if defined(BFLOAT16) +#extension GL_EXT_bfloat16 : enable +#endif + #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_arithmetic : enable #extension GL_KHR_shader_subgroup_vote : enable @@ -14,7 +18,9 @@ #include "types.glsl" #include "flash_attn_base.glsl" +#if !defined(BFLOAT16) #include "flash_attn_dequant.glsl" +#endif // These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd const uint32_t MatBr = 16; @@ -27,32 +33,32 @@ const uint32_t cols_per_thread = Bc / cols_per_iter; layout (binding = 0) readonly buffer Q {float data_q[];}; layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];}; -layout (binding = 1) readonly buffer K {float16_t data_k[];}; -layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];}; -layout (binding = 2) readonly buffer V {float16_t data_v[];}; -layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];}; +layout (binding = 1) readonly buffer K {FLOAT_TYPE data_k[];}; +layout (binding = 1) readonly buffer KV4 {FLOAT_TYPEV4 data_kv4[];}; +layout (binding = 2) readonly buffer V {FLOAT_TYPE data_v[];}; +layout (binding = 2) readonly buffer VV4 {FLOAT_TYPEV4 data_vv4[];}; layout (binding = 3) readonly buffer M {float16_t data_m[];}; shared float tmpsh[row_split]; -const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4 -shared f16vec4 Qf[Br * qstride]; +const uint32_t qstride = HSK_pad / 4 + 2; +shared FLOAT_TYPEV4 Qf[Br * qstride]; const uint psh_stride = Br / 4 + 2; -shared f16vec4 Psh[Bc * psh_stride]; +shared FLOAT_TYPEV4 Psh[Bc * psh_stride]; // Avoid padding for hsk==256 to make it fit in 48KB shmem. const uint32_t sfshstride = (HSK <= 128) ? (Br / 4 + 2) : Br / 4; shared ACC_TYPEV4 sfsh[Bc * sfshstride]; const uint32_t D_pad = HSK_pad > HSV_pad ? HSK_pad : HSV_pad; -const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2; // in units of f16vec4 +const uint32_t kvsh_stride = (SHMEM_STAGING != 0 ? D_pad : MatBr) / 4 + 2; const uint v_cols = MatBc / 4 * row_split; // total cols, 4 vec4s per MatBc * number of subgroups const uint vsh_stride = v_cols; -shared f16vec4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)]; +shared FLOAT_TYPEV4 kvsh[(kvsh_stride >= vsh_stride) ? (Bc * kvsh_stride) : (Bc * vsh_stride)]; const uint32_t osh_stride = row_split * MatBr / 4; -shared f16vec4 pvsh[MatBc * osh_stride]; +shared O_TYPEV4 pvsh[MatBc * osh_stride]; shared ACC_TYPE slope[Br]; @@ -76,7 +82,7 @@ void main() { if ((HSK % 16) != 0) { [[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) { if (i + tid < Br * qstride) { - Qf[i + tid] = f16vec4(0); + Qf[i + tid] = FLOAT_TYPEV4(0); } } barrier(); @@ -89,15 +95,15 @@ void main() { uint32_t r = (idx + tid) / (HSK / 4); if (r < Br && d < HSK / 4 && i * Br + r < N) { - Qf[r * qstride + d] = f16vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale); + Qf[r * qstride + d] = FLOAT_TYPEV4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale); } } barrier(); - f16vec4 Of[rows_per_thread][d_per_thread]; + O_TYPEV4 Of[rows_per_thread][d_per_thread]; [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { [[unroll]] for (uint32_t d = 0; d < d_per_thread; ++d) { - Of[r][d] = f16vec4(0.0); + Of[r][d] = O_TYPEV4(0.0); } } @@ -222,15 +228,18 @@ void main() { uint32_t d = (idx + tid) % (HSK_pad / 4); uint32_t c = (idx + tid) / (HSK_pad / 4); if (idx + gl_WorkGroupSize.x <= Bc * HSK_pad / 4 || c < Bc) { - f16vec4 K_Tf = f16vec4(0); + FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0); if ((!KV_bounds_check || j * Bc + c < KV) && (HSK == HSK_pad || d < HSK / 4)) { +#if !defined(BFLOAT16) if (USE_DECODE_K) { uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE_K + 4 * d; uint ib = coord / BLOCK_SIZE_K; uint iqs = (coord % BLOCK_SIZE_K); K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K); - } else { - K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]); + } else +#endif + { + K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]); } } @@ -244,16 +253,16 @@ void main() { // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16 // This is written transposed in order to allow for N being 8 if implementations need it coopmat SfMat = coopmat(0); - coopmat KMat; - coopmat QMat; + coopmat KMat; + coopmat QMat; [[unroll]] for (uint32_t d = 0; d < HSK_pad / 16; ++d) { // If SHMEM_STAGING is set, a Bc * HSK_pad size tile of K is loaded to shmem - // If not, f16 K is loaded directly from global memory if aligned, otherwise + // If not, K is loaded directly from global memory if aligned, otherwise // staged through a Bc * MatBr size staging buffer. - // If K is not type f16, then it is always staged for dequantization. + // If K is a quant type, then it is always staged for dequantization. if (SHMEM_STAGING == 0) { - // For quants we always need to dequant into kvsh; for f16 we can load + // For quants we always need to dequant into kvsh; for f16/bf16 we can load // directly from global memory when alignment / bounds allow it. const bool stage_k = USE_DECODE_K || KV_bounds_check || d * 16 + 16 > HSK; if (stage_k) { @@ -262,15 +271,18 @@ void main() { uint32_t col_vec = (idx + tid) % (MatBr / 4); uint32_t row = (idx + tid) / (MatBr / 4); if (idx + tid < Bc * MatBr / 4) { - f16vec4 K_Tf = f16vec4(0); + FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0); if ((!KV_bounds_check || j * Bc + row < KV) && (HSK == HSK_pad || d * 16 + col_vec * 4 < HSK)) { +#if !defined(BFLOAT16) if (USE_DECODE_K) { uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE_K + d * 16 + col_vec * 4; uint ib = coord / BLOCK_SIZE_K; uint iqs = (coord % BLOCK_SIZE_K); K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K); - } else { - K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]); + } else +#endif + { + K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]); } } @@ -357,7 +369,7 @@ void main() { [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) { const uint d_local = d0 / threads_per_rowgroup; [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - Of[r][d_local] = float16_t(eMf[r]) * Of[r][d_local]; + Of[r][d_local] = O_TYPE(eMf[r]) * Of[r][d_local]; } } @@ -368,10 +380,10 @@ void main() { [[unroll]] for (uint32_t r = 0; r < rows_per_thread; r += 4) { const uint row = tile_row(r); if (KV_bounds_check && j * Bc + col >= KV) { - Psh[col * psh_stride + row / 4] = f16vec4(0.0f); + Psh[col * psh_stride + row / 4] = FLOAT_TYPEV4(0.0f); } else { const vec4 mfvec = vec4(Mf[r], Mf[r + 1], Mf[r + 2], Mf[r + 3]); - const f16vec4 Pf = f16vec4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec)); + const FLOAT_TYPEV4 Pf = FLOAT_TYPEV4(exp(vec4(sfsh[row / 4 + col * sfshstride]) - mfvec)); [[unroll]] for (uint32_t vec_idx = 0; vec_idx < 4; ++vec_idx) { Lf[r + vec_idx] += Pf[vec_idx]; } @@ -385,15 +397,18 @@ void main() { uint32_t d = (idx + tid) % (HSV_pad / 4); uint32_t c = (idx + tid) / (HSV_pad / 4); if (idx + gl_WorkGroupSize.x <= Bc * HSV_pad / 4 || c < Bc) { - f16vec4 V_Tf = f16vec4(0); + FLOAT_TYPEV4 V_Tf = FLOAT_TYPEV4(0); if ((!KV_bounds_check || j * Bc + c < KV) && (HSV == HSV_pad || d < HSV / 4)) { +#if !defined(BFLOAT16) if (USE_DECODE_V) { uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE_V + 4 * d; uint ib = coord / BLOCK_SIZE_V; uint iqs = (coord % BLOCK_SIZE_V); V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V); - } else { - V_Tf = f16vec4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]); + } else +#endif + { + V_Tf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]); } } @@ -409,7 +424,7 @@ void main() { [[unroll]] for (uint32_t hsv_tile = 0; hsv_tile < num_hsv_tiles; ++hsv_tile) { const uint hsv_offset = (hsv_tile * row_split + gl_SubgroupID) * 16; - coopmat PVMat = coopmat(0); + coopmat PVMat = coopmat(0); // Preload V tiles for [Bc, 16 * num subgroups] const uint v_rows = Bc; @@ -417,11 +432,11 @@ void main() { const uint v_loads_per_thread = v_total / gl_WorkGroupSize.x; // If SHMEM_STAGING is set, a Bc * HSV_pad size tile of V is loaded to shmem. - // If not, f16 V is loaded directly from global memory if aligned, otherwise + // If not, V is loaded directly from global memory if aligned, otherwise // staged through a Bc * MatBr size staging buffer. - // If V is not type f16, then it is always staged for dequantization. + // If V is a quant type, then it is always staged for dequantization. if (SHMEM_STAGING == 0) { - // For quants we always preload via kvsh. For f16 we only preload when + // For quants we always preload via kvsh. For f16/bf16 we only preload when // alignment / bounds force it (otherwise we coopMatLoad direct from data_vv4). const bool stage_v = USE_DECODE_V || KV_bounds_check; if (stage_v) { @@ -438,13 +453,16 @@ void main() { const uint iqs = coord % BLOCK_SIZE_V; if (!KV_bounds_check || (v_row < KV && v_col < HSV)) { +#if !defined(BFLOAT16) if (USE_DECODE_V) { kvsh[row * vsh_stride + col] = dequantize4(ib, iqs, v_offset, BINDING_IDX_V); - } else { + } else +#endif + { kvsh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4]; } } else { - kvsh[row * vsh_stride + col] = f16vec4(0.0f); + kvsh[row * vsh_stride + col] = FLOAT_TYPEV4(0.0f); } } } @@ -459,7 +477,7 @@ void main() { if (SHMEM_STAGING == 0) { if (!USE_DECODE_V && !KV_bounds_check) { - // F16 values can be loaded directly from global memory + // F16/BF16 values can be loaded directly from global memory const uint v_tile_row = j * Bc + bc_chunk * MatBc; const uint v_tile_offset = v_offset / 4 + v_tile_row * v_stride / 4 + hsv_offset / 4; coopMatLoad(QMat, data_vv4, v_tile_offset, v_stride / 4, gl_CooperativeMatrixLayoutRowMajor); @@ -573,7 +591,7 @@ void main() { [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) { const uint d_local = d0 / threads_per_rowgroup; - Of[r][d_local] *= float16_t(ms); + Of[r][d_local] *= O_TYPE(ms); } } else { vs = exp(sink - Mf[r]); @@ -591,7 +609,7 @@ void main() { [[unroll]] for (uint32_t d0 = 0; d0 < HSV / 4; d0 += threads_per_rowgroup) { const uint d_local = d0 / threads_per_rowgroup; [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) { - Of[r][d_local] *= float16_t(Lfrcp[r]); + Of[r][d_local] *= O_TYPE(Lfrcp[r]); #if defined(FLOAT_TYPE_MAX) Of[r][d_local] = clamp(Of[r][d_local], -FLOAT_TYPE_MAX, FLOAT_TYPE_MAX); #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index 141bb870883..b9c03fe499d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -8,9 +8,16 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#if defined(BFLOAT16) +#extension GL_EXT_bfloat16 : enable +#endif + #extension GL_KHR_memory_scope_semantics : enable #extension GL_KHR_cooperative_matrix : enable #extension GL_NV_cooperative_matrix2 : enable +#ifdef GL_NV_cooperative_matrix_decode_vector +#extension GL_NV_cooperative_matrix_decode_vector : enable +#endif #extension GL_EXT_buffer_reference : enable #extension GL_KHR_shader_subgroup_ballot : enable #extension GL_KHR_shader_subgroup_vote : enable @@ -18,7 +25,9 @@ #include "types.glsl" #include "flash_attn_base.glsl" +#if !defined(BFLOAT16) #include "dequant_funcs_cm2.glsl" +#endif // buffer_reference stride = sizeof(struct) = FaBlockBytesK/V. layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_K { @@ -28,6 +37,7 @@ layout(buffer_reference, std430, buffer_reference_align = 1) buffer decodeBufFA_ uint8_t raw[FaBlockBytesV]; }; +#if !defined(BFLOAT16) float16_t faDecodeK(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) { switch (FaTypeK) { case FA_TYPE_F32: return dequantFuncF32 (decodeBufF32 (bl_in), blockCoords, coordInBlock); @@ -54,6 +64,42 @@ float16_t faDecodeV(const decodeBufFA_V bl_in, const uint blockCoords[2], const } } +// V=4 vector decode for K/V; dispatches to per-format _v decoders. +f16vec4 faDecodeKVector(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) { + switch (FaTypeK) { + case 0u: return f16vec4(decodeBufF32(bl_in).block); + case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock); + case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock); + case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock); + case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock); + case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock); + case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock); + default: return f16vec4(0); + } +} + +f16vec4 faDecodeVVector(const decodeBufFA_V bl_in, const uint blockCoords[2], const uint coordInBlock[2]) { + switch (FaTypeV) { + case 0u: return f16vec4(decodeBufF32(bl_in).block); + case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock); + case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock); + case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock); + case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock); + case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock); + case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock); + default: return f16vec4(0); + } +} + +#ifdef GL_NV_cooperative_matrix_decode_vector +#define FADECODEK , faDecodeK, faDecodeKVector +#define FADECODEV , faDecodeV, faDecodeVVector +#else +#define FADECODEK , faDecodeK +#define FADECODEV , faDecodeV +#endif +#endif + layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; layout (binding = 1) readonly buffer K {uint8_t data_k[];}; layout (binding = 2) readonly buffer V {uint8_t data_v[];}; @@ -157,15 +203,15 @@ void main() { tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1); coopmat Q; - coopmat Qf16; + coopmat Qf16; uint32_t q_offset = gqa_iq1*p.nb01*4/*sizeof(float)*/ + iq2*p.nb02+iq3*p.nb03; coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad)); - Qf16 = coopmat(Q); - Qf16 *= float16_t(p.scale); + Q *= Q_TYPE(p.scale); + Qf16 = coopmat(Q); - coopmat O = coopmat(0); + coopmat O = coopmat(0); coopmat L, M; @@ -253,16 +299,20 @@ void main() { coopmat S = coopmat(0); - coopmat K_T; + coopmat K_T; uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13; // F16: bs_k==1 (direct load). F32: bs_k==4 (vec4 / dequantFuncF32). Q4/Q8 family: bs_k==32. Q1_0: bs_k==128. +#if defined(BFLOAT16) + coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose); +#else const bool k_use_decode = (bs_k > 1u); if (k_use_decode) { - coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose, faDecodeK); + coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose FADECODEK); } else { coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose); } +#endif S = coopMatMulAdd(Qf16, K_T, S); if (LOGIT_SOFTCAP) { @@ -313,22 +363,26 @@ void main() { coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C); } - coopmat P_A = coopmat(P); + coopmat P_A = coopmat(P); // compute rowsum by multiplying by matrix of all ones. - coopmat One = coopmat(1.0); + coopmat One = coopmat(1.0); rowsum = coopmat(0.0); rowsum = coopMatMulAdd(P_A, One, rowsum); - coopmat V; + coopmat V; uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23; +#if defined(BFLOAT16) + coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad)); +#else const bool v_use_decode = (bs_v > 1u); if (v_use_decode) { - coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad), faDecodeV); + coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) FADECODEV); } else { coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad)); } +#endif L = eM*L + rowsum; @@ -340,7 +394,7 @@ void main() { // resize eM by using smear/reduce coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce); - O *= coopmat(eMdiag); + O *= coopmat(eMdiag); O = coopMatMulAdd(P_A, V, O); } @@ -389,7 +443,7 @@ void main() { if (sink > Mr[i]) { ms = exp(Mr[i] - sink); - O[i] *= float16_t(ms); + O[i] *= O_TYPE(ms); } else { vs = exp(sink - Mr[i]); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl index 02106f33cbe..8704479d960 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_dequant.glsl @@ -28,6 +28,9 @@ layout (binding = 2) readonly buffer V_PACKED_Q5_1 { block_q5_1_packed16 data[]; layout (binding = 1) readonly buffer K_PACKED_Q8_0 { block_q8_0_packed16 data[]; } k_packed_q8_0; layout (binding = 2) readonly buffer V_PACKED_Q8_0 { block_q8_0_packed16 data[]; } v_packed_q8_0; +layout (binding = 1) readonly buffer K_PACKED_BF16 { u16vec4 data[]; } k_packed_bf16; +layout (binding = 2) readonly buffer V_PACKED_BF16 { u16vec4 data[]; } v_packed_bf16; + // Q4_1 and Q5_1 packed32 views: aliased to the same memory as the packed16 // views, used by the MMQ K-side hot path for fast 4-uint loads. layout (binding = 1) readonly buffer K_PACKED_Q4_1_P32 { block_q4_1_packed32 data[]; } k_packed_q4_1_p32; @@ -99,6 +102,9 @@ layout (binding = 1) readonly buffer K_PACKED_Q5_1_P32 { block_q5_1_packed32 dat return FLOAT_TYPE(BUF.data[a_offset + ib].d) * FLOAT_TYPEV4(v0.x, v0.y, v1.x, v1.y); \ } +#define FA_DEQUANT4_BF16(BUF) \ + return FLOAT_TYPEV4(bf16_to_fp32(uvec4(BUF.data[(a_offset + ib) / 4]))); + FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { if (binding_idx == BINDING_IDX_K) { switch (FaTypeK) { @@ -108,6 +114,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(k_packed_q5_0) case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(k_packed_q5_1) case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(k_packed_q8_0) + case FA_TYPE_BF16: FA_DEQUANT4_BF16(k_packed_bf16) } } else { switch (FaTypeV) { @@ -117,6 +124,7 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) { case FA_TYPE_Q5_0: FA_DEQUANT4_Q5_0(v_packed_q5_0) case FA_TYPE_Q5_1: FA_DEQUANT4_Q5_1(v_packed_q5_1) case FA_TYPE_Q8_0: FA_DEQUANT4_Q8_0(v_packed_q8_0) + case FA_TYPE_BF16: FA_DEQUANT4_BF16(v_packed_bf16) } } return FLOAT_TYPEV4(0); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp new file mode 100644 index 00000000000..72059d4afc2 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp @@ -0,0 +1,69 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : require +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_shuffle : enable + +layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; + +layout(constant_id = 0) const uint WARP_SIZE = 32; +layout(constant_id = 1) const uint N = 128; + +layout(push_constant) uniform parameter +{ + uint n_rows; + uint src_offset; + uint dst_offset; + float scale; +}; + +layout(binding = 0, std430) readonly buffer A { float data_a[]; }; +layout(binding = 1, std430) writeonly buffer D { float data_d[]; }; + +const uint EL_W = N / WARP_SIZE; + +void main() { + const uint lane = gl_SubgroupInvocationID; + for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID; + row < n_rows; + row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) { + const uint row_offset = row * N; + + float reg[EL_W]; + + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale; + } + + [[unroll]] + for (uint h = 1; h < WARP_SIZE; h <<= 1) { + [[unroll]] + for (uint j = 0; j < EL_W; ++j) { + const float val = reg[j]; + const float val2 = subgroupShuffleXor(val, h); + reg[j] = (lane & h) == 0 ? val + val2 : val2 - val; + } + } + + [[unroll]] + for (uint h = WARP_SIZE; h < N; h <<= 1) { + const uint step = h / WARP_SIZE; + [[unroll]] + for (uint j = 0; j < EL_W; j += 2 * step) { + [[unroll]] + for (uint k = 0; k < step; ++k) { + const float x = reg[j + k]; + const float y = reg[j + k + step]; + reg[j + k] = x + y; + reg[j + k + step] = x - y; + } + } + } + + [[unroll]] + for (uint i = 0; i < EL_W; ++i) { + data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i]; + } + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp index 5e9f8308c1d..33c3202dbb7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp @@ -31,6 +31,7 @@ layout(push_constant) uniform Parameters { uint sb1, sb2, sb3; uint neq1, rq3; float scale; + uint K; }; layout(binding = 0) readonly buffer QBuf { FLOAT_TYPE data_q[]; }; @@ -101,13 +102,21 @@ void main() { const uint iq3 = seq_id / rq3; const uint state_size = S_V * S_V; - const uint state_base = (seq_id * H + head_id) * state_size; + // input state layout (D, K, n_seqs): per-seq stride is K*H*D; we read slot 0. + const uint state_in_base = (seq_id * K * H + head_id) * state_size; + // output state layout per slot: same per-(seq,head) offset as the single-slot case. + const uint state_out_base = (seq_id * H + head_id) * state_size; + const uint state_size_per_snap = state_size * H * n_seqs; FLOAT_TYPE s_shard[ROWS_PER_LANE]; [[unroll]] for (uint r = 0; r < ROWS_PER_LANE; r++) { - s_shard[r] = FLOAT_TYPE(data_state[state_base + col * S_V + r * LANES_PER_COLUMN + lane]); + s_shard[r] = FLOAT_TYPE(data_state[state_in_base + col * S_V + r * LANES_PER_COLUMN + lane]); } + // snapshot slot mapping: target_slot = t - shift. When n_tokens < K, only the last + // n_tokens slots are written; earlier slots are left untouched (caller-owned). + const int shift = int(n_tokens) - int(K); + uint attn_off = (seq_id * n_tokens * H + head_id) * S_V; for (uint t = 0; t < n_tokens; t++) { @@ -161,9 +170,21 @@ void main() { } attn_off += S_V * H; + + if (K > 1u) { + const int target_slot = int(t) - shift; + if (target_slot >= 0 && target_slot < int(K)) { + const uint slot_base = s_off + uint(target_slot) * state_size_per_snap + state_out_base; + [[unroll]] for (uint r = 0; r < ROWS_PER_LANE; r++) { + data_dst[slot_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[r]; + } + } + } } - [[unroll]] for (uint r = 0; r < ROWS_PER_LANE; r++) { - data_dst[s_off + state_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[r]; + if (K == 1u) { + [[unroll]] for (uint r = 0; r < ROWS_PER_LANE; r++) { + data_dst[s_off + state_out_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[r]; + } } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index ba4c2103f0c..f4130d223b1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -44,36 +44,81 @@ void im2col(const uint ow, const uint z_idx) { const uint KHKW = p.KH * p.KW; + // Precompute base input coordinates + const int base_iw = int(ow * p.s0) - p.p0; + const int base_ih = int(oh * p.s1) - p.p1; + + // Precompute step deltas + const uint delta_ic = BLOCK_SIZE / KHKW; + const uint delta_rem = BLOCK_SIZE % KHKW; + + const uint delta_ky = delta_rem / p.KW; + const uint delta_kx = delta_rem % p.KW; + + const uint delta_ic_offset = delta_ic * p.offset_delta; + + // If using BDA mode, precompute the base pointer and step size +#if BDA + const BDA_STORAGE_T base_dst_addr = p.dst_addr + D_SIZE * dst_row; + const uint bda_step = D_SIZE * BLOCK_SIZE; +#endif + uint wg_x = gl_WorkGroupID.x; do { const uint wg_offset = wg_x * 512; - [[unroll]] for (uint i = 0; i < NUM_ITER; ++i) { - const uint chw_idx = wg_offset + gidx + i * BLOCK_SIZE; + uint chw_idx = wg_offset + gidx; + + uint ic = chw_idx / KHKW; + uint rem = chw_idx % KHKW; + + uint ky = rem / p.KW; + uint kx = rem % p.KW; + uint ic_offset = src_batch + ic * p.offset_delta; + + // Initialize running pointer/index for the destination buffer +#if BDA + BDA_STORAGE_T current_dst_addr = base_dst_addr + D_SIZE * chw_idx; +#else + uint current_dst_idx = dst_row + chw_idx; +#endif + + [[unroll]] for (uint i = 0; i < NUM_ITER; ++i) { if (chw_idx >= p.CHW) { return; } - const uint ic = chw_idx / KHKW; - const uint rem = chw_idx - ic * KHKW; - const uint ky = rem / p.KW; - const uint kx = rem - ky * p.KW; - - const uint iiw = ow * p.s0 + kx * p.d0 - p.p0; - const uint iih = oh * p.s1 + ky * p.d1 - p.p1; + const int iiw = base_iw + int(kx * p.d0); + const int iih = base_ih + int(ky * p.d1); A_TYPE val = A_TYPE(0); - if (iih < p.IH && iiw < p.IW) { - val = data_a[src_batch + ic * p.offset_delta + iih * p.IW + iiw]; + if (uint(iih) < p.IH && uint(iiw) < p.IW) { + val = data_a[ic_offset + uint(iih) * p.IW + uint(iiw)]; } #if BDA - D_ptr out_ptr = D_ptr(p.dst_addr + D_SIZE * (dst_row + chw_idx)); - out_ptr.d = D_TYPE(val); + D_ptr(current_dst_addr).d = D_TYPE(val); + current_dst_addr += bda_step; #else - data_d[dst_row + chw_idx] = D_TYPE(val); + data_d[current_dst_idx] = D_TYPE(val); + current_dst_idx += BLOCK_SIZE; #endif + + chw_idx += BLOCK_SIZE; + ic_offset += delta_ic_offset; + kx += delta_kx; + ky += delta_ky; + + // Handle X axis wrap + uint kx_wrap = uint(kx >= p.KW); + kx -= kx_wrap * p.KW; + ky += kx_wrap; + + // Handle Y axis wrap + uint ky_wrap = uint(ky >= p.KH); + ky -= ky_wrap * p.KH; + ic_offset += ky_wrap * p.offset_delta; } wg_x += gl_NumWorkGroups.x; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 2271be4021b..5a9d0e778fd 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -10,12 +10,38 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; #if !defined(DATA_A_F32) && !defined(DATA_A_F16) && !defined(DATA_A_BF16) #define K_PER_ITER 8 #else -#define K_PER_ITER 2 +#define K_PER_ITER 4 #endif uint a_offset, b_offset, d_offset, y_offset; +vec4 load_b(const uint j, const uint iybs, const uint iqs, const bool lastiter, out bool OOB_y, out bool OOB_z, out bool OOB_w) { + // Check if the latter elements are OOB, and don't fetch B or accumulate it. + OOB_y = lastiter && (iybs + iqs + y_offset >= p.ncols); + OOB_z = lastiter && (iybs + iqs + y_offset*2 >= p.ncols); + OOB_w = lastiter && (iybs + iqs + y_offset*3 >= p.ncols); + + if (!OOB_w) { + return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]), + FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]), + FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]), + FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*3])); + } else if (!OOB_z) { + return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]), + FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]), + FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]), + 0); + } else if (!OOB_y) { + return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]), + FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]), + 0, 0); + } else { + return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]), + 0, 0, 0); + } +} + void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter) { [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { @@ -25,6 +51,8 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const #if K_PER_ITER == 8 #if QUANT_R == 2 + // Note that we end up fetching bogus elements here, but its fine as they'll be + // within an accessible block. const vec4 bv02 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]); const vec4 bv13 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4]); const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y); @@ -34,18 +62,11 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]); #endif #else - // Check if the second of the pair of elements is OOB, and don't fetch B or - // accumulate it. We still fetch a pair of elements for A, which is fine for - // quantized formats since they'll be within the same block. We should - // probably skip fetching the second element for F16/F32, but as of now we - // still do. - const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols); - - FLOAT_TYPE b0 = 0, b1 = 0; - b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]); - if (!OOB) { - b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]); - } + bool OOB_y; + bool OOB_z; + bool OOB_w; + + const vec4 b = load_b(j, iybs, iqs, lastiter, OOB_y, OOB_z, OOB_w); #endif uint ibi = first_row*p.ncols; [[unroll]] for (uint n = 0; n < num_rows; ++n) { @@ -71,22 +92,60 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const temp[j][n] += rowtmp; #else - const vec2 v = dequantize(ib, iqs, a_offset); - - // matrix multiplication - temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]); - if (!OOB) { - temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]); + if (!OOB_w) { + const vec4 v = dequantize4(ib, iqs, a_offset); + temp[j][n] += dot(v, b); + } else if (!OOB_z) { + const vec2 v0 = dequantize(ib, iqs, a_offset); + const FLOAT_TYPE v1 = dequantize1(ib + 2/QUANT_R, iqs, a_offset); + const vec3 v = vec3(v0.x, v0.y, v1); + const vec3 b0 = vec3(b.x, b.y, b.z); + temp[j][n] += dot(v, b0); + } else if (!OOB_y) { + const vec2 v0 = dequantize(ib, iqs, a_offset); + const vec2 b0 = vec2(b.x, b.y); + temp[j][n] += dot(v0, b0); + } else { + const FLOAT_TYPE v = dequantize1(ib, iqs, a_offset); + temp[j][n] = fma(v, b.x, temp[j][n]); } #endif } } } +#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16) +void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i) +{ + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + const uint col = i*BLOCK_SIZE + K_PER_ITER*tid; + const uint iqs = 0; // quant index + const uint iybs = col; // y block start index + + const vec4 b = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]; + + uint ibi = first_row*p.ncols; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib = (ibi + col)/QUANT_K; // block index + ibi += p.ncols; + + const vec4 v = dequantize4_2aligned(ib, iqs, a_offset); + + // matrix multiplication + temp[j][n] += dot(v, b); + } + } +} +#endif + void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { const uint tid = gl_LocalInvocationID.x; get_offsets(a_offset, b_offset, d_offset); + const bool is_aligned_nonquant = + p.batch_stride_b % 4 == 0 && b_offset % 4 == 0 && + p.ncols % 4 == 0 && BLOCK_SIZE % 4 == 0 && + K_PER_ITER == 4; y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; @@ -105,17 +164,26 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { int unroll_count = 4; uint unrolled_iters = num_iters & ~(unroll_count - 1); -#if K_PER_ITER == 2 + uint i = 0; + +#if K_PER_ITER == 4 // If the K dimension is odd, we need lastiter==true on the last iteration // so OOB is computed correctly. Skip some unrolling to make that happen. - if ((p.ncols & 1) != 0 && + if ((p.ncols & 3) != 0 && unrolled_iters == num_iters && unrolled_iters > 0) { unrolled_iters -= unroll_count; } + if (is_aligned_nonquant) { + while (i < unrolled_iters) { + // Manually partially unroll the loop + [[unroll]] for (uint k = 0; k < unroll_count; ++k) { + iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER); + i++; + } + } + } else { #endif - - uint i = 0; while (i < unrolled_iters) { // Manually partially unroll the loop [[unroll]] for (uint k = 0; k < unroll_count; ++k) { @@ -123,18 +191,30 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { i++; } } +#if K_PER_ITER == 4 + } +#endif unroll_count = 2; unrolled_iters = num_iters & ~(unroll_count - 1); -#if K_PER_ITER == 2 - if ((p.ncols & 1) != 0 && +#if K_PER_ITER == 4 + if ((p.ncols & 3) != 0 && unrolled_iters == num_iters && unrolled_iters > 0) { unrolled_iters -= unroll_count; } -#endif + if (is_aligned_nonquant) { + while (i < unrolled_iters && is_aligned_nonquant) { + // Manually partially unroll the loop + [[unroll]] for (uint k = 0; k < unroll_count; ++k) { + iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER); + i++; + } + } + } else { +#endif while (i < unrolled_iters) { // Manually partially unroll the loop [[unroll]] for (uint k = 0; k < unroll_count; ++k) { @@ -142,10 +222,25 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { i++; } } +#if K_PER_ITER == 4 + } +#endif + +#if K_PER_ITER == 4 + if (is_aligned_nonquant) { + while (i < num_iters) { + iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER); + i++; + } + } else { +#endif while (i < num_iters) { iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true); i++; } +#if K_PER_ITER == 4 + } +#endif reduce_result(temp, d_offset, first_row, num_rows, tid); } @@ -164,6 +259,6 @@ void main() { if (first_row >= p.stride_d) { return; } - compute_outputs(first_row, p.stride_d - first_row); + compute_outputs(first_row, min(NUM_ROWS, p.stride_d - first_row)); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl index bc580aeeb83..73cf9c79955 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl @@ -212,28 +212,40 @@ i32vec4 repack4(uint ib, uint iqs) { const uint qs_shift = ((iqs_k % 32) / 8) * 2; const uint hm_shift = iqs_k / 8; + const uvec4 qs = uvec4( uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 ]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 5]) << 16), + uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 7]) << 16)); + + const uvec4 hmask = uvec4( uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 ]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 5]) << 16), + uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 7]) << 16)); + // bitwise OR to add 4 if hmask is set, subtract later - const i8vec2 vals00 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 ] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 ] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals01 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 1] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 1] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals10 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 2] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 2] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals11 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 3] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 3] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals20 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 4] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 4] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals21 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 5] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 5] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals30 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 6] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 6] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals31 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 7] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 7] >> hm_shift) & uint16_t(0x0101)) << 2)); - - return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y) - int8_t(4)), - pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y) - int8_t(4)), - pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y) - int8_t(4)), - pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y) - int8_t(4))); + const uint vals0 = (( qs.x >> qs_shift) & 0x03030303) | + (((hmask.x >> hm_shift) & 0x01010101) << 2); + const uint vals1 = (( qs.y >> qs_shift) & 0x03030303) | + (((hmask.y >> hm_shift) & 0x01010101) << 2); + const uint vals2 = (( qs.z >> qs_shift) & 0x03030303) | + (((hmask.z >> hm_shift) & 0x01010101) << 2); + const uint vals3 = (( qs.w >> qs_shift) & 0x03030303) | + (((hmask.w >> hm_shift) & 0x01010101) << 2); + + // Subtract 4 by twiddling bits rather than using re-packing as mesa + // compiles repacking poorly. + return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals1 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals2 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals3 ^ 0x80808080) - 0x04040404) ^ 0x80808080)); } float get_d_scale(uint ib, uint iqs) { @@ -343,27 +355,39 @@ i32vec4 repack4(uint ib, uint iqs) { const uint qh_idx = (iqs_k / 32) * 8 + iqs; const uint qh_shift = ((iqs_k % 32) / 8) * 2; - const i8vec2 vals00 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 ] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 ] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals01 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 1] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 1] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals10 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 2] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 2] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals11 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 3] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 3] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals20 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 4] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 4] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals21 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 5] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 5] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals30 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 6] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 6] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals31 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 7] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 7] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - - return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y)), - pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y)), - pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y)), - pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y))); + const uvec4 ql = uvec4( uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 ]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 5]) << 16), + uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 7]) << 16)); + + const uvec4 qh = uvec4( uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 ]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 5]) << 16), + uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 7]) << 16)); + + const uint vals0 = (( ql.x >> ql_shift) & 0x0F0F0F0F) | + (((qh.x >> qh_shift) & 0x03030303) << 4); + const uint vals1 = (( ql.y >> ql_shift) & 0x0F0F0F0F) | + (((qh.y >> qh_shift) & 0x03030303) << 4); + const uint vals2 = (( ql.z >> ql_shift) & 0x0F0F0F0F) | + (((qh.z >> qh_shift) & 0x03030303) << 4); + const uint vals3 = (( ql.w >> ql_shift) & 0x0F0F0F0F) | + (((qh.w >> qh_shift) & 0x03030303) << 4); + + // Subtract 32 by twiddling bits rather than using re-packing as mesa + // compiles repacking poorly. + return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals1 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals2 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals3 ^ 0x80808080) - 0x20202020) ^ 0x80808080)); } float get_d_scale(uint ib, uint iqs) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp index 497a18ff8a7..250d708479b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -71,10 +71,12 @@ layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; #if QUANT_K > 1 -#define DECODEFUNCA , dequantFuncA - #include "dequant_funcs_cm2.glsl" - +#if defined(dequantFuncA_v) && defined(GL_NV_cooperative_matrix_decode_vector) +#define DECODEFUNCA , dequantFuncA, dequantFuncA_v +#else +#define DECODEFUNCA , dequantFuncA +#endif #else #define DECODEFUNCA #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl index 2e53459909d..03358793140 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl @@ -9,7 +9,7 @@ uint rope_a_coord(const uint i0, const uint i01, const uint i02, const uint i03, // Per-row offset in shared memory const uint ix = i0; #else - const uint ix = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0; + const uint ix = p.a_offset + i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0; #endif return ix; } @@ -48,6 +48,7 @@ void rope_norm(const uint i0, const uint i1, const uint i2, const uint i3, rope_ idst = i1*p.nb11 + i0; idst += rope_data_i[i2].x * p.set_rows_stride; } + idst += p.d_offset; if (i0 >= p.n_dims) { rope_data_d[idst + 0] = ROPE_D_TYPE(rope_data_a[ix + 0]); @@ -84,6 +85,7 @@ void rope_neox(const uint i0, const uint i1, const uint i2, const uint i3, rope_ idst = i1*p.nb11 + i0/2; idst += rope_data_i[i2].x * p.set_rows_stride; } + idst += p.d_offset; if (i0 >= p.n_dims) { rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]); @@ -121,6 +123,7 @@ void rope_multi(const uint i0, const uint i1, const uint i2, const uint i3, rope idst = i1*p.nb11 + i0/2; idst += rope_data_i[i2].x * p.set_rows_stride; } + idst += p.d_offset; if (i0 >= p.n_dims) { rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]); @@ -176,7 +179,7 @@ void rope_vision(const uint i0, const uint i1, const uint i2, const uint i3, rop return; } - const uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13; + const uint idst = p.d_offset + i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13; const uint ix = rope_a_coord(i0/2, i1, i2, i3, p); const int sect_dims = p.sections[0] + p.sections[1]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl index 2e2a7e14c66..3602485b943 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl @@ -26,6 +26,9 @@ struct rope_params { uint nb11; uint nb12; uint nb13; + + uint a_offset; + uint d_offset; }; #endif // !defined(GGML_ROPE_PARAMS) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp new file mode 100644 index 00000000000..8585538cbb0 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp @@ -0,0 +1,49 @@ +#version 450 + +#include "types.glsl" + +// Fused snake activation: y = x + sin(b * x)^2 * c +// data_a [ne0, ne1] per element activation x (A_TYPE) +// data_b [1, ne1] per channel multiplier (float) +// data_c [1, ne1] per channel inverse scale (float, precomputed as 1 / freq) +// data_d [ne0, ne1] output y (D_TYPE) +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {float data_b[];}; +layout (binding = 2) readonly buffer C {float data_c[];}; +layout (binding = 3) writeonly buffer D {D_TYPE data_d[];}; + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (push_constant) uniform parameter { + uint32_t ne0; + uint32_t ne1; +} p; + +// Load A_TYPE to float +float load_val(uint32_t idx) { +#if defined(DATA_A_BF16) + return bf16_to_fp32(uint32_t(data_a[idx])); +#else + return float(data_a[idx]); +#endif +} + +// Store float as D_TYPE +void store_val(uint32_t idx, float v) { +#if defined(DATA_D_BF16) + data_d[idx] = D_TYPE(fp32_to_bf16(v)); +#else + data_d[idx] = D_TYPE(v); +#endif +} + +void main() { + const uint32_t i0 = gl_GlobalInvocationID.x; + const uint32_t i1 = gl_GlobalInvocationID.y; + if (i0 >= p.ne0 || i1 >= p.ne1) return; + + const uint32_t idx = i0 + i1 * p.ne0; + const float xi = load_val(idx); + const float s = sin(data_b[i1] * xi); + store_val(idx, xi + s * s * data_c[i1]); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp b/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp index 6802b1fc955..4cd9b8da359 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp @@ -6,12 +6,15 @@ layout(constant_id = 0) const uint BLOCK_SIZE = 32; layout(constant_id = 1) const uint TOKENS_PER_WG = 16; +layout(constant_id = 2) const bool APPLY_BIAS = false; +layout(constant_id = 3) const bool APPLY_SILU = false; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in; layout(binding = 0) readonly buffer Src0 { float src0[]; }; layout(binding = 1) readonly buffer Src1 { float src1[]; }; -layout(binding = 2) buffer Dst { float dst[]; }; +layout(binding = 2) readonly buffer Bias { float bias[]; }; +layout(binding = 3) buffer Dst { float dst[]; }; layout(push_constant) uniform PushConstants { uint nb01; uint nb02; @@ -45,6 +48,13 @@ void main() { } } + if (APPLY_BIAS) { + sum += bias[i1]; + } + if (APPLY_SILU) { + sum = sum / (1.0f + exp(-sum)); + } + const uint dst_idx = i3 * (dst_nb2 / 4) + i2 * (dst_nb1 / 4) + i1; dst[dst_idx] = sum; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl index 4bcd97756fd..f84d6f87334 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl @@ -31,6 +31,7 @@ #else #define A_TYPE float16_t #endif +#define A_TYPE_PACKED32 f16vec2 #endif #if defined(DATA_A_BF16) @@ -44,6 +45,7 @@ #else #define A_TYPE uint16_t #endif +#define A_TYPE_PACKED32 uint32_t #endif #define QUANT_K_Q4_0 32 @@ -1722,11 +1724,18 @@ struct block_nvfp4 uint8_t qs[QUANT_K_NVFP4 / 2]; }; +struct block_nvfp4_packed32 +{ + uint32_t d[QUANT_K_NVFP4 / 16 / 4]; + uint32_t qs[QUANT_K_NVFP4 / 2 / 4]; +}; + #if defined(DATA_A_NVFP4) #define QUANT_K QUANT_K_NVFP4 #define QUANT_R QUANT_R_NVFP4 #define QUANT_AUXF 1 #define A_TYPE block_nvfp4 +#define A_TYPE_PACKED32 block_nvfp4_packed32 #endif #if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index d99b2b5d802..de7dbec2c63 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -662,6 +662,28 @@ void process_shaders() { } } + const std::map fa_bf16_dict = { + {"FLOAT_TYPE", "bfloat16_t"}, + {"FLOAT_TYPEV2", "bf16vec2"}, + {"FLOAT_TYPEV4", "bf16vec4"}, + {"ACC_TYPE", "float"}, + {"ACC_TYPEV2", "vec2"}, + {"ACC_TYPEV4", "vec4"}, + {"BFLOAT16", "1"}, + }; + +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm1.comp", + merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}), + true, true, false, false); +#endif + +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + string_to_spv("flash_attn_f32_f16_bf16", "flash_attn_cm2.comp", + merge_maps(fa_bf16_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}), + true, false, true, false); +#endif + std::map base_dict = {{"FLOAT_TYPE", "float"}, {"FLOAT_TYPEV2", "vec2"}}; for (const auto& tname : type_names) { @@ -731,6 +753,7 @@ void process_shaders() { string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); string_to_spv("cpy_f16_f32", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); string_to_spv("cpy_f32_bf16","copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}}); + string_to_spv("cpy_bf16_f32","copy.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "float"}, {"DATA_A_BF16", "1"}}); string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("contig_cpy_f32_i32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}}); string_to_spv("contig_cpy_i32_f32", "contig_copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}}); @@ -738,6 +761,7 @@ void process_shaders() { string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); string_to_spv("contig_cpy_f16_f32", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); string_to_spv("contig_cpy_f32_bf16","contig_copy.comp",{{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}}); + string_to_spv("contig_cpy_bf16_f32","contig_copy.comp",{{"A_TYPE", "uint16_t"}, {"D_TYPE", "float"}, {"DATA_A_BF16", "1"}}); string_to_spv("cpy_f32_i32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}}); string_to_spv("cpy_i32_f32", "copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}}); @@ -796,9 +820,11 @@ void process_shaders() { string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); - string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("repeat_i32", "repeat.comp", {{"A_TYPE", "int32_t"}, {"D_TYPE", "int32_t"}}); string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("repeat_i16", "repeat.comp", {{"A_TYPE", "int16_t"}, {"D_TYPE", "int16_t"}}); + string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); @@ -930,6 +956,7 @@ void process_shaders() { string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}})); string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("fwht_f32", "fwht.comp", {}); string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}})); string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); @@ -950,6 +977,10 @@ void process_shaders() { string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("snake_bf16", "snake.comp", {{"DATA_A_BF16", "1"}, {"DATA_D_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}}); + string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); @@ -978,8 +1009,16 @@ void process_shaders() { string_to_spv(name + (unroll ? "_unroll" : ""), "conv2d_mm.comp", defines); #if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (unroll) { - defines["COOPMAT2"] = "1"; - string_to_spv(name, "conv2d_mm.comp", defines, true, false, true); + auto cm2_defines = defines; + cm2_defines["COOPMAT2"] = "1"; + string_to_spv(name, "conv2d_mm.comp", cm2_defines, true, false, true); + } +#endif +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (unroll) { + auto cm1_defines = defines; + cm1_defines["COOPMAT"] = "1"; + string_to_spv(name, "conv2d_mm.comp", cm1_defines, true, true, false); } #endif } diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp index 4c4eda1cbe5..f4c5eca0df5 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp @@ -52,7 +52,7 @@ #define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4 #define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 4 -// default size for legacy matrix multiplication +// default size for reg-tile matrix multiplication #define WEBGPU_MUL_MAT_WG_SIZE 256 // Same hash combine function as in boost @@ -84,15 +84,17 @@ struct ggml_webgpu_shader_lib_context { ggml_tensor * src5; ggml_tensor * dst; - uint32_t max_wg_size; - size_t wg_mem_limit_bytes = 0; - bool supports_subgroups = false; - bool supports_subgroup_matrix = false; - uint32_t sg_mat_m = 0; - uint32_t sg_mat_n = 0; - uint32_t sg_mat_k = 0; - uint32_t min_subgroup_size = 0; - uint32_t max_subgroup_size = 0; + uint32_t max_wg_size; + size_t wg_mem_limit_bytes = 0; + bool supports_subgroups = false; + bool supports_subgroup_matrix = false; + uint32_t sg_mat_m = 0; + uint32_t sg_mat_n = 0; + uint32_t sg_mat_k = 0; + uint32_t min_subgroup_size = 0; + uint32_t max_subgroup_size = 0; + bool supports_dot_product = false; + std::string vendor; }; struct webgpu_pipeline { @@ -164,9 +166,11 @@ struct ggml_webgpu_set_rows_pipeline_key { int dst_type; int vec4; int i64_idx; + int pair_blocks; bool operator==(const ggml_webgpu_set_rows_pipeline_key & other) const { - return dst_type == other.dst_type && vec4 == other.vec4 && i64_idx == other.i64_idx; + return dst_type == other.dst_type && vec4 == other.vec4 && i64_idx == other.i64_idx && + pair_blocks == other.pair_blocks; } }; @@ -176,6 +180,7 @@ struct ggml_webgpu_set_rows_pipeline_key_hash { ggml_webgpu_hash_combine(seed, key.dst_type); ggml_webgpu_hash_combine(seed, key.vec4); ggml_webgpu_hash_combine(seed, key.i64_idx); + ggml_webgpu_hash_combine(seed, key.pair_blocks); return seed; } }; @@ -183,6 +188,7 @@ struct ggml_webgpu_set_rows_pipeline_key_hash { struct ggml_webgpu_set_rows_shader_decisions { bool vec4; bool i64_idx; + bool pair_blocks; uint32_t wg_size; }; @@ -770,31 +776,30 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions( (v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u); const bool kv_vec_type_supported = K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0; - const uint32_t kv_vec_head_align = K->type == GGML_TYPE_F16 ? GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH : - (uint32_t) ggml_blck_size(K->type); - const bool kv_vec_head_dims_aligned = context.src0->ne[0] % kv_vec_head_align == 0 && - context.src2->ne[0] % kv_vec_head_align == 0; + const uint32_t kv_vec_head_align = + K->type == GGML_TYPE_F16 ? GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH : (uint32_t) ggml_blck_size(K->type); + const bool kv_vec_head_dims_aligned = + context.src0->ne[0] % kv_vec_head_align == 0 && context.src2->ne[0] % kv_vec_head_align == 0; // Compile with enough invocations to cover the largest reported subgroup. - const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && - kv_vec_head_dims_aligned && kv_vec_type_supported && - (K->type != GGML_TYPE_F16 || f16_vec4_aligned) && + const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && kv_vec_head_dims_aligned && + kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) && (context.src2->type == K->type); const bool tile_can_dispatch_all_q_rows = context.max_subgroup_size > 0 && context.max_wg_size >= GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE * context.max_subgroup_size; - const bool use_subgroup_matrix = - context.supports_subgroup_matrix && context.sg_mat_k > 0 && context.sg_mat_n > 0 && - context.src0->ne[0] % context.sg_mat_k == 0 && context.src2->ne[0] % context.sg_mat_n == 0; + const bool use_subgroup_matrix = context.supports_subgroup_matrix && context.sg_mat_k > 0 && context.sg_mat_n > 0 && + context.src0->ne[0] % context.sg_mat_k == 0 && + context.src2->ne[0] % context.sg_mat_n == 0; const bool use_tile = context.supports_subgroups && !use_subgroup_matrix && K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16 && f16_vec4_aligned && (context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) && (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) && tile_can_dispatch_all_q_rows && !use_vec; - decisions.path = use_vec ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC : - use_tile ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE : - use_subgroup_matrix ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX : - GGML_WEBGPU_FLASH_ATTN_PATH_NONE; + decisions.path = use_vec ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC : + use_tile ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE : + use_subgroup_matrix ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX : + GGML_WEBGPU_FLASH_ATTN_PATH_NONE; if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_NONE) { return decisions; @@ -850,31 +855,15 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions( /** Matrix Multiplication **/ -struct ggml_webgpu_legacy_mul_mat_pipeline_key { - ggml_type src0_type; - ggml_type src1_type; - - bool operator==(const ggml_webgpu_legacy_mul_mat_pipeline_key & other) const { - return src0_type == other.src0_type && src1_type == other.src1_type; - } -}; - -struct ggml_webgpu_legacy_mul_mat_pipeline_key_hash { - size_t operator()(const ggml_webgpu_legacy_mul_mat_pipeline_key & key) const { - size_t seed = 0; - ggml_webgpu_hash_combine(seed, key.src0_type); - ggml_webgpu_hash_combine(seed, key.src1_type); - return seed; - } -}; - struct ggml_webgpu_mul_mat_vec_pipeline_key { ggml_type src0_type; ggml_type src1_type; int vectorized; + bool use_mmvq; bool operator==(const ggml_webgpu_mul_mat_vec_pipeline_key & other) const { - return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized; + return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized && + use_mmvq == other.use_mmvq; } }; @@ -884,6 +873,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash { ggml_webgpu_hash_combine(seed, key.src0_type); ggml_webgpu_hash_combine(seed, key.src1_type); ggml_webgpu_hash_combine(seed, key.vectorized); + ggml_webgpu_hash_combine(seed, key.use_mmvq); return seed; } }; @@ -894,6 +884,20 @@ struct ggml_webgpu_mul_mat_vec_shader_decisions { uint32_t vec_size; }; +struct ggml_webgpu_quantize_q8_pipeline_key { + ggml_type src0_type; + + bool operator==(const ggml_webgpu_quantize_q8_pipeline_key & other) const { return src0_type == other.src0_type; } +}; + +struct ggml_webgpu_quantize_q8_pipeline_key_hash { + size_t operator()(const ggml_webgpu_quantize_q8_pipeline_key & key) const { + size_t seed = 0; + ggml_webgpu_hash_combine(seed, key.src0_type); + return seed; + } +}; + struct ggml_webgpu_mul_mat_pipeline_key { ggml_type src0_type; ggml_type src1_type; @@ -1051,6 +1055,36 @@ struct ggml_webgpu_soft_max_pipeline_key_hash { } }; +/** MMVQ **/ + +inline bool ggml_webgpu_can_use_mmvq(const ggml_tensor * src0, + const ggml_tensor * src1, + bool supports_dot_product, + const std::string & vendor) { + if (src1->ne[1] == 1) { + bool supports_dp4a = vendor == "amd" || vendor == "intel" || vendor == "nvidia"; + if (supports_dp4a && supports_dot_product) { + switch (src1->type) { + case GGML_TYPE_F32: + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q4_K: + return src0->ne[0] % 4 == 0; + default: + break; + } + break; + default: + break; + } + } + } + return false; +} + class ggml_webgpu_shader_lib { wgpu::Device device; pre_wgsl::Preprocessor preprocessor; @@ -1099,14 +1133,12 @@ class ggml_webgpu_shader_lib { webgpu_pipeline, ggml_webgpu_flash_attn_blk_pipeline_key_hash> flash_attn_blk_pipelines; - std::unordered_map - mul_mat_legacy_pipelines; // legacy mul_mat (non-subgroup/non-regtile/non-vec) std::unordered_map - mul_mat_vec_pipelines; // fast mat-vec (n==1) + mul_mat_vec_pipelines; // fast mat-vec (n==1) std::unordered_map - mul_mat_fast_pipelines; // fast mat-mat (reg-tile or subgroup) + mul_mat_fast_pipelines; // fast mat-mat (reg-tile or subgroup) + std::unordered_map + quantize_q8_pipelines; std::unordered_map mul_mat_id_gather_pipelines; // key is fixed std::unordered_map mul_mat_id_pipelines; // src0_type/src1_type @@ -1235,10 +1267,13 @@ class ggml_webgpu_shader_lib { } webgpu_pipeline get_set_rows_pipeline(const ggml_webgpu_shader_lib_context & context) { - ggml_webgpu_set_rows_pipeline_key key = {}; - key.dst_type = context.dst->type; - key.vec4 = context.src0->ne[0] % 4 == 0; - key.i64_idx = context.src1->type == GGML_TYPE_I64; + const bool quantized = ggml_is_quantized(context.dst->type); + ggml_webgpu_set_rows_pipeline_key key = {}; + key.dst_type = context.dst->type; + key.vec4 = + (context.dst->type == GGML_TYPE_F32 || context.dst->type == GGML_TYPE_F16) && context.src0->ne[0] % 4 == 0; + key.i64_idx = context.src1->type == GGML_TYPE_I64; + key.pair_blocks = quantized && ((context.src0->ne[0] / ggml_blck_size(context.dst->type)) % 2 == 0); auto it = set_rows_pipelines.find(key); if (it != set_rows_pipelines.end()) { @@ -1257,6 +1292,14 @@ class ggml_webgpu_shader_lib { defines.push_back("DST_F16"); variant += "_dstf16"; break; + case GGML_TYPE_Q8_0: + defines.push_back("DST_Q8_0"); + variant += "_dstq8_0"; + break; + case GGML_TYPE_Q4_0: + defines.push_back("DST_Q4_0"); + variant += "_dstq4_0"; + break; default: GGML_ABORT("Unsupported dst type for set_rows shader"); } @@ -1269,13 +1312,19 @@ class ggml_webgpu_shader_lib { defines.push_back("I64_IDX"); variant += "_i64idx"; } + if (key.pair_blocks) { + defines.push_back("PAIR_BLOCKS"); + variant += "_pair_blocks"; + } defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size)); - auto processed = preprocessor.preprocess(wgsl_set_rows, defines); - auto decisions = std::make_shared(); + const auto & shader_source = quantized ? wgsl_set_rows_quant : wgsl_set_rows; + auto processed = preprocessor.preprocess(shader_source, defines); + auto decisions = std::make_shared(); decisions->vec4 = key.vec4; decisions->i64_idx = key.i64_idx; + decisions->pair_blocks = key.pair_blocks; decisions->wg_size = context.max_wg_size; set_rows_pipelines[key] = ggml_webgpu_create_pipeline(device, processed, variant); set_rows_pipelines[key].context = decisions; @@ -1744,6 +1793,44 @@ class ggml_webgpu_shader_lib { return pad_pipelines[key]; } + webgpu_pipeline get_quantize_q8_pipeline(const ggml_webgpu_shader_lib_context & context) { + ggml_webgpu_quantize_q8_pipeline_key key = {}; + key.src0_type = context.src0->type; + + auto it = quantize_q8_pipelines.find(key); + if (it != quantize_q8_pipelines.end()) { + return it->second; + } + const char * shader_src = wgsl_quantize_q8; + std::vector defines; + std::string variant = "quantize_q8"; + + uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE; + + defines.push_back("SRC1_INNER_TYPE=f32"); + defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size)); + + const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type); + std::string src0_name = src0_traits->type_name; + std::string type_upper = src0_name; + variant += "_" + src0_name; + std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper); + + defines.push_back("MUL_ACC_" + type_upper); + defines.push_back("Q8_1_T"); + + defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION"); + variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce"; + + auto processed = preprocessor.preprocess(shader_src, defines); + auto decisions = std::make_shared(); + decisions->wg_size = wg_size; + webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant); + pipeline.context = decisions; + quantize_q8_pipelines[key] = pipeline; + return quantize_q8_pipelines[key]; + } + webgpu_pipeline get_mul_mat_vec_pipeline(const ggml_webgpu_shader_lib_context & context) { ggml_webgpu_mul_mat_vec_pipeline_key key = {}; key.src0_type = context.src0->type; @@ -1752,6 +1839,8 @@ class ggml_webgpu_shader_lib { (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ? 1 : 0; + key.use_mmvq = + ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor); auto it = mul_mat_vec_pipelines.find(key); if (it != mul_mat_vec_pipelines.end()) { @@ -1788,6 +1877,19 @@ class ggml_webgpu_shader_lib { defines.push_back("U32_DEQUANT_HELPERS"); defines.push_back("SRC0_INNER_TYPE=u32"); switch (context.src0->type) { + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + if (key.use_mmvq) { + defines.push_back("LEGACY_QUANTS"); + } + break; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q4_K: + if (key.use_mmvq) { + defines.push_back("K_QUANTS"); + } + break; case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ2_S: @@ -1840,6 +1942,11 @@ class ggml_webgpu_shader_lib { outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG; } + if (key.use_mmvq) { + defines.push_back("MMVQ"); + defines.push_back("Q8_1_T"); + } + defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size)); defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg)); defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION"); @@ -2018,100 +2125,6 @@ class ggml_webgpu_shader_lib { return mul_mat_fast_pipelines[key]; } - webgpu_pipeline get_mul_mat_legacy_pipeline(const ggml_webgpu_shader_lib_context & context) { - ggml_webgpu_legacy_mul_mat_pipeline_key key = {}; - key.src0_type = context.src0->type; - key.src1_type = context.src1->type; - - auto it = mul_mat_legacy_pipelines.find(key); - if (it != mul_mat_legacy_pipelines.end()) { - return it->second; - } - - std::vector defines; - std::string variant = "mul_mat"; - - switch (context.src1->type) { - case GGML_TYPE_F32: - defines.push_back("SRC1_TYPE=f32"); - variant += "_f32"; - break; - case GGML_TYPE_F16: - defines.push_back("SRC1_TYPE=f16"); - variant += "_f16"; - break; - default: - GGML_ABORT("Unsupported src1 type for mul_mat legacy shader"); - } - - const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type); - const char * src0_name = src0_traits->type_name; - - switch (context.src0->type) { - case GGML_TYPE_F32: - defines.push_back("SRC0_TYPE=f32"); - defines.push_back("FLOAT"); - variant += "_f32"; - break; - case GGML_TYPE_F16: - defines.push_back("SRC0_TYPE=f16"); - defines.push_back("FLOAT"); - variant += "_f16"; - break; - default: - { - std::string type_upper = src0_name; - std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper); - - switch (context.src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_MXFP4: - { - // Quantized types using u32 buffers for portability. - defines.push_back("SRC0_TYPE=u32"); - defines.push_back("U32_DEQUANT_HELPERS"); - break; - } - default: - { - defines.push_back(std::string("SRC0_TYPE=") + src0_name); - } - } - - defines.push_back("BYTE_HELPERS"); - defines.push_back(type_upper + "_T"); - defines.push_back(type_upper); - defines.push_back(type_upper + "_SCALE_MIN"); - defines.push_back(type_upper + "_TABLES"); - defines.push_back(type_upper + "_GRID"); - - variant += std::string("_") + src0_name; - break; - } - } - - auto processed = preprocessor.preprocess(wgsl_mul_mat, defines); - - auto decisions = std::make_shared(); - decisions->wg_size = WEBGPU_MUL_MAT_WG_SIZE; - - webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant); - pipeline.context = decisions; - mul_mat_legacy_pipelines[key] = pipeline; - return mul_mat_legacy_pipelines[key]; - } - webgpu_pipeline get_mul_mat_id_gather_pipeline(const ggml_webgpu_shader_lib_context & context) { auto it = mul_mat_id_gather_pipelines.find(1); if (it != mul_mat_id_gather_pipelines.end()) { diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 78cb02be06d..d577b5afa3c 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -94,14 +94,6 @@ static inline uint32_t ggml_webgpu_u32_from_f32(float value) { #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4 #define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4 -// For operations which process a row in parallel, this seems like a reasonable -// default -#define WEBGPU_ROW_SPLIT_WG_SIZE 64 - -// Track https://github.com/gpuweb/gpuweb/issues/5315 for fixes to -// implementations so this can be removed, necessary only for get_rows right now -#define WEBGPU_MAX_WG_SIZE 288 - /* End Constants */ // This is a "fake" base pointer, since WebGPU buffers do not have pointers to @@ -181,6 +173,7 @@ struct webgpu_capabilities { wgpu::Limits limits; bool supports_subgroups = false; bool supports_subgroup_matrix = false; + bool supports_dot_product = false; uint32_t sg_mat_m = 0; uint32_t sg_mat_n = 0; @@ -210,6 +203,8 @@ struct webgpu_global_context_struct { wgpu::Buffer memset_params_buf; webgpu_pipeline memset_pipeline; + std::string vendor; + // TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050 #ifdef GGML_WEBGPU_CPU_PROFILE // Profiling: labeled CPU time in ms (total) @@ -259,6 +254,7 @@ struct webgpu_context_struct { wgpu::Buffer set_rows_host_error_buf; wgpu::CommandEncoder active_command_encoder; wgpu::ComputePassEncoder active_compute_pass; + bool batch_compute_passes = true; size_t memset_bytes_per_thread; @@ -590,9 +586,18 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context & } #else for (size_t i = 0; i < dispatches.size(); i++) { - ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline); - ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]); - ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1); + if (ctx->batch_compute_passes) { + ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline); + ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]); + ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, + 1); + } else { + wgpu::ComputePassEncoder pass = ctx->active_command_encoder.BeginComputePass(); + pass.SetPipeline(dispatches[i].pipeline.pipeline); + pass.SetBindGroup(0, bind_groups[i]); + pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1); + pass.End(); + } } #endif @@ -618,7 +623,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx, size_t size) { std::vector params = { (uint32_t) offset, (uint32_t) size, value }; std::vector entries = { ggml_webgpu_make_bind_group_entry(0, buf, 0, buf.GetSize()) }; - size_t bytes_per_wg = WEBGPU_MAX_WG_SIZE * ctx->capabilities.memset_bytes_per_thread; + size_t bytes_per_wg = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.memset_bytes_per_thread; uint32_t wg_x = CEIL_DIV(size + 3, bytes_per_wg); ctx->queue.WriteBuffer(ctx->memset_params_buf, 0, params.data(), params.size() * sizeof(uint32_t)); @@ -736,8 +741,11 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst), }; - uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); - return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); + uint32_t wg_x; + uint32_t wg_y; + uint32_t total_wg = CEIL_DIV(ne, decisions->wg_size); + compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); } static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx, @@ -961,9 +969,10 @@ static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx, auto * decisions = static_cast(pipeline.context.get()); + uint32_t wg_x; + uint32_t wg_y; uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size); - uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg); - uint32_t wg_y = CEIL_DIV(total_wg, wg_x); + compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y); return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); } @@ -1051,9 +1060,10 @@ static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx, auto * decisions = static_cast(pipeline.context.get()); + uint32_t wg_x; + uint32_t wg_y; uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size); - uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg); - uint32_t wg_y = CEIL_DIV(total_wg, wg_x); + compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y); return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); } @@ -1234,6 +1244,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, const uint32_t h = (uint32_t) src2->ne[1]; const uint32_t n_tokens = (uint32_t) src2->ne[2]; const uint32_t n_seqs = (uint32_t) src2->ne[3]; + const uint32_t K = (uint32_t) src5->ne[1]; const float scale = 1.0f / sqrtf((float) s_v); uint32_t scale_u32; memcpy(&scale_u32, &scale, sizeof(scale_u32)); @@ -1258,6 +1269,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, (uint32_t) src0->ne[1], (uint32_t) (src2->ne[3] / src0->ne[3]), + K, scale_u32, }; @@ -1319,7 +1331,11 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ct } uint32_t threads; - if (decisions->vec4) { + if (ggml_is_quantized(dst->type)) { + const uint32_t blocks_per_row = src->ne[0] / ggml_blck_size(dst->type); + threads = + (src->ne[1] * src->ne[2] * src->ne[3]) * (decisions->pair_blocks ? (blocks_per_row / 2) : blocks_per_row); + } else if (decisions->vec4) { threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4); } else { threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3]; @@ -1346,7 +1362,7 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx, shader_lib_ctx.src0 = src; shader_lib_ctx.src1 = nullptr; shader_lib_ctx.dst = dst; - shader_lib_ctx.max_wg_size = WEBGPU_MAX_WG_SIZE; + shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; webgpu_pipeline pipeline = ctx->shader_lib->get_get_rows_pipeline(shader_lib_ctx); auto * decisions = static_cast(pipeline.context.get()); @@ -1382,6 +1398,58 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } +static void ggml_webgpu_quantize_q8_dispatch(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + std::vector & dispatches) { + ggml_webgpu_shader_lib_context shader_lib_ctx = {}; + + shader_lib_ctx.src0 = src0; + shader_lib_ctx.src1 = src1; + shader_lib_ctx.dst = dst; + shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; + shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups; + + webgpu_pipeline qq8_pipeline = ctx->shader_lib->get_quantize_q8_pipeline(shader_lib_ctx); + + // quantize_q8 pipeline + const size_t dst_offset = ggml_webgpu_tensor_offset(dst); + const size_t q8_src1_align_offset = ROUNDUP_POW2( + dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment); + const size_t q8_src1_binding_size = + ROUNDUP_POW2(src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)), + WEBGPU_STORAGE_BUF_BINDING_MULT); + + std::vector q8_params = { + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), + (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), + (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), + (uint32_t) src1->ne[0], + (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3], + }; + + std::vector q8_entries = { + ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1), + ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), q8_src1_align_offset, q8_src1_binding_size) + }; + + auto q8_decisions = static_cast(qq8_pipeline.context.get()); + + uint32_t q8_wg_size = q8_decisions->wg_size; + uint32_t q8_wg_x = 1; + uint32_t q8_wg_y = 1; + const uint32_t wg_per_vec = (src0->ne[0] / 4 + (q8_wg_size - 1)) / q8_wg_size; + const uint32_t q8_total_wg = src1->ne[2] * src1->ne[3] * wg_per_vec; + const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; + compute_2d_workgroups(q8_total_wg, max_wg_per_dim, q8_wg_x, q8_wg_y); + + dispatches.push_back({ + qq8_pipeline, std::move(q8_params), std::move(q8_entries), { q8_wg_x, q8_wg_y } + }); +} + static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, @@ -1389,47 +1457,9 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, // Determine if this is a mat-vec operation bool is_vec = (dst->ne[1] == 1); - // Determine if we should use fast path - bool use_fast = false; - switch (src1->type) { - case GGML_TYPE_F16: - use_fast = (src0->type == GGML_TYPE_F16); - break; - case GGML_TYPE_F32: - // TODO: implement better mat-mat for k-quants, mat-vec for all k-quants except q6_K - switch (src0->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q6_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q1_0: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_MXFP4: - use_fast = true; - break; - default: - break; - } - break; - default: - break; - } + // use MMVQ path for mat-vec + bool use_mmvq = ggml_webgpu_can_use_mmvq(src0, src1, ctx->global_ctx->capabilities.supports_dot_product, + ctx->global_ctx->vendor); ggml_webgpu_shader_lib_context shader_lib_ctx = {}; @@ -1444,16 +1474,20 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, shader_lib_ctx.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k; shader_lib_ctx.min_subgroup_size = ctx->global_ctx->capabilities.min_subgroup_size; shader_lib_ctx.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size; + shader_lib_ctx.supports_dot_product = ctx->global_ctx->capabilities.supports_dot_product; + shader_lib_ctx.vendor = ctx->global_ctx->vendor; // Get or create pipeline - webgpu_pipeline pipeline; + webgpu_pipeline pipeline; + std::vector dispatches; - if (use_fast && is_vec) { + if (is_vec) { + if (use_mmvq) { + ggml_webgpu_quantize_q8_dispatch(ctx, src0, src1, dst, dispatches); + } pipeline = ctx->shader_lib->get_mul_mat_vec_pipeline(shader_lib_ctx); - } else if (use_fast) { - pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx); } else { - pipeline = ctx->shader_lib->get_mul_mat_legacy_pipeline(shader_lib_ctx); + pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx); } // Build params @@ -1477,25 +1511,31 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, }; // Build bind group entries - std::vector entries = { - ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0), - ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1), - ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst), - }; + std::vector entries = {}; + + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0)); + if (use_mmvq) { + auto & mmvq_qq8_entry = dispatches[0].bind_group_entries[1]; + entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), mmvq_qq8_entry.offset, + mmvq_qq8_entry.size)); + } else { + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1)); + } + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst)); // Calculate workgroup dimensions uint32_t wg_x = 1; uint32_t wg_y = 1; const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; - if (use_fast && is_vec) { + if (is_vec) { auto * decisions = static_cast(pipeline.context.get()); uint32_t batches = dst->ne[2] * dst->ne[3]; uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg); uint32_t total_wg = output_groups * batches; compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y); - } else if (use_fast) { + } else { auto * decisions = static_cast(pipeline.context.get()); // Fast-path tiled/subgroup calculations @@ -1516,15 +1556,13 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, } uint32_t total_wg = wg_m * wg_n * dst->ne[2] * dst->ne[3]; compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y); - - } else { // legacy - auto * decisions = static_cast(pipeline.context.get()); - uint32_t wg_size = decisions->wg_size; - uint32_t total_wg = CEIL_DIV(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3], wg_size); - compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y); } - return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); + dispatches.push_back({ + pipeline, std::move(params), std::move(entries), { wg_x, wg_y } + }); + + return ggml_backend_webgpu_build_multi(ctx, dispatches); } static webgpu_encoded_op ggml_webgpu_mul_mat_id_vec(webgpu_context & ctx, @@ -1652,14 +1690,11 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, gathered_count_ids_binding_size), }; - const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; - - const uint32_t gather_total_wg = param_n_expert; - const uint32_t gather_wg_x = std::min(gather_total_wg, max_wg_per_dim); - const uint32_t gather_wg_y = CEIL_DIV(gather_total_wg, gather_wg_x); + // n_expert is much less than maxComputeWorkgroupsPerDimension (e.g., n_exeprt=256 at Qwen3.5-35B-A3B) + const uint32_t gather_wg_x = param_n_expert; dispatches.push_back({ - gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, gather_wg_y } + gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, 1 } }); // params for mul_mat_id.wgsl @@ -1711,7 +1746,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, uint32_t max_wg_n = CEIL_DIV(total_gathered, tile_n_s) + max_active_experts; uint32_t total_wg = wg_m * max_wg_n; - compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y); + compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y); dispatches.push_back({ main_pipeline, std::move(main_params), std::move(main_entries), { wg_x, wg_y } @@ -1954,10 +1989,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, std::vector reduce_entries; if (use_vec_reduce) { const uint32_t reduce_sg_size = ctx->global_ctx->capabilities.max_subgroup_size; - const uint32_t reduce_wg_size = - std::max(reduce_sg_size, (uint32_t) std::min( - (uint64_t) nwg * reduce_sg_size, - ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup)); + const uint32_t reduce_wg_size = std::max( + reduce_sg_size, + (uint32_t) std::min((uint64_t) nwg * reduce_sg_size, + ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup)); ggml_webgpu_shader_lib_context reduce_shader_ctx = shader_lib_ctx; reduce_shader_ctx.max_wg_size = reduce_wg_size; reduce_pipeline = ctx->shader_lib->get_flash_attn_vec_reduce_pipeline(reduce_shader_ctx); @@ -2734,10 +2769,12 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * block_size, npr, nrows }; - const uint32_t total_wg_init = npr * nrows; - const uint32_t max_wg = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; - const uint32_t wg_x_init = std::min(total_wg_init, max_wg); - const uint32_t wg_y_init = CEIL_DIV(total_wg_init, wg_x_init); + uint32_t wg_x_init; + uint32_t wg_y_init; + const uint32_t total_wg_init = npr * nrows; + const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; + compute_2d_workgroups(total_wg_init, max_wg_per_dim, wg_x_init, wg_y_init); + std::vector init_entries = { ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src), ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), init_align_offset, init_binding_size) @@ -2794,9 +2831,11 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(dst), align_out, size_out) }; + uint32_t wg_x_merge; + uint32_t wg_y_merge; const uint32_t total_wg_merge = nm * nrows; - const uint32_t wg_x_merge = std::min(total_wg_merge, max_wg); - const uint32_t wg_y_merge = CEIL_DIV(total_wg_merge, wg_x_merge); + compute_2d_workgroups(total_wg_merge, max_wg_per_dim, wg_x_merge, wg_y_merge); + dispatches.push_back({ argsort_merge_pipeline, std::move(merge_params), std::move(merge_entries), { wg_x_merge, wg_y_merge } }); @@ -2916,9 +2955,12 @@ static webgpu_encoded_op ggml_webgpu_upscale(webgpu_context ctx, ggml_tensor * s webgpu_pipeline pipeline = ctx->shader_lib->get_upscale_pipeline(shader_lib_ctx); auto * decisions = static_cast(pipeline.context.get()); - uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size); - uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg); - uint32_t wg_y = CEIL_DIV(total_wg, wg_x); + + uint32_t wg_x; + uint32_t wg_y; + uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size); + compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); } @@ -3108,18 +3150,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str uint32_t num_batched_kernels = 0; uint32_t num_inflight_batches = 0; bool contains_set_rows = false; - bool batch_compute_passes = true; int num_encoded_ops = 1; int node_idx = 0; #ifdef GGML_WEBGPU_GPU_PROFILE ctx->profile_timestamp_query_count = 0; - batch_compute_passes = false; std::vector profile_pipeline_names; #endif ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder(); - if (batch_compute_passes) { + if (ctx->batch_compute_passes) { ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass(); } @@ -3146,7 +3186,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str // reset state for next batch ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder(); - if (batch_compute_passes) { + if (ctx->batch_compute_passes) { ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass(); } ctx->param_arena.reset(); @@ -3546,8 +3586,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer const uint32_t kv_tile = decisions.kv_tile; const uint32_t vec_nwg_cap = ctx->webgpu_global_ctx->capabilities.min_subgroup_size; - uint32_t nwg = 1u; - const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile); + uint32_t nwg = 1u; + const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile); while ((2u * nwg * kv_span) < (uint64_t) K->ne[1] && nwg < vec_nwg_cap) { nwg <<= 1; } @@ -3580,6 +3620,22 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer } } break; + case GGML_OP_MUL_MAT: + { + const ggml_tensor * src0 = tensor->src[0]; + const ggml_tensor * src1 = tensor->src[1]; + bool use_mmvq = + ggml_webgpu_can_use_mmvq(src0, src1, ctx->webgpu_global_ctx->capabilities.supports_dot_product, + ctx->webgpu_global_ctx->vendor); + if (use_mmvq) { + const size_t q8_src1_size = + src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)); + res = ROUNDUP_POW2(res + q8_src1_size + + ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment, + WEBGPU_STORAGE_BUF_BINDING_MULT); + } + } + break; case GGML_OP_MUL_MAT_ID: { const ggml_tensor * src0 = tensor->src[0]; @@ -3656,19 +3712,19 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) { static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) { // we use the maximum workgroup size for the memset pipeline - size_t max_threads = WEBGPU_MAX_WG_SIZE * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; + size_t max_threads = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; // Size the bytes_per_thread so that the largest buffer size can be handled ctx->capabilities.memset_bytes_per_thread = CEIL_DIV(ctx->capabilities.limits.maxStorageBufferBindingSize, max_threads); std::vector constants(2); constants[0].key = "wg_size"; - constants[0].value = WEBGPU_MAX_WG_SIZE; + constants[0].value = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; constants[1].key = "bytes_per_thread"; constants[1].value = ctx->capabilities.memset_bytes_per_thread; ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants); } -static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { +static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { wgpu::RequestAdapterOptions options = {}; #ifndef __EMSCRIPTEN__ @@ -3705,12 +3761,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { ctx->webgpu_global_ctx->adapter.GetInfo(&info); ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size(); ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches(); - wgpu::SupportedFeatures features; - ctx->webgpu_global_ctx->adapter.GetFeatures(&features); - // we require f16 support - GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16)); + ctx->webgpu_global_ctx->vendor = info.vendor; ctx->webgpu_global_ctx->capabilities.supports_subgroups = ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups); + // for dot4I8packed + ctx->webgpu_global_ctx->capabilities.supports_dot_product = ctx->webgpu_global_ctx->instance.HasWGSLLanguageFeature( + wgpu::WGSLLanguageFeatureName::Packed4x8IntegerDotProduct); bool valid_subgroup_matrix_config = false; #ifndef __EMSCRIPTEN__ @@ -3817,7 +3873,6 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { "device_desc: %s\n", info.vendorID, std::string(info.vendor).c_str(), std::string(info.architecture).c_str(), info.deviceID, std::string(info.device).c_str(), std::string(info.description).c_str()); - return true; } static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) { @@ -3837,6 +3892,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) { wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf"); #ifdef GGML_WEBGPU_GPU_PROFILE + webgpu_ctx->batch_compute_passes = false; ggml_webgpu_create_buffer( webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf"); @@ -3989,8 +4045,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32); break; case GGML_OP_SET_ROWS: - supports_op = ((op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32) && src0->type == GGML_TYPE_F32 && - (src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32)); + supports_op = ((op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_Q8_0 || + op->type == GGML_TYPE_Q4_0) && + src0->type == GGML_TYPE_F32 && (src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32)); break; case GGML_OP_GET_ROWS: if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_webgpu_supported_qtype(src0->type)) { @@ -4445,7 +4502,12 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { UINT64_MAX); } - if (adapter != nullptr) { + // WebGPU backend requires f16 support and, on native, implicit device synchronization. + if (adapter != nullptr && adapter.HasFeature(wgpu::FeatureName::ShaderF16) +#ifndef __EMSCRIPTEN__ + && adapter.HasFeature(wgpu::FeatureName::ImplicitDeviceSynchronization) +#endif + ) { ctx->device_count = 1; } @@ -4453,8 +4515,11 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { } ggml_backend_t ggml_backend_webgpu_init(void) { - ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_webgpu_reg(), 0); - + ggml_backend_reg_t reg = ggml_backend_webgpu_reg(); + if (ggml_backend_reg_dev_count(reg) == 0) { + return nullptr; + } + ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0); return ggml_backend_webgpu_backend_init(dev, nullptr); } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl index 372ea79bf9d..758efa17d77 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl @@ -95,11 +95,10 @@ struct q5_1 { }; #endif - #ifdef Q8_1_T struct q8_1 { d: f16, - m: f16, + s: f16, // d * sum(qs[i]) qs: array }; #endif diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index fa3bdf4e393..67f1dc0928f 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -49,7 +49,9 @@ struct Params{ var params: Params; @compute @workgroup_size(WG_SIZE) -fn main(@builtin(global_invocation_id) gid: vec3) { +fn main( + @builtin(global_invocation_id) gid: vec3, +) { if (gid.x >= params.ne) { return; } @@ -78,4 +80,3 @@ fn main(@builtin(global_invocation_id) gid: vec3) { dst[params.offset_dst + dst_idx] = DST_TYPE((src[params.offset_src + src_idx])); } - diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl index ae8036b9ac5..4133f0ab564 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl @@ -122,9 +122,9 @@ const V_CHUNKS: u32 = HEAD_DIM_V / 4u; const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE; const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE; -var q_shmem: array; -var kv_shmem: array; -var p_shmem: array; +var q_shmem: array; +var kv_shmem: array; +var p_shmem: array; @compute @workgroup_size(WG_SIZE) fn main(@builtin(workgroup_id) wg_id: vec3, @@ -169,10 +169,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let head = f32(head_idx); let slope = select(1.0, - select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), - pow(params.m0, head + 1.0), - head < params.n_head_log2), - params.max_bias > 0.0); + select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), + pow(params.m0, head + 1.0), + head < params.n_head_log2), + params.max_bias > 0.0); for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) { let q_tile_row = elem_idx / HEAD_DIM_QK; @@ -181,7 +181,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1; q_shmem[elem_idx] = select( 0.0, - f32(Q[global_q_row_offset + q_col]) * params.scale, + Q_TYPE(Q[global_q_row_offset + q_col]) * params.scale, head_q_row < params.seq_len_q); } @@ -213,10 +213,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u; let k4 = K[k_vec_index]; let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - kv_shmem[kv_off + 0u] = f32(k4.x); - kv_shmem[kv_off + 1u] = f32(k4.y); - kv_shmem[kv_off + 2u] = f32(k4.z); - kv_shmem[kv_off + 3u] = f32(k4.w); + kv_shmem[kv_off + 0u] = KV_TYPE(k4.x); + kv_shmem[kv_off + 1u] = KV_TYPE(k4.y); + kv_shmem[kv_off + 2u] = KV_TYPE(k4.z); + kv_shmem[kv_off + 3u] = KV_TYPE(k4.w); } workgroupBarrier(); @@ -233,18 +233,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3, var dot_val = 0.0; for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) { let q_off = q_base + chunk * 4u; - let qv = vec4( + let qv = vec4( q_shmem[q_off + 0u], q_shmem[q_off + 1u], q_shmem[q_off + 2u], q_shmem[q_off + 3u]); let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - let kv = vec4( + let kv = vec4( kv_shmem[kv_off + 0u], kv_shmem[kv_off + 1u], kv_shmem[kv_off + 2u], kv_shmem[kv_off + 3u]); - dot_val += dot(qv, kv); + dot_val += dot(vec4(qv), vec4(kv)); } #ifdef LOGIT_SOFTCAP dot_val = params.logit_softcap * tanh(dot_val); @@ -271,7 +271,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let kv_local = sg_inv_id + slot * subgroup_size; if (row_active && kv_local < kv_count) { let p = exp(local_scores[slot] - new_max); - p_shmem[subgroup_p_offset + kv_local] = p; + p_shmem[subgroup_p_offset + kv_local] = KV_TYPE(p); local_sum += p; } } @@ -285,10 +285,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u; let v4 = V[v_vec_index]; let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - kv_shmem[kv_off + 0u] = f32(v4.x); - kv_shmem[kv_off + 1u] = f32(v4.y); - kv_shmem[kv_off + 2u] = f32(v4.z); - kv_shmem[kv_off + 3u] = f32(v4.w); + kv_shmem[kv_off + 0u] = KV_TYPE(v4.x); + kv_shmem[kv_off + 1u] = KV_TYPE(v4.y); + kv_shmem[kv_off + 2u] = KV_TYPE(v4.z); + kv_shmem[kv_off + 3u] = KV_TYPE(v4.w); } workgroupBarrier(); @@ -308,12 +308,12 @@ fn main(@builtin(workgroup_id) wg_id: vec3, for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) { let p = p_shmem[subgroup_p_offset + kv_local]; let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - let v4 = vec4( + let v4 = vec4( kv_shmem[kv_off + 0u], kv_shmem[kv_off + 1u], kv_shmem[kv_off + 2u], kv_shmem[kv_off + 3u]); - acc += p * v4; + acc += f32(p) * vec4(v4); } out_regs[reg_idx] = acc; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl index f9d98fda40b..d68520f8282 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl @@ -39,6 +39,7 @@ struct Params { neq1: u32, rq3: u32, + K: u32, scale: f32, }; @@ -62,11 +63,14 @@ fn main( let iq3 = seq_id / params.rq3; let state_size = S_V * S_V; - let state_base = (seq_id * params.h + head_id) * state_size; + let state_in_base = (seq_id * params.K * params.h + head_id) * state_size; + let state_out_base = (seq_id * params.h + head_id) * state_size; + let state_size_per_snap = state_size * params.h * params.n_seqs; + let shift = i32(params.n_tokens) - i32(params.K); var state: array; for (var i = 0u; i < S_V; i++) { - state[i] = src_state[state_base + col * S_V + i]; + state[i] = src_state[state_in_base + col * S_V + i]; } var attn_off = (seq_id * params.n_tokens * params.h + head_id) * S_V; @@ -123,10 +127,22 @@ fn main( dst[attn_off + col] = attn_col * params.scale; attn_off += S_V * params.h; + if (params.K > 1u) { + let target_slot = i32(t) - shift; + if (target_slot >= 0 && target_slot < i32(params.K)) { + let slot_base = params.s_off + u32(target_slot) * state_size_per_snap + state_out_base; + for (var i = 0u; i < S_V; i++) { + dst[slot_base + col * S_V + i] = state[i]; + } + } + } + workgroupBarrier(); } - for (var i = 0u; i < S_V; i++) { - dst[params.s_off + state_base + col * S_V + i] = state[i]; + if (params.K == 1u) { + for (var i = 0u; i < S_V; i++) { + dst[params.s_off + state_out_base + col * S_V + i] = state[i]; + } } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl deleted file mode 100644 index fcbefdeb802..00000000000 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ /dev/null @@ -1,747 +0,0 @@ -enable f16; - -#define DECLARE_BYTE_LOADERS_SRC0 -#include "common_decls.tmpl" - - -#ifdef FLOAT -const BLOCK_SIZE = 1u; - -#elif defined(Q4_0) || defined(Q4_1) || defined(Q5_0) || defined(Q5_1) || defined(Q8_0) || defined(Q8_1) || defined(IQ4_NL) -const BLOCK_SIZE = 32u; - -#elif defined(Q2_K) || defined(Q3_K) || defined(Q4_K) || defined(Q5_K) || defined(Q6_K) || defined(IQ2_XXS) || defined(IQ2_XS) || defined(IQ2_S) || defined(IQ3_XXS) || defined(IQ3_S) || defined(IQ1_S) || defined(IQ1_M) || defined(IQ4_XS) -const BLOCK_SIZE = 256u; -#endif - -#ifdef FLOAT -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - return f32(src0[src0_idx_base + offset]) * f32(src1[src1_idx_base + offset]); -} -#endif - -#ifdef Q4_0 -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 18; // Block stride: 18 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var sum: f32 = 0.0; - for (var j: u32 = 0; j < 4; j++) { - let q_byte_offset = block_byte_base + 2 + j * 4; - let q_packed = load_u32_at_src0(q_byte_offset); - for (var k: u32 = 0; k < 4; k++) { - let q_byte = get_byte(q_packed, k); - let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d; - let q_lo = (f32(q_byte & 0xF) - 8.0f) * d; - let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; - sum += q_lo * f32(src1[src1_offset]); - sum += q_hi * f32(src1[src1_offset + 16]); - } - } - return sum; -} -#endif - -#ifdef Q4_1 -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_q4_1 = src0[src0_idx_base + offset]; - let d = f32(block_q4_1.d); - let m = f32(block_q4_1.m); - var sum: f32 = 0.0; - for (var j: u32 = 0; j < 4; j++) { - let q_packed = block_q4_1.qs[j]; - for (var k: u32 = 0; k < 4; k++) { - let q_byte = get_byte(q_packed, k); - let q_hi = f32((q_byte >> 4) & 0xF) * d + m; - let q_lo = f32(q_byte & 0xF) * d + m; - let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; - sum += q_lo * f32(src1[src1_offset]); - sum += q_hi * f32(src1[src1_offset + 16]); - } - } - return sum; -} -#endif - -#ifdef Q5_0 -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 22; // Block stride: 22 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var sum: f32 = 0.0; - let qh_packed = load_u32_at_src0(block_byte_base + 2); - for (var j: u32 = 0; j < 4; j++) { - let q_byte_offset = block_byte_base + 6 + j * 4; - let q_packed = load_u32_at_src0(q_byte_offset); - for (var k: u32 = 0; k < 4; k++) { - let q_byte = get_byte(q_packed, k); - let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10; - let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d; - let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10; - let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d; - let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; - sum += q_lo * f32(src1[src1_offset]); - sum += q_hi * f32(src1[src1_offset + 16]); - } - } - return sum; -} -#endif - -#ifdef Q5_1 -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_q5_1 = src0[src0_idx_base + offset]; - let d = f32(block_q5_1.d); - let m = f32(block_q5_1.m); - var sum: f32 = 0.0; - for (var j: u32 = 0; j < 4; j++) { - let q_packed = block_q5_1.qs[j]; - for (var k: u32 = 0; k < 4; k++) { - let q_byte = get_byte(q_packed, k); - let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10; - let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m; - let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10; - let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m; - let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; - sum += q_lo * f32(src1[src1_offset]); - sum += q_hi * f32(src1[src1_offset + 16]); - } - } - return sum; -} -#endif - -#ifdef Q8_0 -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 34; // Block stride: 34 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var sum: f32 = 0.0; - for (var j: u32 = 0; j < 8; j++) { - let q_byte_offset = block_byte_base + 2 + j * 4; - let q_packed = load_u32_at_src0(q_byte_offset); - for (var k: u32 = 0u; k < 4u; k++) { - let q_byte = get_byte_i32(q_packed, k); - let q_val = f32(q_byte) * d; - let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; - sum += q_val * f32(src1[src1_offset]); - } - } - return sum; -} -#endif - -#ifdef Q8_1 -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_q8_1 = src0[src0_idx_base + offset]; - let d = f32(block_q8_1.d); - let m = f32(block_q8_1.m); - var sum: f32 = 0.0; - for (var j: u32 = 0; j < 8; j++) { - let q_packed = block_q8_1.qs[j]; - for (var k: u32 = 0; k < 4; k++) { - let q_byte = get_byte_i32(q_packed, k); - let q_val = f32(q_byte) * d + m; - let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; - sum += q_val * f32(src1[src1_offset]); - } - } - return sum; -} -#endif - -#ifdef Q2_K -// 16 blocks of 16 elements each -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block = src0[src0_idx_base + offset]; - let d = f32(block.d); - let m = f32(block.dmin); - var sum = 0.0; - var src1_i = src1_idx_base + offset * 256; - var is: u32 = 0; - // 2 halves of the block (128 elements each) - for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { - // 4 groups (each group has 2 blocks of 16 elements) - for (var shift: u32 = 0; shift < 8; shift += 2) { - // 2 blocks - for (var k: u32 = 0; k < 32; k += 16) { - let sc = get_byte(block.scales[is / 4], is % 4); - is++; - let dl = d * f32(sc & 0xF); - let ml = m * f32(sc >> 4); - for (var l: u32 = 0u; l < 16; l++) { - let q_idx = q_b_idx + k + l; - let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); - let qs_val = (q_byte >> shift) & 3; - sum += (f32(qs_val) * dl - ml) * src1[src1_i]; - src1_i++; - } - } - } - } - return sum; -} -#endif - -#ifdef Q3_K -// 16 blocks of 16 elements each -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 110; // Block stride: 110 bytes - - // Bytes 108-109: f16 scale 'd' - let d = load_f16_as_f32_at_src0(block_byte_base + 108); - - // extract 6-bit scales, which consist of 4-bits from first 8 bytes of scale, - // and 2-bits from the last 4 bytes - // Bytes 96-107: 12 bytes of scales (3 u32s) - let kmask1: u32 = 0x03030303; - let kmask2: u32 = 0x0f0f0f0f; - var scale_vals: array; - scale_vals[0] = load_u32_at_src0(block_byte_base + 96); - scale_vals[1] = load_u32_at_src0(block_byte_base + 100); - scale_vals[2] = load_u32_at_src0(block_byte_base + 104); - - var tmp: u32 = scale_vals[2]; - scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); - scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); - scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4); - scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); - - // Bytes 0-31: 32 bytes of hmask (8 u32s) - var hmask_vals: array; - for (var i: u32 = 0; i < 8; i++) { - hmask_vals[i] = load_u32_at_src0(block_byte_base + i * 4); - } - - // Bytes 32-95: 64 bytes of qs (16 u32s) - var qs_vals: array; - for (var i: u32 = 0u; i < 16; i++) { - qs_vals[i] = load_u32_at_src0(block_byte_base + 32 + i * 4); - } - - var sum = 0.0; - var src1_i = src1_idx_base + offset * 256; - var is: u32 = 0; - var m: u32 = 1; - // 2 halves of the block (128 elements each) - for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { - // 4 groups (each group has 2 blocks of 16 elements) - for (var shift: u32 = 0; shift < 8; shift += 2) { - // 2 blocks - for (var k: u32 = 0; k < 32; k += 16) { - let sc = get_byte(scale_vals[is / 4], is % 4); - is++; - let dl = d * (f32(sc) - 32.0); - for (var l: u32 = 0u; l < 16u; l++) { - let q_idx = q_b_idx + k + l; - let hm_idx = k + l; - let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4); - let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4); - let hm = select(4.0, 0.0, (hmask_byte & m) != 0); - let qs_val = (q_byte >> shift) & 3; - sum += ((f32(qs_val) - hm) * dl) * src1[src1_i]; - src1_i++; - } - } - m <<= 1; - } - } - return sum; -} -#endif - -#ifdef Q4_K -// 8 blocks of 32 elements each -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block = src0[src0_idx_base + offset]; - let d = f32(block.d); - let m = f32(block.dmin); - var sum = 0.0; - var src1_i = src1_idx_base + offset * 256; - var is: u32 = 0; - // 2 blocks each iteration - for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { - for (var shift: u32 = 0; shift < 8; shift += 4) { - let scale_min = get_scale_min(is, block.scales); - is++; - let dl = d * scale_min.x; - let ml = m * scale_min.y; - for (var l: u32 = 0; l < 32; l++) { - let q_idx = q_b_idx + l; - let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); - let qs_val = (q_byte >> shift) & 0xF; - sum += (f32(qs_val) * dl - ml) * src1[src1_i]; - src1_i++; - } - } - } - return sum; -} -#endif - -#ifdef Q5_K -// 8 blocks of 32 elements each -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block = src0[src0_idx_base + offset]; - let d = f32(block.d); - let m = f32(block.dmin); - var sum = 0.0; - var src1_i = src1_idx_base + offset * 256; - var is: u32 = 0; - var u: u32 = 1; - // 2 blocks each iteration - for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { - for (var shift: u32 = 0; shift < 8; shift += 4) { - let scale_min = get_scale_min(is, block.scales); - is++; - let dl = d * scale_min.x; - let ml = m * scale_min.y; - for (var l: u32 = 0; l < 32; l++) { - let q_idx = q_b_idx + l; - let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); - let qh_byte = get_byte(block.qh[l / 4], l % 4); - let qs_val = (q_byte >> shift) & 0xF; - let qh_val = select(0.0, 16.0, (qh_byte & u) != 0); - sum += ((f32(qs_val) + qh_val) * dl - ml) * src1[src1_i]; - src1_i++; - } - u <<= 1; - } - } - return sum; -} -#endif - -#ifdef Q6_K -// 16 blocks of 16 elements each -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 210; // Block stride: 210 bytes - - // Bytes 208-209: f16 scale 'd' - let d = load_f16_as_f32_at_src0(block_byte_base + 208); - - // Bytes 0-127: 128 bytes of ql (32 u32s) - var ql_vals: array; - for (var i: u32 = 0; i < 32; i++) { - ql_vals[i] = load_u32_at_src0(block_byte_base + i * 4); - } - - // Bytes 128-191: 64 bytes of qh (16 u32s) - var qh_vals: array; - for (var i: u32 = 0; i < 16; i++) { - qh_vals[i] = load_u32_at_src0(block_byte_base + 128 + i * 4); - } - - // Bytes 192-207: 16 bytes of scales (4 u32s) - var scale_vals: array; - for (var i: u32 = 0; i < 4; i++) { - scale_vals[i] = load_u32_at_src0(block_byte_base + 192 + i * 4); - } - - var sum = 0.0; - var src1_i = src1_idx_base + offset * 256; - var qh_b_idx: u32 = 0; - var sc_b_idx: u32 = 0; - for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) { - for (var l: u32 = 0; l < 32; l++) { - let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4); - let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4); - let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4); - - let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0; - let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0; - let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0; - let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0; - - let is = l/16; - let is1 = sc_b_idx + is; - let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4); - let is2 = sc_b_idx + is + 2; - let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4); - let is3 = sc_b_idx + is + 4; - let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4); - let is4 = sc_b_idx + is + 6; - let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4); - - sum += d * f32(sc1) * q1 * src1[src1_i + l]; - sum += d * f32(sc2) * q2 * src1[src1_i + l + 32]; - sum += d * f32(sc3) * q3 * src1[src1_i + l + 64]; - sum += d * f32(sc4) * q4 * src1[src1_i + l + 96]; - } - src1_i += 128; - qh_b_idx += 32; - sc_b_idx += 8; - } - return sum; -} -#endif - -#ifdef IQ2_XXS -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 66; // Block stride: 66 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var src1_i = src1_idx_base + offset * 256; - var sum = 0.0; - for (var ib: u32 = 0; ib < 32; ib += 4) { - let aux0_offset = block_byte_base + 2 + ib * 2; - let aux1_offset = block_byte_base + 2 + (ib + 2) * 2; - let aux0 = load_u32_at_src0(aux0_offset); - let aux1 = load_u32_at_src0(aux1_offset); - let db = d * (0.5 + f32(aux1 >> 28)) * 0.25; - for (var l: u32 = 0; l < 4; l++) { - let ig = get_byte(aux0, l) * 8; - let is = (aux1 >> (7 * l)) & 127; - let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); - for (var j: u32 = 0; j < 8; j++) { - let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); - sum += db * f32(g) * m * src1[src1_i]; - src1_i++; - } - } - } - return sum; -} -#endif - -#ifdef IQ2_XS -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 74; // Block stride: 74 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var src1_i = src1_idx_base + offset * 256; - - var scale_vals = array( - load_u32_at_src0(block_byte_base + 66), - load_u32_at_src0(block_byte_base + 70) - ); - - var sum = 0.0; - for (var ib: u32 = 0; ib < 32; ib += 4) { - let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4); - let db = array( - d * (0.5 + f32(s & 0xF)) * 0.25, - d * (0.5 + f32(s >> 4)) * 0.25 - ); - for (var l: u32 = 0; l < 4; l++) { - let qs_offset = block_byte_base + 2 + (ib + l) * 2; - let qs_val = load_u32_at_src0(qs_offset) & 0xFFFF; - let ig = (qs_val & 511) * 8; - let is = qs_val >> 9; - let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); - let dl = db[l/2]; - for (var j: u32 = 0; j < 8; j++) { - let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); - sum += dl * f32(g) * m * src1[src1_i]; - src1_i++; - } - } - } - return sum; -} -#endif - -#ifdef IQ2_S -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 82; // Block stride: 82 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var src1_i = src1_idx_base + offset * 256; - - var qs_vals : array; - for (var i: u32 = 0; i < 16; i++) { - qs_vals[i] = load_u32_at_src0(block_byte_base + 2 + i * 4); - } - - var qh_vals: array; - qh_vals[0] = load_u32_at_src0(block_byte_base + 66); - qh_vals[1] = load_u32_at_src0(block_byte_base + 70); - - var scale_vals: array; - scale_vals[0] = load_u32_at_src0(block_byte_base + 74); - scale_vals[1] = load_u32_at_src0(block_byte_base + 78); - - var sum = 0.0; - for (var ib: u32 = 0; ib < 8; ib ++) { - let s = get_byte(scale_vals[ib / 4], ib % 4); - let db = array( - d * (0.5 + f32(s & 0xF)) * 0.25, - d * (0.5 + f32(s >> 4)) * 0.25 - ); - let qs_w = qs_vals[ib]; - for (var l: u32 = 0; l < 4; l++) { - let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300; - let ig = (get_byte(qs_w, l) | qh_b) * 8; - let signs = get_byte(qs_vals[ib + 8], l); - let dl = db[l/2]; - for (var j: u32 = 0; j < 8; j++) { - let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); - sum += dl * f32(g) * m * src1[src1_i]; - src1_i++; - } - } - } - return sum; -} -#endif - -#ifdef IQ3_XXS -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 98; // Block stride: 98 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var src1_i = src1_idx_base + offset * 256; - var sum = 0.0; - for (var ib: u32 = 0; ib < 16; ib += 2) { - let sc_sign_offset = block_byte_base + 2 + (ib + 32) * 2; - let sc_sign = load_u32_at_src0(sc_sign_offset); - let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5; - for (var l: u32 = 0; l < 4; l++) { - let is = (sc_sign >> (7 * l)) & 127; - let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); - let ig_val = load_u32_at_src0(block_byte_base + 2 + (ib * 2 + l) * 2) & 0xFFFF; - let ig1 = get_byte(ig_val, 0); - let ig2 = get_byte(ig_val, 1); - for (var j: u32 = 0; j < 4; j++) { - let g1 = get_byte(iq3xxs_grid[ig1], j); - let g2 = get_byte(iq3xxs_grid[ig2], j); - let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); - let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); - sum += db * f32(g1) * m1 * src1[src1_i]; - sum += db * f32(g2) * m2 * src1[src1_i + 4]; - src1_i++; - } - src1_i += 4; - } - } - return sum; -} -#endif - -#ifdef IQ3_S -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 110; // Block stride: 110 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var src1_i = src1_idx_base + offset * 256; - - var qh_vals = array( - load_u32_at_src0(block_byte_base + 66), - load_u32_at_src0(block_byte_base + 70) - ); - - var sign_vals: array; - for (var i: u32 = 0; i < 8; i++) { - sign_vals[i] = load_u32_at_src0(block_byte_base + 74 + i * 4); - } - - var scale_vals = load_u32_at_src0(block_byte_base + 106); - - var sum = 0.0; - for (var ib: u32 = 0; ib < 4; ib++) { - let s = get_byte(scale_vals, ib); - let db = array( - d * (1.0 + 2.0 * f32(s & 0xF)), - d * (1.0 + 2.0 * f32(s >> 4)) - ); - for (var k: u32 = 0; k < 2; k++) { - let dl = db[k]; - let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k); - let sign_w = sign_vals[ib * 2 + k]; - for (var l: u32 = 0; l < 4; l++) { - let signs = get_byte(sign_w, l); - let ig_val = load_u32_at_src0(block_byte_base + 2 + (ib * 8 + k * 4 + l) * 2) & 0xFFFF; - let ig1 = get_byte(ig_val, 0) | ((qh_byte << ((8 - (2 * l)))) & 256); - let ig2 = get_byte(ig_val, 1) | ((qh_byte << ((7 - (2 * l)))) & 256); - for (var j: u32 = 0; j < 4; j++) { - let g1 = get_byte(iq3s_grid[ig1], j); - let g2 = get_byte(iq3s_grid[ig2], j); - let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); - let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); - sum += dl * f32(g1) * m1 * src1[src1_i]; - sum += dl * f32(g2) * m2 * src1[src1_i + 4]; - src1_i++; - } - src1_i += 4; - } - } - } - return sum; -} -#endif - -#ifdef IQ1_S -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 50; // Block stride: 50 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var src1_i = src1_idx_base + offset * 256; - var sum = 0.0; - for (var ib: u32 = 0; ib < 8; ib++) { - let qh = load_u32_at_src0(block_byte_base + 34 + ib * 2) & 0xFFFF; - let dl = d * (2.0 * f32((qh >> 12) & 7) + 1.0); - let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0); - let qs_w = load_u32_at_src0(block_byte_base + 2 + ib * 4); - for (var l: u32 = 0; l < 4; l++) { - let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8; - for (var j: u32 = 0; j < 8; j++) { - let gw = iq1_grid[(ig + j) / 16]; - let g = (gw >> (((ig + j) % 16) * 2)) & 3; - let gs = bitcast(g << 30) >> 30; - sum += dl * (f32(gs) + delta) * src1[src1_i]; - src1_i++; - } - } - } - return sum; -} -#endif - - -#ifdef IQ1_M -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block = src0[src0_idx_base + offset]; - - let scale = ((block.scales[0] >> 12) & 0xF) | ((block.scales[0] >> 24) & 0x00F0) | ((block.scales[1] >> 4) & 0x0F00) | ((block.scales[1] >> 16) & 0xF000); - let d = f32(bitcast>(scale).x); - var src1_i = src1_idx_base + offset * 256; - var sum = 0.0; - for (var ib: u32 = 0; ib < 8; ib++) { - let sw = (block.scales[ib / 4] >> (16 * ((ib / 2) % 2))) & 0xFFFF; - let s1 : u32 = (sw >> (6 * (ib % 2))) & 0x7; - let s2 : u32 = (sw >> (6 * (ib % 2) + 3)) & 0x7; - var dl = array( - d * f32(2 * s1 + 1), - d * f32(2 * s2 + 1) - ); - - let qh = block.qh[ib / 2] >> (16 * (ib % 2)); - var idx = array( - get_byte(block.qs[ib], 0) | ((qh << 8) & 0x700), - get_byte(block.qs[ib], 1) | ((qh << 4) & 0x700), - get_byte(block.qs[ib], 2) | ((qh) & 0x700), - get_byte(block.qs[ib], 3) | ((qh >> 4) & 0x700) - ); - var delta = array( - select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x08) != 0), - select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x80) != 0), - select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x08) != 0), - select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x80) != 0) - ); - for (var l: u32 = 0; l < 4; l++) { - let ig = idx[l] * 8; - for (var j: u32 = 0; j < 8; j++) { - let gw = iq1_grid[(ig + j) / 16]; - let g = (gw >> (((ig + j) % 16) * 2)) & 3; - let gs = bitcast(g << 30) >> 30; - sum += dl[l/2] * (f32(gs) + delta[l]) * src1[src1_i]; - src1_i++; - } - } - } - return sum; -} -#endif - -#ifdef IQ4_NL -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block_byte_base = (src0_idx_base + offset) * 18; // Block stride: 18 bytes - let d = load_f16_as_f32_at_src0(block_byte_base); - var src1_i = src1_idx_base + offset * 32; - var sum = 0.0; - var qs: array; - for (var i: u32 = 0; i < 4; i++) { - qs[i] = load_u32_at_src0(block_byte_base + 2 + i * 4); - } - for (var j: u32 = 0; j < 16; j++) { - let qsb = get_byte(qs[j / 4], j % 4); - sum += d * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i]; - sum += d * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16]; - src1_i++; - } - return sum; -} -#endif - -#ifdef IQ4_XS -fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { - let block = src0[src0_idx_base + offset]; - let d = unpack2x16float(block.d_scales_h)[0]; - let scales_h = block.d_scales_h >> 16; - var src1_i = src1_idx_base + offset * 256; - var sum = 0.0; - for (var ib: u32 = 0; ib < 8; ib++) { - let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4); - let dl = d * (f32(ls) - 32.0); - for (var j: u32 = 0; j < 16; j++) { - let iqs = ib * 16 + j; - let qsb = get_byte(block.qs[iqs / 4], iqs % 4); - sum += dl * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i]; - sum += dl * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16]; - src1_i++; - } - src1_i += 16; - } - return sum; -} -#endif - -struct MulMatParams { - offset_src0: u32, // in elements/blocks - offset_src1: u32, // in elements/blocks - offset_dst: u32, // in elements/blocks - m: u32, - n: u32, - k: u32, - // all strides are in elements/blocks - stride_01: u32, - stride_11: u32, - stride_02: u32, - stride_12: u32, - stride_03: u32, - stride_13: u32, - - bs02: u32, - bs03: u32, - broadcast2: u32, - broadcast3: u32 -}; - -@group(0) @binding(0) var src0: array; // M rows, K columns -@group(0) @binding(1) var src1: array; // K rows, N columns (transposed) -@group(0) @binding(2) var dst: array; // M rows, N columns - -@group(0) @binding(3) var params: MulMatParams; - -@compute @workgroup_size(256) -fn main(@builtin(local_invocation_id) local_id: vec3, - @builtin(workgroup_id) wg_id: vec3, - @builtin(num_workgroups) num_wg: vec3) { - let wg_linear = wg_id.y * num_wg.x + wg_id.x; - let global_idx = wg_linear * 256u + local_id.x; - - let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; - if (global_idx >= total) { - return; - } - - let dst2_stride = params.m * params.n; - let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; - - let dst3_idx = global_idx / dst3_stride; - let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension - let src13_idx = dst3_idx; // src1 is not broadcast - let dst3_rem = global_idx % dst3_stride; - - let dst2_idx = dst3_rem / dst2_stride; - let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension - let src12_idx = dst2_idx; // src1 is not broadcast - - let dst2_rem = dst3_rem % dst2_stride; - - let row = dst2_rem / params.m; // output row - let col = dst2_rem % params.m; // output column - - let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01; - let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11; - - var sum = 0.0; - for (var i: u32 = 0u; i < params.k/BLOCK_SIZE; i = i + 1u) { - sum += multiply_add(src0_idx_base, src1_idx_base, i); - } - dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.m + col] = sum; -} diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl index d79d5f3f282..581e922709d 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_gather.wgsl @@ -21,35 +21,32 @@ var count:atomic; @compute @workgroup_size(WG_SIZE) fn main(@builtin(workgroup_id) wg_id: vec3, - @builtin(local_invocation_id) local_id: vec3, - @builtin(num_workgroups) num_wg: vec3) { + @builtin(local_invocation_id) local_id: vec3) { let thread_id = local_id.x; - let own_expert = wg_id.y * num_wg.x + wg_id.x; // the expert assigned to this workgroup + let own_expert = wg_id.x; // the expert assigned to this workgroup - if (own_expert < params.n_expert) { - if (thread_id == 0u) { - atomicStore(&count, 0); - } + if (thread_id == 0u) { + atomicStore(&count, 0); + } - workgroupBarrier(); - - for (var i = thread_id;i < params.n_expert_used * params.n_tokens;i += WG_SIZE) { - let row = i / params.n_expert_used; - let col = i % params.n_expert_used; - let expert = u32(ids[params.offset_ids + row * params.stride_ids_1 + col]); - if (own_expert == expert) { - let pos = atomicAdd(&count, 1u); - let gathered_id = own_expert * params.n_tokens + pos; - global_gathered_expert_used[gathered_id] = col; - global_gathered_tokens[gathered_id] = row; - } + workgroupBarrier(); + + for (var i = thread_id;i < params.n_expert_used * params.n_tokens;i += WG_SIZE) { + let row = i / params.n_expert_used; + let col = i % params.n_expert_used; + let expert = u32(ids[params.offset_ids + row * params.stride_ids_1 + col]); + if (own_expert == expert) { + let pos = atomicAdd(&count, 1u); + let gathered_id = own_expert * params.n_tokens + pos; + global_gathered_expert_used[gathered_id] = col; + global_gathered_tokens[gathered_id] = row; } + } - workgroupBarrier(); + workgroupBarrier(); - if (thread_id == 0u) { - gathered_count_ids[own_expert] = atomicLoad(&count); - } + if (thread_id == 0u) { + gathered_count_ids[own_expert] = atomicLoad(&count); } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl index a194cf40468..f0a7fbd059a 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl @@ -3,10 +3,18 @@ enable subgroups; #endif enable f16; +#ifdef MMVQ +requires packed_4x8_integer_dot_product; +#endif + #define DECLARE_BYTE_LOADERS_SRC0 #include "common_decls.tmpl" +#ifdef MMVQ +#include "mul_mat_vec_q_acc.tmpl" +#else #include "mul_mat_vec_acc.tmpl" +#endif struct MulMatParams { offset_src0: u32, @@ -28,9 +36,14 @@ struct MulMatParams { }; @group(0) @binding(0) var src0: array; + +#ifdef MMVQ +@group(0) @binding(1) var src1q: array; +#else @group(0) @binding(1) var src1: array; -@group(0) @binding(2) var dst: array; +#endif +@group(0) @binding(2) var dst: array; // "mul_mat_vec_acc.tmpl" requires params.k, params.m, params.stride_01 @group(0) @binding(3) var params: MulMatParams; @@ -75,10 +88,15 @@ fn main( let src12_idx = dst2_idx; let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02; - let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; let dst_idx_base = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row_base; +#ifdef MMVQ + let src1q_idx_base = (src13_idx * params.bs02 * params.broadcast2 + src12_idx) * (params.k / 32u); + let acc = accumulate_vec_q_dot(thread_id, row_base, src0_batch_offset, src1q_idx_base); +#else + let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; let acc = accumulate_vec_dot(thread_id, row_base, src0_batch_offset, src1_idx_base); +#endif #ifdef USE_SUBGROUP_REDUCTION for (var row = 0u; row < OUTPUTS_PER_WG; row++) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl index 711c7e829d8..08753b9d643 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl @@ -436,7 +436,6 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src } #endif - #ifdef MUL_ACC_Q3_K #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 110 diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl new file mode 100644 index 00000000000..3ef2f77ebe0 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl @@ -0,0 +1,303 @@ +#ifdef U32_DEQUANT_HELPERS +#define SRC0_TYPE u32 + +fn byte_of(v: u32, b: u32) -> u32 { + return (v >> (b * 8u)) & 0xFFu; +} + +fn sbyte_of(v: u32, b: u32) -> i32 { + let raw = i32((v >> (b * 8u)) & 0xFFu); + return select(raw, raw - 256, raw >= 128); +} +#endif + +#define SRC0_TYPE SRC0_INNER_TYPE +#define SRC1_TYPE SRC1_INNER_TYPE + +#ifdef LEGACY_QUANTS +#define BLOCK_SIZE 32 +#define THREADS_PER_BLOCK 4 +#elif K_QUANTS +#define BLOCK_SIZE 256 +#define THREADS_PER_BLOCK 16 +#endif + +#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) +#define Q8_BLOCK_SIZE 32 + +#ifdef MUL_ACC_Q4_0 +#define BLOCK_SIZE_BYTES 18 +#define B_DS_TYPE vec2 +fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2 { + let qs_packed = load_u32_at_src0(block_byte_base + 2u + 4u * inner_id); + + return vec2( + qs_packed & 0x0F0F0F0Fu, + (qs_packed >> 4u) & 0x0F0F0F0Fu + ); +} +fn repack_b_qs(block:u32, inner_id: u32) -> vec2 { + return vec2( + src1q[block].qs[inner_id], + src1q[block].qs[inner_id + 4u], + ); +} +fn repack_b_dm(block: u32) -> B_DS_TYPE { + return B_DS_TYPE( + f32(src1q[block].d), + f32(src1q[block].s) + ); +} +fn get_dm(block_byte_base: u32) -> f32 { + return f32(load_f16_at_src0(block_byte_base)); +} +fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 { + return f32(row_sum) * (da * b_ds.x) - 8.0 * da * b_ds.y / THREADS_PER_BLOCK; +} +#endif + +#ifdef MUL_ACC_Q4_1 +#define BLOCK_SIZE_BYTES 20 +#define B_DS_TYPE vec2 +fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2 { + let qs_packed = load_u32_at_src0(block_byte_base + 4u + 4u * inner_id); + + return vec2( + qs_packed & 0x0F0F0F0Fu, + (qs_packed >> 4u) & 0x0F0F0F0Fu + ); +} +fn repack_b_qs(block:u32, inner_id: u32) -> vec2 { + return vec2( + src1q[block].qs[inner_id], + src1q[block].qs[inner_id + 4u], + ); +} +fn repack_b_dm(block: u32) -> B_DS_TYPE { + return B_DS_TYPE( + f32(src1q[block].d), + f32(src1q[block].s) + ); +} +fn get_dm(block_byte_base: u32) -> vec2 { + return vec2( + f32(load_f16_at_src0(block_byte_base)), + f32(load_f16_at_src0(block_byte_base + 2u)) + ); +} +fn mul_q8_1(row_sum: i32, dma: vec2, b_ds: B_DS_TYPE) -> f32 { + return f32(row_sum) * (dma.x * b_ds.x) + dma.y * b_ds.y / THREADS_PER_BLOCK; +} +#endif + +#ifdef MUL_ACC_Q8_0 +#define BLOCK_SIZE_BYTES 34 +#define B_DS_TYPE f32 +fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2 { + return vec2( + load_u32_at_src0(block_byte_base + 2u + 4u * (inner_id * 2u)), + load_u32_at_src0(block_byte_base + 2u + 4u * (inner_id * 2u + 1)) + ); +} +fn repack_b_qs(block:u32, inner_id: u32) -> vec2 { + return vec2( + src1q[block].qs[inner_id * 2u], + src1q[block].qs[inner_id * 2u + 1], + ); +} +fn repack_b_dm(block: u32) -> B_DS_TYPE { + return B_DS_TYPE(src1q[block].d); +} +fn get_dm(block_byte_base: u32) -> f32 { + return f32(load_f16_at_src0(block_byte_base)); +} +fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 { + return f32(row_sum) * (da * b_ds); +} +#endif + +#ifdef LEGACY_QUANTS +fn mmvq_dot_product(a_byte_base: u32, b_inner_id: u32, b_repacked: vec2, b_ds: B_DS_TYPE) -> f32 { + var row_sum = 0; + let a_repacked = repack_a(a_byte_base, b_inner_id); + + row_sum += dot4I8Packed(a_repacked[0], b_repacked[0]); + row_sum += dot4I8Packed(a_repacked[1], b_repacked[1]); + + return mul_q8_1(row_sum, get_dm(a_byte_base), b_ds); +} + +fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array { + var acc: array; + + let num_blocks = params.k / BLOCK_SIZE; + + for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { + let b_inner_id = thread_id % THREADS_PER_BLOCK; + let b_block_idx = src1q_idx_base + block; + + let b_repacked = repack_b_qs(b_block_idx, b_inner_id); + let b_ds = repack_b_dm(b_block_idx); + + for (var row = 0u; row < OUTPUTS_PER_WG; row++) { + let output_row = row_base + row; + if (output_row < params.m) { + let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; + acc[row] += mmvq_dot_product(block_byte_base, b_inner_id, b_repacked, b_ds); + } + } + } + + return acc; +} +#endif + +#ifdef MUL_ACC_Q2_K +#define BLOCK_SIZE_BYTES 84 +#define B_DS_TYPE f32 +fn repack_a(block_byte_base: u32, tid: u32) -> vec4 { + let ih2 = tid / 8u; + let phase = tid % 2u; + let iq4_idx = 2u * ih2 + phase; + let qs_byte_base = block_byte_base + 16u + 16u * iq4_idx; + let qs_shift = tid & 6u; + return vec4( + (load_u32_at_src0_aligned(qs_byte_base) >> qs_shift) & 0x03030303u, + (load_u32_at_src0_aligned(qs_byte_base + 4u) >> qs_shift) & 0x03030303u, + (load_u32_at_src0_aligned(qs_byte_base + 8u) >> qs_shift) & 0x03030303u, + (load_u32_at_src0_aligned(qs_byte_base + 12u) >> qs_shift) & 0x03030303u, + ); +} +fn repack_b_qs(q8_block_idx: u32, tid: u32) -> vec4 { + let phase = tid % 2u; + return vec4( + src1q[q8_block_idx].qs[4u * phase], + src1q[q8_block_idx].qs[4u * phase + 1u], + src1q[q8_block_idx].qs[4u * phase + 2u], + src1q[q8_block_idx].qs[4u * phase + 3u], + ); +} +fn repack_b_dm(q8_block_idx: u32) -> B_DS_TYPE { + return B_DS_TYPE(src1q[q8_block_idx].d); +} +fn get_dm(block_byte_base: u32) -> vec2 { + return vec2( + f32(load_f16_at_src0(block_byte_base + 80u)), + f32(load_f16_at_src0(block_byte_base + 82u)), + ); +} +fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2 { + let scale_byte = block_byte_base + tid; + let scale = byte_of(load_u32_at_src0_aligned(scale_byte), scale_byte & 3u); + return vec2(f32(scale & 0xFu), f32(scale >> 4u)); +} +fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4, b_ds: B_DS_TYPE) -> f32 { + let a_repacked = repack_a(a_byte_base, tid); + let dm = get_dm(a_byte_base); + let scale_min = get_scale_min(a_byte_base, tid); + + let scale_q = i32(scale_min.x); + let scale_m_i8x4 = u32(scale_min.y) * 0x01010101u; + + let row_sum_d = (dot4I8Packed(b_repacked[0], a_repacked[0]) + dot4I8Packed(b_repacked[1], a_repacked[1]) + + dot4I8Packed(b_repacked[2], a_repacked[2]) + dot4I8Packed(b_repacked[3], a_repacked[3])) * scale_q; + let row_sum_m = dot4I8Packed(b_repacked[0], scale_m_i8x4) + dot4I8Packed(b_repacked[1], scale_m_i8x4) + + dot4I8Packed(b_repacked[2], scale_m_i8x4) + dot4I8Packed(b_repacked[3], scale_m_i8x4); + + return b_ds * (dm.x * f32(row_sum_d) - dm.y * f32(row_sum_m)); +} +#endif + +#ifdef MUL_ACC_Q4_K +#define BLOCK_SIZE_BYTES 144 +#define B_DS_TYPE vec2 +fn repack_a(block_byte_base: u32, tid: u32) -> vec4 { + let iq4 = tid / 4u; + let phase = tid % 2u; + let nibble = (tid >> 1u) % 2u; + let q_qs_byte_base = block_byte_base + 16u + 32u * iq4 + 16u * phase; + let qs_shift = 4u * nibble; + return vec4( + (load_u32_at_src0_aligned(q_qs_byte_base) >> qs_shift) & 0x0F0F0F0Fu, + (load_u32_at_src0_aligned(q_qs_byte_base + 4u) >> qs_shift) & 0x0F0F0F0Fu, + (load_u32_at_src0_aligned(q_qs_byte_base + 8u) >> qs_shift) & 0x0F0F0F0Fu, + (load_u32_at_src0_aligned(q_qs_byte_base + 12u) >> qs_shift) & 0x0F0F0F0Fu, + ); +} +fn repack_b_qs(q8_block_idx: u32, tid: u32) -> vec4 { + let phase = tid % 2u; + return vec4( + src1q[q8_block_idx].qs[4u * phase], + src1q[q8_block_idx].qs[4u * phase + 1u], + src1q[q8_block_idx].qs[4u * phase + 2u], + src1q[q8_block_idx].qs[4u * phase + 3u], + ); +} +fn repack_b_dm(q8_block_idx: u32) -> B_DS_TYPE { + return B_DS_TYPE( + f32(src1q[q8_block_idx].d), + f32(src1q[q8_block_idx].s), + ); +} +fn get_dm(block_byte_base: u32) -> vec2 { + return vec2( + f32(load_f16_at_src0(block_byte_base + 0u)), + f32(load_f16_at_src0(block_byte_base + 2u)), + ); +} +fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2 { + let sc_m_idx = tid / 2u; + let scales_byte_base = block_byte_base + 4u; + let scales0_3 = load_u32_at_src0_aligned(scales_byte_base); + let scales4_7 = load_u32_at_src0_aligned(scales_byte_base + 4u); + let scales8_11 = load_u32_at_src0_aligned(scales_byte_base + 8u); + + let byte_idx = sc_m_idx & 3u; + let is_high = sc_m_idx >= 4u; + + let sc_low = byte_of(scales0_3, byte_idx) & 0x3Fu; + let sc_high = (byte_of(scales8_11, byte_idx) & 0x0Fu) | ((byte_of(scales0_3, byte_idx) & 0xC0u) >> 2u); + let scale = f32(select(sc_low, sc_high, is_high)); + + let mn_low = byte_of(scales4_7, byte_idx) & 0x3Fu; + let mn_high = (byte_of(scales8_11, byte_idx) >> 4u) | ((byte_of(scales4_7, byte_idx) & 0xC0u) >> 2u); + let min_val = f32(select(mn_low, mn_high, is_high)); + + return vec2(scale, min_val); +} +fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4, b_ds: B_DS_TYPE) -> f32 { + let a_repacked = repack_a(a_byte_base, tid); + let dm = get_dm(a_byte_base); + let scale_min = get_scale_min(a_byte_base, tid); + + let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1]) + + dot4I8Packed(a_repacked[2], b_repacked[2]) + dot4I8Packed(a_repacked[3], b_repacked[3]); + + // Each thread covers half of the Q8_1 block, so add only b_ds.y/2. + return b_ds.x * dm.x * scale_min.x * f32(row_sum) - dm.y * scale_min.y * (b_ds.y / (Q8_BLOCK_SIZE / ELEMS_PER_THREAD)); +} +#endif + +#ifdef K_QUANTS +fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array { + var acc: array; + + let tid = thread_id % THREADS_PER_BLOCK; + + for (var block = thread_id / THREADS_PER_BLOCK; block < params.k / BLOCK_SIZE; block += WG_SIZE / THREADS_PER_BLOCK) { + let src1q_idx = src1q_idx_base + (block * BLOCK_SIZE + ELEMS_PER_THREAD * tid) / Q8_BLOCK_SIZE; + let b_repacked = repack_b_qs(src1q_idx, tid); + let b_ds = repack_b_dm(src1q_idx); + + for (var row = 0u; row < OUTPUTS_PER_WG; row++) { + let output_row = row_base + row; + if (output_row < params.m) { + let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; + acc[row] += mmvq_dot_product(block_byte_base, tid, b_repacked, b_ds); + } + } + } + + return acc; +} +#endif diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl new file mode 100644 index 00000000000..b3f1fa04b80 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl @@ -0,0 +1,173 @@ +#ifdef USE_SUBGROUP_REDUCTION +enable subgroups; +#endif +enable f16; + +requires packed_4x8_integer_dot_product; + +#include "common_decls.tmpl" + +struct Params { + offset_src1: u32, + stride_12: u32, + stride_13: u32, + ne0: u32, + ne2: u32, + ne3: u32, +}; + +#define SRC1_TYPE vec4 + +@group(0) @binding(0) var src1: array; +@group(0) @binding(1) var src1q: array; + +@group(0) @binding(2) var params: Params; + +#ifdef USE_SUBGROUP_REDUCTION +fn cluster_max_8(v: f32) -> f32 { + var r = v; + r = max(r, subgroupShuffleXor(r, 1u)); + r = max(r, subgroupShuffleXor(r, 2u)); + r = max(r, subgroupShuffleXor(r, 4u)); + return r; +} + +#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K) +fn cluster_add_i4x8(v: i32) -> i32 { + var r= v; + r += subgroupShuffleXor(r, 1u); + r += subgroupShuffleXor(r, 2u); + r += subgroupShuffleXor(r, 4u); + return r; +} +#endif +#endif + +#ifdef USE_WORKGROUP_REDUCTION +#define CLUSTER_SIZE 8 + +var partial_amaxs: array, WG_SIZE / CLUSTER_SIZE>; +var partial_sums: array, WG_SIZE / CLUSTER_SIZE>; +#endif + +@compute @workgroup_size(WG_SIZE) +fn main( + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) wg_id: vec3, + @builtin(num_workgroups) num_wg: vec3 +) { + let thread_id = local_id.x; + let num_vec4 = params.ne0 / 4u; + + let wg_per_vec = (num_vec4 + (WG_SIZE - 1u)) / WG_SIZE; + let total_batches = wg_per_vec * params.ne2 * params.ne3; + + let wg_linear = wg_id.y * num_wg.x + wg_id.x; + if (wg_linear >= total_batches) { + return; + } + + let src13_idx = wg_linear / (params.ne2 * wg_per_vec); + let src12_idx = (wg_linear - src13_idx * (params.ne2 * wg_per_vec)) / wg_per_vec; + let src11_wg_idx = wg_linear % wg_per_vec; + let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + let src1_idx_vec4_base = src1_idx_base / 4u; + + let blocks_per_row = params.ne0 / 32u; + let blocks_per_wg = (WG_SIZE * 4u) / 32u; + let src1q_idx_base = (src13_idx * params.ne2 + src12_idx) * blocks_per_row; + let src1q_idx = src1q_idx_base + src11_wg_idx * blocks_per_wg + thread_id / 8u; + let qs_idx = thread_id % 8u; + + // reduction + var q4 = vec4(0.0); + var q4_quants = 0u; + var thread_amax = 0.0; + + let src11_vec4_idx = src11_wg_idx * WG_SIZE + thread_id; + let is_valid = src11_vec4_idx < num_vec4; + +#ifdef USE_SUBGROUP_REDUCTION + + var d = 0.0; + + if (is_valid) { + q4 = src1[src1_idx_vec4_base + src11_vec4_idx]; + let abs_q4 = abs(q4); + thread_amax = max(max(abs_q4[0u], abs_q4[1u]), max(abs_q4[2], abs_q4[3])); + } + + d = cluster_max_8(thread_amax) / 127.0; + + if (is_valid) { + let id = select(0.0, 1.0 / d, d > 0.0); + q4_quants = pack4xI8(vec4(round(q4 * id))); + if (qs_idx == 0u) { + src1q[src1q_idx].d = f16(d); + } + src1q[src1q_idx].qs[qs_idx] = q4_quants; + } + +#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K) + let q4_quants_sum = dot4I8Packed(q4_quants, 0x01010101u); + let s = f16(d * f32(cluster_add_i4x8(q4_quants_sum))); + + if (is_valid) { + if (qs_idx == 0u) { + src1q[src1q_idx].s = s; + } + } +#endif +#endif + +#ifdef USE_WORKGROUP_REDUCTION + + var d = 0.0; + let cluster_id = thread_id / 8u; + + if (is_valid) { + q4 = src1[src1_idx_vec4_base + src11_vec4_idx]; + let abs_q4 = abs(q4); + thread_amax = max(max(abs_q4[0], abs_q4[1]), max(abs_q4[2], abs_q4[3])); + partial_amaxs[cluster_id][qs_idx] = thread_amax; + } + + workgroupBarrier(); + + if (is_valid) { + let amax = max( + max( + max(partial_amaxs[cluster_id][0], partial_amaxs[cluster_id][1]), max(partial_amaxs[cluster_id][2], partial_amaxs[cluster_id][3])), + max( + max(partial_amaxs[cluster_id][4], partial_amaxs[cluster_id][5]), max(partial_amaxs[cluster_id][6], partial_amaxs[cluster_id][7])) + ); + + d = amax / 127.0; + let id = select(0.0f, 1.0f / d, d > 0.0f); + + q4_quants = pack4xI8(vec4(round(q4 * id))); + src1q[src1q_idx].qs[qs_idx] = q4_quants; + + if (qs_idx == 0u) { + src1q[src1q_idx].d = f16(d); + } + } + +#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K) + + partial_sums[cluster_id][qs_idx] = dot4I8Packed(q4_quants, 0x01010101u); + + workgroupBarrier(); + + if (is_valid) { + if (qs_idx == 0u) { + let s = d * f32(partial_sums[cluster_id][0] + partial_sums[cluster_id][1] + partial_sums[cluster_id][2] + partial_sums[cluster_id][3] + + partial_sums[cluster_id][4] + partial_sums[cluster_id][5] + partial_sums[cluster_id][6] + partial_sums[cluster_id][7]); + src1q[src1q_idx].s = f16(s); + } + } + +#endif +#endif + +} diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl index 99e9192c71a..09f2f0eddb3 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl @@ -71,7 +71,6 @@ fn main(@builtin(global_invocation_id) gid: vec3) { return; } - // getting the row from gid let elems_per_row = params.ne0 / VEC_SIZE; var i = gid.x / elems_per_row; @@ -104,6 +103,6 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let i_dst_row = params.offset_dst + idx_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; - let col_idx = (gid.x % elems_per_row); - dst[i_dst_row/VEC_SIZE + col_idx] = DST_TYPE(src[i_src_row/VEC_SIZE + col_idx]); + let col_idx = gid.x % elems_per_row; + dst[i_dst_row / VEC_SIZE + col_idx] = DST_TYPE(src[i_src_row / VEC_SIZE + col_idx]); } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl new file mode 100644 index 00000000000..876e65b6ae1 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows_quant.wgsl @@ -0,0 +1,224 @@ +#ifdef DST_Q8_0 +#define BLOCK_SIZE 32u +#define BLOCK_BYTES 34u +#define QS_WORDS 8u +#elif defined(DST_Q4_0) +#define BLOCK_SIZE 32u +#define BLOCK_BYTES 18u +#define QS_WORDS 4u +#endif + +@group(0) @binding(0) +var src: array; + +@group(0) @binding(1) +var idx: array; + +@group(0) @binding(2) +#ifdef PAIR_BLOCKS +var dst: array; +#else +var dst: array>; +#endif + +#ifdef I64_IDX +@group(0) @binding(3) +var error: atomic; +#define PARAMS_BINDING 4 +#else +#define PARAMS_BINDING 3 +#endif + +struct Params { + offset_src: u32, // in elements + offset_idx: u32, // in elements + offset_dst: u32, // in blocks + + // Strides (in elements / blocks) + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_idx0: u32, + stride_idx1: u32, + stride_idx2: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Shape of src + ne0: u32, + n_rows: u32, + ne2: u32, + ne3: u32, + + // Shape of idx + idx1: u32, + idx2: u32, +}; + +@group(0) @binding(PARAMS_BINDING) +var params: Params; + +// if the quantization type is unaligned and there are an odd number of blocks per row, we need to store atomically +#ifndef PAIR_BLOCKS +fn merge_store_dst_word(word_idx: u32, mask: u32, bits: u32) { + loop { + let old = atomicLoad(&dst[word_idx]); + let merged = (old & ~mask) | (bits & mask); + let result = atomicCompareExchangeWeak(&dst[word_idx], old, merged); + if (result.exchanged) { + return; + } + } +} +#else +fn merge_store_dst_word(word_idx: u32, mask: u32, bits: u32) { + let old = dst[word_idx]; + dst[word_idx] = (old & ~mask) | (bits & mask); +} +#endif + +fn store_u16(dst_word_idx: u32, block_byte_offset: u32, byte_offset: u32, value: u32) { + let total_byte_offset = block_byte_offset + byte_offset; + let word_idx = dst_word_idx + total_byte_offset / 4u; + let shift = (total_byte_offset & 2u) * 8u; + let mask = 0xFFFFu << shift; + merge_store_dst_word(word_idx, mask, (value & 0xFFFFu) << shift); +} + +fn store_u32(dst_word_idx: u32, block_byte_offset: u32, byte_offset: u32, value: u32) { + let total_byte_offset = block_byte_offset + byte_offset; + let word_idx = dst_word_idx + total_byte_offset / 4u; + let shift = (total_byte_offset & 3u) * 8u; + + if (shift == 0u) { +#ifdef PAIR_BLOCKS + dst[word_idx] = value; +#else + atomicStore(&dst[word_idx], value); +#endif + return; + } + + let lo_mask = 0xFFFFFFFFu << shift; + let hi_mask = (1u << shift) - 1u; + merge_store_dst_word(word_idx, lo_mask, value << shift); + merge_store_dst_word(word_idx + 1u, hi_mask, value >> (32u - shift)); +} + +fn quantize_block_params(src_block: u32) -> vec2 { +#ifdef DST_Q8_0 + var amax = 0.0; + for (var j: u32 = 0u; j < BLOCK_SIZE; j++) { + amax = max(amax, abs(src[src_block + j])); + } + + let d = amax / 127.0; + let id = select(0.0, 1.0 / d, d > 0.0); + return vec2(d, id); +#elif defined(DST_Q4_0) + var amax = 0.0; + var max_val = 0.0; + for (var j: u32 = 0u; j < BLOCK_SIZE; j++) { + let v = src[src_block + j]; + let av = abs(v); + if (amax < av) { + amax = av; + max_val = v; + } + } + + let d = max_val / -8.0; + let id = select(0.0, 1.0 / d, d != 0.0); + return vec2(d, id); +#endif +} + +fn quantize_block_word(src_block: u32, j: u32, id: f32) -> u32 { +#ifdef DST_Q8_0 + let base = src_block + j * 4u; + return (u32(i32(round(src[base + 0u] * id)) & 0xFF) << 0u) | + (u32(i32(round(src[base + 1u] * id)) & 0xFF) << 8u) | + (u32(i32(round(src[base + 2u] * id)) & 0xFF) << 16u) | + (u32(i32(round(src[base + 3u] * id)) & 0xFF) << 24u); +#elif defined(DST_Q4_0) + var packed_q = 0u; + for (var k: u32 = 0u; k < 4u; k++) { + let x0 = src[src_block + j * 4u + k] * id; + let x1 = src[src_block + 16u + j * 4u + k] * id; + let q0 = u32(clamp(i32(x0 + 8.5), 0, 15)); + let q1 = u32(clamp(i32(x1 + 8.5), 0, 15)); + packed_q |= (q0 & 0xFu) << (8u * k); + packed_q |= (q1 & 0xFu) << (8u * k + 4u); + } + return packed_q; +#endif +} + +fn quantize_block(src_block: u32, dst_word_idx: u32, block_byte_offset: u32) { + let params = quantize_block_params(src_block); + let d = params.x; + let id = params.y; + let packed_d = pack2x16float(vec2(d, 0.0)) & 0xFFFFu; + store_u16(dst_word_idx, block_byte_offset, 0u, packed_d); + + for (var j: u32 = 0u; j < QS_WORDS; j++) { + store_u32(dst_word_idx, block_byte_offset, 2u + j * 4u, quantize_block_word(src_block, j, id)); + } +} + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(global_invocation_id) gid: vec3) { + let blocks_per_row = params.ne0 / BLOCK_SIZE; +#ifdef PAIR_BLOCKS + let blocks_per_invocation = 2u; +#else + let blocks_per_invocation = 1u; +#endif + let invocations_per_row = blocks_per_row / blocks_per_invocation; + let total_invocations = params.ne3 * params.ne2 * params.n_rows * invocations_per_row; + if (gid.x >= total_invocations) { + return; + } + + var i = gid.x / invocations_per_row; + let block_in_row = (gid.x % invocations_per_row) * blocks_per_invocation; + + let i_src3 = i / (params.ne2 * params.n_rows); + i = i % (params.ne2 * params.n_rows); + let i_src2 = i / params.n_rows; + let i_src1 = i % params.n_rows; + + let i_idx2 = i_src3 % params.idx2; + let i_idx1 = i_src2 % params.idx1; + let i_idx0 = i_src1; + +#ifdef I64_IDX + let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2u; + let idx_val = idx[idx_high]; + let idx_low_val = idx[idx_high + 1u]; + + if (idx_low_val != 0u) { + atomicStore(&error, 1u); + return; + } +#else + let idx_i = params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2; + let idx_val = idx[idx_i]; +#endif + + let dst_row_blocks = params.offset_dst + idx_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; + let src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; + let src_block = src_row + block_in_row * BLOCK_SIZE; + let dst_block_byte = (dst_row_blocks + block_in_row) * BLOCK_BYTES; + + let dst_word_idx = dst_block_byte / 4u; +#ifdef PAIR_BLOCKS + quantize_block(src_block, dst_word_idx, 0u); + quantize_block(src_block + BLOCK_SIZE, dst_word_idx, BLOCK_BYTES); +#else + quantize_block(src_block, dst_word_idx, dst_block_byte & 3u); +#endif +} diff --git a/ggml/src/ggml-zendnn/CMakeLists.txt b/ggml/src/ggml-zendnn/CMakeLists.txt index f1e4f991fae..e4ba9cfbd0f 100644 --- a/ggml/src/ggml-zendnn/CMakeLists.txt +++ b/ggml/src/ggml-zendnn/CMakeLists.txt @@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") ExternalProject_Add( zendnn GIT_REPOSITORY https://github.com/amd/ZenDNN.git - GIT_TAG ac9e580d9434b7b98985f2627a7ebfb5eba4bb0d # ZenDNN-2026-WW17 + GIT_TAG 253b94ce0d7e9284c265fefb485714944caff9d3 # ZenDNN-2026-WW19 PREFIX ${ZENDNN_PREFIX} SOURCE_DIR ${ZENDNN_SOURCE_DIR} BINARY_DIR ${ZENDNN_BUILD_DIR} diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp index 6a83bb6b1ec..3c33dcb11a0 100644 --- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp +++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp @@ -2,6 +2,10 @@ #include "ggml-backend-impl.h" #include "ggml-impl.h" + +#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" + #include "zendnnl.hpp" #include @@ -19,6 +23,8 @@ zendnnl::common::data_type_t ggml_to_zendnn_type() { return zendnnl::common::data_type_t::f32; } else if constexpr (std::is_same_v) { return zendnnl::common::data_type_t::bf16; + } else if constexpr (std::is_same_v) { + return zendnnl::common::data_type_t::s8; } else { return zendnnl::common::data_type_t::none; } @@ -48,6 +54,17 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int params.num_threads = ctx->n_threads; zendnnl::lowoha::matmul::matmul_batch_params_t batch_params; + + if constexpr (std::is_same_v) { + params.dtypes.compute = zendnnl::common::data_type_t::s8; + const int64_t num_groups = k / QK8_0; + params.dynamic_quant = true; + params.quant_params.src_scale.buff = nullptr; + params.quant_params.src_scale.dt = zendnnl::common::data_type_t::bf16; + params.quant_params.src_scale.dims = {n, num_groups}; + params.packing.pack_format_b = 1; + } + zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct( 'r', false, true, // row-major, don't transpose B, transpose A (because it's column-major) n, // M: rows of B and C @@ -71,7 +88,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int return true; } -static bool ggml_zendnn_sgemm(ggml_backend_zendnn_context * ctx, int64_t m, int64_t n, int64_t k, +static bool ggml_zendnn_gemm(ggml_backend_zendnn_context * ctx, int64_t m, int64_t n, int64_t k, const void * A, int64_t lda, const void * B, int64_t ldb, void * C, int64_t ldc, int Atype, int Btype, int Ctype) { @@ -108,6 +125,14 @@ static bool ggml_zendnn_sgemm(ggml_backend_zendnn_context * ctx, int64_t m, int6 (const ggml_bf16_t *)B, ldb, (float *)C, ldc); return false; + case GGML_TYPE_Q8_0: + if (Btype != GGML_TYPE_F32 || Ctype != GGML_TYPE_F32) + return false; + return ggml_zendnn_matmul( + ctx, m, n, k, + (const block_q8_0 *)A, lda, + (const float *)B, ldb, + (float *)C, ldc); default: return false; // unsupported type } @@ -145,7 +170,9 @@ static void ggml_zendnn_compute_forward_mul_mat( const int64_t r3 = ne13/ne03; void * work_data = ctx->work_data.get(); - if (src1->type != vec_dot_type) { + + // ZenDNN requires FP32 for dynamic quantization, so conversion is skipped + if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) { const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1 * ne11; const size_t nbw3 = nbw2 * ne12; @@ -171,9 +198,9 @@ static void ggml_zendnn_compute_forward_mul_mat( for (int64_t i13 = 0; i13 < ne13; i13++) { for (int64_t i12 = 0; i12 < ne12; i12++) { - const void* wdata = src1->type == vec_dot_type ? src1->data : work_data; + const void* wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data; const size_t row_size = ggml_row_size(vec_dot_type, ne10); - if (!ggml_zendnn_sgemm(ctx, + if (!ggml_zendnn_gemm(ctx, ne01, // m ne11, // n ne10, // k @@ -184,9 +211,9 @@ static void ggml_zendnn_compute_forward_mul_mat( static_cast(dst->data) + i12*nb2 + i13*nb3, ne01, // ldc src0->type, - vec_dot_type, + src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type, dst->type)) - GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__); + GGML_ABORT("%s: ZenDNN gemm failed\n", __func__); } } } @@ -261,10 +288,15 @@ static void ggml_zendnn_compute_forward_mul_mat_id( const size_t nbw1 = row_size; const size_t nbw2 = nbw1 * ne11; const size_t nbw3 = nbw2 * ne12; - const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0; + const size_t src1_conv_size = (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) ? ne13 * nbw3 : 0; + + // For Q8_0, src1 is always F32; the gather buffer must hold F32 rows (ne10*4 bytes), + // not Q8_0-encoded rows (row_size โ‰ˆ ne10/32*34 bytes) โ€” they differ by ~4x. + const size_t f32_row_size = (size_t)ne10 * sizeof(float); + const size_t gather_row_size = (src0->type == GGML_TYPE_Q8_0) ? f32_row_size : row_size; // size for MoE gather/scatter buffers - const size_t wdata_cur_size = max_rows * row_size; + const size_t wdata_cur_size = max_rows * gather_row_size; const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01); // allocate single buffer for all needs @@ -279,7 +311,8 @@ static void ggml_zendnn_compute_forward_mul_mat_id( char * wdata_cur = work_data + src1_conv_size; char * dst_cur = wdata_cur + wdata_cur_size; - if (src1->type != vec_dot_type) { + // ZenDNN requires FP32 for dynamic quantization, so conversion is skipped + if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) { GGML_ASSERT(src1->type == GGML_TYPE_F32); #pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static) @@ -294,7 +327,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id( } } - const void * wdata = src1->type == vec_dot_type ? src1->data : work_data; + const void * wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data; // process each expert with gather -> gemm -> scatter pattern for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) { @@ -315,14 +348,14 @@ static void ggml_zendnn_compute_forward_mul_mat_id( const int64_t i12 = row_mapping.i2; std::memcpy( - wdata_cur + ir1 * row_size, - (const char *) wdata + (i11 + i12*ne11) * row_size, - row_size + wdata_cur + ir1 * gather_row_size, + (const char *) wdata + (i11 + i12*ne11) * gather_row_size, + gather_row_size ); } // batched gemm for all tokens in this expert - if (!ggml_zendnn_sgemm(ctx, + if (!ggml_zendnn_gemm(ctx, ne01, // m cne1, // n ne10, // k @@ -333,9 +366,9 @@ static void ggml_zendnn_compute_forward_mul_mat_id( dst_cur, ne01, // ldc src0->type, - vec_dot_type, + src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type, dst->type)) { - GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__); + GGML_ABORT("%s: ZenDNN gemm failed\n", __func__); } // scatter output rows to destination @@ -577,6 +610,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const switch (weights->type) { case GGML_TYPE_F32: case GGML_TYPE_BF16: + case GGML_TYPE_Q8_0: return true; default: return false; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 191cf2fa106..8815c67d8bc 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5223,7 +5223,7 @@ static struct ggml_tensor * ggml_fill_impl( struct ggml_tensor * a, float c, bool inplace) { - GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16); GGML_ASSERT(ggml_is_contiguous(a)); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -6210,11 +6210,13 @@ struct ggml_tensor * ggml_gated_delta_net( GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v); GGML_ASSERT(beta->ne[0] == 1); - GGML_ASSERT(ggml_nelements(state) == S_v * S_v * H * n_seqs); - - // concat output and new_state into a single tensor - // output: S_v * H * n_tokens * n_seqs, state: S_v * S_v * H * n_seqs - const int64_t ne[4] = { S_v * H, n_tokens * n_seqs + S_v * n_seqs, 1, 1 }; + // state is a 3D tensor (S_v*S_v*H, K, n_seqs). K is the snapshot slot count. + GGML_ASSERT(state->ne[0] == S_v * S_v * H); + GGML_ASSERT(state->ne[2] == n_seqs); + GGML_ASSERT(state->ne[3] == 1); + const int64_t K = state->ne[1]; + const int64_t state_rows = K * S_v * n_seqs; + const int64_t ne[4] = { S_v * H, n_tokens * n_seqs + state_rows, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_GATED_DELTA_NET; diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index ab3cc974867..5e198618251 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -228,9 +228,18 @@ struct gguf_context { }; struct gguf_reader { - gguf_reader(FILE * file) : file(file) { - // read the remaining bytes once and update on each read - nbytes_remain = file_remain(file); + gguf_reader( + gguf_reader_callback_t callback, + void * userdata, + size_t max_chunk_read, + uint64_t data_offset = 0, + uint64_t nbytes_remain = 0) + : callback(callback), + userdata(userdata), + max_chunk_read(max_chunk_read), + data_offset(data_offset), + nbytes_remain(nbytes_remain) { + GGML_ASSERT(max_chunk_read > 0); } // helper for remaining bytes in a file @@ -257,12 +266,10 @@ struct gguf_reader { template bool read(T & dst) const { const size_t size = sizeof(dst); - if (nbytes_remain < size) { + if (size > nbytes_remain) { return false; } - const size_t nread = fread(&dst, 1, size, file); - nbytes_remain -= nread; - return nread == size; + return read_raw(&dst, size) == size; } template @@ -344,24 +351,71 @@ struct gguf_reader { return false; } dst.resize(static_cast(size)); - const size_t nread = fread(dst.data(), 1, size, file); - nbytes_remain -= nread; - return nread == size; + return read_raw(dst.data(), static_cast(size)) == size; } bool read(void * dst, const size_t size) const { if (size > nbytes_remain) { return false; } - const size_t nread = fread(dst, 1, size, file); - nbytes_remain -= nread; - return nread == size; + return read_raw(dst, size) == size; + } + + uint64_t tell() const { + return data_offset; + } + + bool seek(uint64_t absolute_offset) const { + const uint64_t end_offset = uint64_t(data_offset) + nbytes_remain; + if (absolute_offset > end_offset) { + return false; + } + + data_offset = absolute_offset; + nbytes_remain = end_offset - absolute_offset; + + return true; } private: - FILE * file; + size_t read_raw(void * dst, size_t size) const { + if (callback == nullptr || size == 0) { + return 0; + } + + uint8_t * data = static_cast(dst); + size_t total_nread = 0; + bool reached_eof = false; - mutable uint64_t nbytes_remain; + while (total_nread < size) { + const size_t chunk_size = std::min(max_chunk_read, size - total_nread); + if (data_offset + total_nread < data_offset) { + break; + } + const size_t nread = callback(userdata, static_cast(data + total_nread), data_offset + total_nread, chunk_size); + total_nread += nread; + if (nread != chunk_size) { + reached_eof = true; + break; + } + } + + data_offset += total_nread; + GGML_ASSERT(total_nread <= nbytes_remain); + nbytes_remain -= total_nread; + + if (reached_eof) { + nbytes_remain = 0; + } + + return total_nread; + } + + gguf_reader_callback_t callback = nullptr; + void * userdata = nullptr; + size_t max_chunk_read = 0; + mutable uint64_t data_offset = 0; + mutable uint64_t nbytes_remain = 0; }; struct gguf_context * gguf_init_empty(void) { @@ -394,12 +448,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vectorinfo.size()) == n_tensors); // we require the data section to be aligned, so take into account any padding - if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) { + if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) { GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); gguf_free(ctx); return nullptr; } // store the current file offset - this is where the data section starts - ctx->offset = gguf_ftell(file); + ctx->offset = gr.tell(); // compute the total size of the data section, taking into account the alignment { @@ -844,6 +893,89 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para return ctx; } +struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params) { + if (callback == nullptr) { + return nullptr; + } + + const struct gguf_reader gr(callback, userdata, max_chunk_read == 0 ? SIZE_MAX : max_chunk_read, 0, max_expected_size); + return gguf_init_from_reader(gr, params); +} + +struct gguf_file_reader { + FILE * file; + uint64_t offset; +}; + +static size_t gguf_file_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) { + GGML_ASSERT(len > 0); + + gguf_file_reader & reader = *static_cast(userdata); + + if (reader.offset != offset) { + if (offset > INT64_MAX || gguf_fseek(reader.file, static_cast(offset), SEEK_SET) != 0) { + return 0; + } + + reader.offset = offset; + } + + const size_t nread = fread(static_cast(output), 1, len, reader.file); + reader.offset += nread; + return nread; +} + +struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) { + if (!file) { + return nullptr; + } + + const int64_t cur = gguf_ftell(file); + if (cur < 0) { + return nullptr; + } + + gguf_file_reader reader = { + /*.file = */ file, + /*.offset = */ static_cast(cur), + }; + const struct gguf_reader gr(gguf_file_reader_callback, &reader, SIZE_MAX, reader.offset, gguf_reader::file_remain(file)); + return gguf_init_from_reader(gr, params); +} + +struct gguf_buffer_reader { + const uint8_t * data; + size_t size; +}; + +static size_t gguf_buffer_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) { + GGML_ASSERT(len > 0); + + const gguf_buffer_reader & reader = *static_cast(userdata); + + if (offset > reader.size || len > reader.size - offset) { + return 0; + } + + const size_t data_offset = static_cast(offset); + const size_t nread = std::min(len, reader.size - data_offset); + memcpy(static_cast(output), reader.data + data_offset, nread); + return nread; +} + +struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) { + if (data == nullptr || size == 0) { + return nullptr; + } + + gguf_buffer_reader reader = { + /*.data = */ static_cast(data), + /*.size = */ size, + }; + const struct gguf_reader gr(gguf_buffer_reader_callback, &reader, SIZE_MAX, 0, size); + return gguf_init_from_reader(gr, params); +} + struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { FILE * file = ggml_fopen(fname, "rb"); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4055ec2873a..8aed0d76671 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -268,6 +268,8 @@ class Tokenizer: CHAT_TEMPLATE = "tokenizer.chat_template" CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" + # Normalizer constants + NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase" # FIM/Infill special tokens constants FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" @@ -451,6 +453,7 @@ class MODEL_ARCH(IntEnum): DEEPSEEK = auto() DEEPSEEK2 = auto() DEEPSEEK2OCR = auto() + DEEPSEEK32 = auto() CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() @@ -505,6 +508,7 @@ class MODEL_ARCH(IntEnum): LLAMA_EMBED = auto() MAINCODER = auto() KIMI_LINEAR = auto() + TALKIE = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -747,7 +751,7 @@ class MODEL_TENSOR(IntEnum): V_LAYER_OUT_SCALE = auto() V_PRE_NORM = auto() V_POST_NORM = auto() - V_MM_PRE_NORM = auto() # hunyuanocr + V_MM_PRE_NORM = auto() # hunyuanvl V_MM_POST_NORM = auto() V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 @@ -791,8 +795,8 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm - V_TOK_IMG_BEGIN = auto() # hunyuanocr - V_TOK_IMG_END = auto() # hunyuanocr + V_TOK_IMG_BEGIN = auto() # hunyuanvl + V_TOK_IMG_END = auto() # hunyuanvl V_STD_BIAS = auto() # gemma4 V_STD_SCALE = auto() # gemma4 V_SAM_POS_EMBD = auto() # Deepseek-OCR @@ -810,6 +814,8 @@ class MODEL_TENSOR(IntEnum): V_SAM_NET_3 = auto() # Deepseek-OCR V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR V_ENC_EMBD_VSEP = auto() # Deepseek-OCR + V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2 + V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2 # audio (mtmd) A_ENC_EMBD_POS = auto() @@ -966,6 +972,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr", + MODEL_ARCH.DEEPSEEK32: "deepseek32", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", @@ -1021,6 +1028,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.LLAMA_EMBED: "llama-embed", MODEL_ARCH.MAINCODER: "maincoder", MODEL_ARCH.KIMI_LINEAR: "kimi-linear", + MODEL_ARCH.TALKIE: "talkie", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1325,6 +1333,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR + MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2 + MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2 # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", @@ -1503,6 +1513,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_SAM_NECK, MODEL_TENSOR.V_SAM_NET_2, MODEL_TENSOR.V_SAM_NET_3, + MODEL_TENSOR.V_RESMPL_QUERY_768, + MODEL_TENSOR.V_RESMPL_QUERY_1024, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, @@ -2114,7 +2126,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_NORM, MODEL_TENSOR.SSM_BETA, MODEL_TENSOR.SSM_ALPHA, - MODEL_TENSOR.SSM_OUT + MODEL_TENSOR.SSM_OUT, + # NextN/MTP tensors - preserved but unused + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], MODEL_ARCH.QWEN35MOE: [ MODEL_TENSOR.TOKEN_EMBD, @@ -2145,7 +2164,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_NORM, MODEL_TENSOR.SSM_BETA, MODEL_TENSOR.SSM_ALPHA, - MODEL_TENSOR.SSM_OUT + MODEL_TENSOR.SSM_OUT, + # NextN/MTP tensors - preserved but unused + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], MODEL_ARCH.PLAMO: [ MODEL_TENSOR.TOKEN_EMBD, @@ -2914,6 +2940,46 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_EXP_PROBS_B, ], + MODEL_ARCH.DEEPSEEK32: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.INDEXER_K_NORM, + MODEL_TENSOR.INDEXER_PROJ, + MODEL_TENSOR.INDEXER_ATTN_K, + MODEL_TENSOR.INDEXER_ATTN_Q_B, + # NextN/MTP tensors - preserved but unused + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, + ], MODEL_ARCH.ERNIE4_5_MOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -3244,6 +3310,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_POST_NORM, + # NextN/MTP tensors - preserved but unused + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], MODEL_ARCH.EXAONE_MOE: [ MODEL_TENSOR.TOKEN_EMBD, @@ -3999,6 +4072,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.TALKIE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.LAYER_OUT_SCALE, + ], # TODO } @@ -4048,6 +4134,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DEEPSEEK32: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.ROPE_FREQS, ], @@ -4235,6 +4325,7 @@ class VisionProjectorType: LLAMA4 = "llama4" QWEN2VL = "qwen2vl_merger" QWEN25VL = "qwen2.5vl_merger" + EXAONE4_5 = "exaone4_5" QWEN3VL = "qwen3vl_merger" STEP3VL = "step3vl" ULTRAVOX = "ultravox" @@ -4254,12 +4345,12 @@ class VisionProjectorType: JANUS_PRO = "janus_pro" DOTSOCR = "dots_ocr" DEEPSEEKOCR = "deepseekocr" + DEEPSEEKOCR2 = "deepseekocr2" LFM2A = "lfm2a" # audio MUSIC_FLAMINGO = "musicflamingo" # audio GLM4V = "glm4v" YOUTUVL = "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" - HUNYUANOCR = "hunyuanocr" HUNYUANVL = "hunyuanvl" MINICPMV4_6 = "minicpmv4_6" GRANITE_SPEECH = "granite_speech" # audio diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a101382719d..e94b47badb4 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1110,6 +1110,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + def add_normalizer_lowercase(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value) + def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 1d9d9ab7d70..80966b6ef15 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -28,6 +28,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: rows = arr.reshape((-1, arr.shape[-1])) + assert len(rows.shape) osize = 1 for dim in oshape: osize *= dim diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f40cb828201..444f0f2855a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -34,6 +34,7 @@ class TensorNameMap: "encoder", # neobert "model.transformer.wte", # llada "embed_tokens", # qwen3-embedding + "model.embed", # talkie ), # Token type embeddings @@ -259,6 +260,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.q_proj", # nemotron-h + "model.blocks.{bid}.attn.attn_query", # talkie ), # Attention key @@ -279,6 +281,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.k_proj", # nemotron-h + "model.blocks.{bid}.attn.attn_key", # talkie ), # Attention value @@ -298,6 +301,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.v_proj", # nemotron-h + "model.blocks.{bid}.attn.attn_value", # talkie ), # Attention output @@ -336,6 +340,7 @@ class TensorNameMap: "layers.{bid}.self_attn.o_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.o_proj", # nemotron-h "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm + "model.blocks.{bid}.attn.attn_resid", # talkie ), # Attention output norm @@ -508,6 +513,7 @@ class TensorNameMap: "layers.{bid}.mlp.up_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.up_proj", # nemotron-h "model.layers.{bid}.mlp.language_mlp.up_proj", # cogvlm + "model.blocks.{bid}.mlp.mlp_linear", # talkie ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -561,6 +567,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.ff_proj", # llada "layers.{bid}.mlp.gate_proj", # qwen3-embedding "model.layers.{bid}.mlp.language_mlp.gate_proj", # cogvlm + "model.blocks.{bid}.mlp.mlp_gate", # talkie ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -636,6 +643,7 @@ class TensorNameMap: "layers.{bid}.mlp.down_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.down_proj", # nemotron-h "model.layers.{bid}.mlp.language_mlp.down_proj", # cogvlm + "model.blocks.{bid}.mlp.mlp_resid", # talkie ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -682,6 +690,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mixer.q_norm", # plamo3 "layers.{bid}.self_attn.q_norm", # qwen3-embedding "model.layers.{bid}.attention.query_layernorm", # apertus + "model.blocks.{bid}.attn.head_gain.head_g", # talkie ), MODEL_TENSOR.ATTN_K_NORM: ( @@ -716,6 +725,7 @@ class TensorNameMap: MODEL_TENSOR.LAYER_OUT_SCALE: ( "model.layers.{bid}.layer_scalar", # gemma4 + "model.blocks.{bid}.embed_skip.a_g", # talkie ), MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: ( @@ -1366,7 +1376,7 @@ class TensorNameMap: "mlp_AR.linear_{bid}", # PaddleOCR-VL "merger.mlp.{bid}", "vision_tower.merger.mlp.{bid}", # dots.ocr - "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2) + "vit.perceive.proj.{bid}", # HunyuanVL (proj.0 = conv1, proj.2 = conv2) ), MODEL_TENSOR.V_MMPROJ_FC: ( @@ -1374,7 +1384,7 @@ class TensorNameMap: "model.vision.linear_proj.linear_proj", # cogvlm "model.projector.layers", # Deepseek-OCR "visual.merger.proj", # glm4v - "vit.perceive.mlp", # HunyuanOCR + "vit.perceive.mlp", # HunyuanVL ), MODEL_TENSOR.V_MMPROJ_MLP: ( @@ -1403,7 +1413,7 @@ class TensorNameMap: "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM - "vit.embeddings.patch_embedding", # HunyuanOCR + "vit.embeddings.patch_embedding", # HunyuanVL "vision_tower.patch_conv", # pixtral-hf "vision_encoder.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 @@ -1429,7 +1439,7 @@ class TensorNameMap: "model.vision_tower.embeddings.position_embeddings", # Intern-S1 "vpm.embeddings.position_embedding", "model.vision_model.embeddings.position_embedding", # SmolVLM - "vit.embeddings.position_embedding", # HunyuanOCR + "vit.embeddings.position_embedding", # HunyuanVL "vision_model.positional_embedding_vlm", # llama 4 "vision_tower.patch_embed.pos_emb", # kimi-vl "visual.pos_embed", # qwen3vl @@ -1442,12 +1452,12 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( "model.image_newline", # Deepseek-OCR - "vit.perceive.image_newline", # HunyuanOCR + "vit.perceive.image_newline", # HunyuanVL ), MODEL_TENSOR.V_ENC_EMBD_VSEP: ( "model.view_seperator", # Deepseek-OCR - "vit.perceive.image_sep", # HunyuanOCR + "vit.perceive.image_sep", # HunyuanVL ), MODEL_TENSOR.V_ENC_ATTN_QKV: ( @@ -1466,7 +1476,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM - "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.q_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral @@ -1475,6 +1485,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated "vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1490,7 +1501,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM - "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.k_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral @@ -1499,6 +1510,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj", "vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1514,7 +1526,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM - "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.v_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral @@ -1523,6 +1535,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated "vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1532,7 +1545,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM - "vit.layers.{bid}.input_layernorm", # HunyuanOCR + "vit.layers.{bid}.input_layernorm", # HunyuanVL "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4 @@ -1544,6 +1557,7 @@ class TensorNameMap: "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL "vision_tower.blocks.{bid}.norm1", # dots.ocr "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL + "model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_ATTN_O: ( @@ -1553,7 +1567,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM - "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.o_proj", # HunyuanVL "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf @@ -1564,6 +1578,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL + "model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2 "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4 "vision_tower.blocks.{bid}.attn.proj", # dots.ocr "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL @@ -1580,7 +1595,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM - "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR + "vit.layers.{bid}.post_attention_layernorm", # HunyuanVL "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral @@ -1593,6 +1608,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4 "vision_tower.blocks.{bid}.norm2", # dots.ocr "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL + "model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1601,7 +1617,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 - "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR + "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanVL "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral "vision_model.model.layers.{bid}.mlp.fc1", # llama4 @@ -1615,6 +1631,7 @@ class TensorNameMap: "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4 "vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL + "model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_FFN_GATE: ( @@ -1622,6 +1639,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl "vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( @@ -1630,7 +1648,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 - "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR + "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanVL "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral "vision_model.model.layers.{bid}.mlp.fc2", # llama4 @@ -1642,6 +1660,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL + "model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2 "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4 "vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL ), @@ -1689,12 +1708,13 @@ class TensorNameMap: "vision_tower.encoder.final_layernorm", # kimi-vl "visual.post_layernorm", # glm4v "siglip2.vision_model.post_layernorm", + "model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_MM_POST_NORM: ( "visual.merger.post_projection_norm", # glm4v "vision_tower.post_trunk_norm", # dots.ocr - "vit.perceive.after_rms", # HunyuanOCR + "vit.perceive.after_rms", # HunyuanVL ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1869,6 +1889,14 @@ class TensorNameMap: "model.sam_model.net_3", ), + MODEL_TENSOR.V_RESMPL_QUERY_768: ( + "model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2 + ), + + MODEL_TENSOR.V_RESMPL_QUERY_1024: ( + "model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2 + ), + MODEL_TENSOR.V_MM_POST_FC_NORM: ( "model.vision.linear_proj.norm1", # cogvlm ), @@ -1899,15 +1927,15 @@ class TensorNameMap: ), MODEL_TENSOR.V_MM_PRE_NORM: ( - "vit.perceive.before_rms", # HunyuanOCR + "vit.perceive.before_rms", # HunyuanVL ), MODEL_TENSOR.V_TOK_IMG_BEGIN: ( - "vit.perceive.image_begin", # HunyuanOCR + "vit.perceive.image_begin", # HunyuanVL ), MODEL_TENSOR.V_TOK_IMG_END: ( - "vit.perceive.image_end", # HunyuanOCR + "vit.perceive.image_end", # HunyuanVL ), MODEL_TENSOR.V_STD_BIAS: ( diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 09a9b7d1835..27d3845852d 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -52,6 +52,7 @@ class SpecialVocab: add_special_token: dict[str, bool] special_token_ids: dict[str, int] chat_template: str | Sequence[Mapping[str, str]] | None + normalizer_lowercase: bool | None def __init__( self, path: str | os.PathLike[str], load_merges: bool = False, @@ -64,6 +65,7 @@ def __init__( self.load_merges = load_merges self.merges = [] self.chat_template = None + self.normalizer_lowercase = None if special_token_types is not None: self.special_token_types = special_token_types else: @@ -102,6 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if not quiet: logger.info(f'Setting chat_template to {self.chat_template}') gw.add_chat_template(self.chat_template) + if self.normalizer_lowercase is not None: + if not quiet: + logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}') + gw.add_normalizer_lowercase(self.normalizer_lowercase) def _load(self, path: Path) -> None: self._try_load_from_tokenizer_json(path) @@ -146,6 +152,24 @@ def _set_special_token(self, typ: str, tid: Any) -> None: return logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') + def _parse_normalizer(self, normalizer: dict) -> None: + # ref: https://huggingface.co/docs/tokenizers/api/normalizers + # + # Detects lowercase normalization in three possible formats: + # 1. Standalone: {"type": "Lowercase"} + # 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...} + # 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]} + + normalizer_type = normalizer.get('type') + if normalizer_type == 'Lowercase': + self.normalizer_lowercase = True + elif normalizer_type == 'BertNormalizer': + if 'lowercase' in normalizer: + self.normalizer_lowercase = normalizer['lowercase'] + elif normalizer_type == 'Sequence': + for norm in normalizer.get('normalizers', []): + self._parse_normalizer(norm) + def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer = None tokenizer_file = path / 'tokenizer.json' @@ -178,6 +202,9 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: ] else: raise ValueError("Unknown tokenizer merges format") + # Parse normalizer configuration (e.g. Lowercase) into metadata + if normalizer := tokenizer.get('normalizer'): + self._parse_normalizer(normalizer) added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index b2a36460ad8..d11c34a2186 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -37,7 +37,7 @@ packages = [ [tool.poetry.dependencies] python = ">=3.10" -[tool.poetry.dev-dependencies] +[tool.poetry.group.dev.dependencies] pytest = "^5.2" [build-system] diff --git a/include/llama.h b/include/llama.h index 308e8ba9dbd..a79a491c592 100644 --- a/include/llama.h +++ b/include/llama.h @@ -198,6 +198,11 @@ extern "C" { LLAMA_SPLIT_MODE_TENSOR = 3, }; + enum llama_context_type { + LLAMA_CONTEXT_TYPE_DEFAULT = 0, + LLAMA_CONTEXT_TYPE_MTP = 1, + }; + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id @@ -333,9 +338,12 @@ extern "C" { uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) + uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] + uint32_t n_outputs_max; // max outputs in a ubatch (0 = n_batch) int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing + enum llama_context_type ctx_type; // set the context type (e.g. MTP) enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_attention_type attention_type; // attention type to use for embeddings @@ -530,6 +538,7 @@ extern "C" { LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead"); @@ -866,7 +875,8 @@ extern "C" { // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba) #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1 -// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +// Keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load). +// Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag. #define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2 typedef uint32_t llama_state_seq_flags; diff --git a/models/templates/ibm-granite-granite-4.1.jinja b/models/templates/ibm-granite-granite-4.1.jinja new file mode 100644 index 00000000000..903cac6443f --- /dev/null +++ b/models/templates/ibm-granite-granite-4.1.jinja @@ -0,0 +1,114 @@ +{%- set tools_system_message_prefix = 'You are a helpful assistant with access to the following tools. You may call one or more tools to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' %} +{%- set tools_system_message_suffix = '\n\n\nFor each tool call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.' %} +{%- set documents_system_message_prefix = 'You are a helpful assistant with access to the following documents. You may use one or more documents to assist with the user query.\n\nYou are given a list of documents within XML tags:\n' %} +{%- set documents_system_message_suffix = '\n\n\nWrite the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.' %} +{%- if available_tools is defined and available_tools %} + {%- set tools = available_tools %} +{%- endif %} +{%- set ns = namespace(tools_system_message=tools_system_message_prefix, + documents_system_message=documents_system_message_prefix, + system_message='' + ) %} +{%- if tools %} + {%- for tool in tools %} + {%- set ns.tools_system_message = ns.tools_system_message + '\n' + (tool | tojson) %} + {%- endfor %} + {%- set ns.tools_system_message = ns.tools_system_message + tools_system_message_suffix %} +{%- else %} + {%- set ns.tools_system_message = '' %} +{%- endif %} +{%- if documents %} + {%- for document in documents %} + {%- set ns.documents_system_message = ns.documents_system_message + '\n' + (document | tojson) %} + {%- endfor %} + {%- set ns.documents_system_message = ns.documents_system_message + documents_system_message_suffix %} +{%- else %} + {%- set ns.documents_system_message = '' %} +{%- endif %} +{%- if messages[0].role == 'system' %} + {%- if messages[0].content is string %} + {%- set ns.system_message = messages[0].content %} + {%- elif messages[0].content is iterable %} + {%- for entry in messages[0].content %} + {%- if entry.type== 'text' %} + {%- if ns.system_message != '' %} + {%- set ns.system_message = ns.system_message + '\n' %} + {%- endif %} + {%- set ns.system_message = ns.system_message + entry.text %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- if tools and documents %} + {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message + '\n\n' + ns.documents_system_message %} + {%- elif tools %} + {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message %} + {%- elif documents %} + {%- set ns.system_message = ns.system_message + '\n\n' + ns.documents_system_message %} + {%- endif %} +{%- else %} + {%- if tools and documents %} + {%- set ns.system_message = ns.tools_system_message + '\n\n' + ns.documents_system_message %} + {%- elif tools %} + {%- set ns.system_message = ns.tools_system_message %} + {%- elif documents %} + {%- set ns.system_message = ns.documents_system_message %} + {%- endif %} +{%- endif %} +{%- if ns.system_message %} + {{- '<|start_of_role|>system<|end_of_role|>' + ns.system_message + '<|end_of_text|>\n' }} +{%- endif %} +{%- for message in messages %} + {%- set content = namespace(val='') %} + {%- if message.content is string %} + {%- set content.val = message.content %} + {%- else %} + {%- if message.content is iterable %} + {%- for entry in message.content %} + {%- if entry.type== 'text' %} + {%- if content.val != '' %} + {%- set content.val = content.val + '\n' %} + {%- endif %} + {%- set content.val = content.val + entry.text %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) %} + {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val + '<|end_of_text|>\n' }} + {%- elif message.role == 'assistant' %} + {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content.val) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|end_of_text|>\n' }} + {%- elif message.role == 'tool' %} + {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %} + {{- '<|start_of_role|>user<|end_of_role|>' }} + {%- endif %} + {{- '\n\n' }} + {{- content.val }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %} + {{- '<|end_of_text|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_of_role|>assistant<|end_of_role|>' }} +{%- endif %} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a39c4f010bb..e4f8c86b951 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,13 +6,13 @@ version = "0.0.0" dynamic = ["classifiers"] readme = "README.md" authors = [{name = "GGML", email = "ggml@ggml.ai"}] -requires-python = '>=3.10' +requires-python = '>=3.10,<3.15' dependencies = [ - 'numpy (>=1.25.0,<2.0.0)', + 'numpy (>=1.26.4,<3.0.0)', 'sentencepiece (>=0.1.98,<0.3.0)', 'transformers (==5.5.1)', - 'protobuf (>=4.21.0)', - 'torch (>=2.2.0,<3.0.0)', + 'protobuf (>=4.21.0,<5.0.0)', + 'torch (>=2.6.0,<3.0.0)', 'gguf @ ./gguf-py', ] classifiers = [ @@ -32,17 +32,20 @@ llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main" llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main" [tool.poetry] -packages = [{ include = "*.py", from = "." }] +packages = [ + { include = "*.py", from = "." }, + { include = "conversion", from = "." }, +] [tool.poetry.dependencies] torch = [ - { version = "~=2.6.0", source = "pypi", markers = "sys_platform == 'darwin'" }, - { version = "~=2.6.0+cpu", source = "pytorch", markers = "sys_platform == 'linux'" }, - { version = "~=2.6.0", source = "pypi", markers = "sys_platform == 'win32'" } + { version = "==2.11.0", source = "pypi", markers = "sys_platform == 'darwin'" }, + { version = "==2.11.0+cpu", source = "pytorch", markers = "sys_platform == 'linux'" }, + { version = "==2.11.0", source = "pypi", markers = "sys_platform == 'win32'" }, ] [tool.poetry.group.dev.dependencies] -pytest = "^5.2" +pytest = "~=8.3.3" # Force wheel + cpu # For discussion and context see https://github.com/python-poetry/poetry#6409 diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index 122b4788d91..f80fdc1f640 100644 --- a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -1,8 +1,8 @@ -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -## Embedding Gemma requires PyTorch 2.6.0 or later -torch~=2.6.0; platform_machine != "s390x" +## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility +torch==2.11.0; platform_machine != "s390x" # torch s390x packages can only be found from nightly builds --extra-index-url https://download.pytorch.org/whl/nightly diff --git a/requirements/requirements-server-bench.txt b/requirements/requirements-server-bench.txt index ea5849fa104..fb3b0d2664b 100644 --- a/requirements/requirements-server-bench.txt +++ b/requirements/requirements-server-bench.txt @@ -1,4 +1,4 @@ -datasets~=3.2.0 +datasets~=4.8.0 matplotlib~=3.10.0 numpy~=1.26.4 requests~=2.32.3 diff --git a/scripts/server-bench.py b/scripts/server-bench.py index 1b557a495a5..2eabb3bce85 100755 --- a/scripts/server-bench.py +++ b/scripts/server-bench.py @@ -25,7 +25,7 @@ def get_prompts_text(dataset_name: str, n_prompts: int) -> Optional[list[str]]: ret = [] if dataset_name.lower() == "mmlu": logger.info("Loading MMLU dataset...") - ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"] # type: ignore + ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"] else: return None if n_prompts >= 0: diff --git a/scripts/snapdragon/adb/run-bench.sh b/scripts/snapdragon/adb/run-bench.sh index 27459df241b..bbe7146b444 100755 --- a/scripts/snapdragon/adb/run-bench.sh +++ b/scripts/snapdragon/adb/run-bench.sh @@ -45,5 +45,5 @@ adb $adbserial $adbhost shell " \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ $ndev $nhvx $opmask $verbose $profile $hb ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ubatch-size 256 -fa 1 -ngl 99 $cli_opts $@ \ + --ubatch-size 1024 -fa 1 -ngl 99 $cli_opts $@ \ " diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh index 3f89a7777c6..48127dfa252 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -73,6 +73,6 @@ adb $adbserial $adbhost shell " \ $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \ ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --ubatch-size 256 -fa on \ + --ctx-size 8192 --ubatch-size 1024 -fa on \ -ngl 99 --device $device $cli_opts $@ \ " diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh index 1b904c77b3f..fe14bb14225 100755 --- a/scripts/snapdragon/adb/run-completion.sh +++ b/scripts/snapdragon/adb/run-completion.sh @@ -51,6 +51,9 @@ opbatch= opqueue= [ "$OQ" != "" ] && opqueue="GGML_HEXAGON_OPQUEUE=$OQ" +oppoll= +[ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP" + opflt= [ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF" @@ -66,9 +69,9 @@ adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $opflt $vmem $mbuf \ + $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $vmem $mbuf \ ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --ubatch-size 256 -fa on \ - -ngl 99 --device $device $cli_opts $@ \ + --ctx-size 8192 --ubatch-size 1024 -fa on \ + -ngl 99 --device $device $cli_opts $@ \ " diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh index 38467beba3d..992045cb9b3 100755 --- a/scripts/snapdragon/adb/run-mtmd.sh +++ b/scripts/snapdragon/adb/run-mtmd.sh @@ -66,6 +66,6 @@ adb $adbserial $adbhost shell " \ --mmproj $basedir/../gguf/$mmproj \ --image $basedir/../gguf/$image \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ - --ctx-size 8192 --ubatch-size 256 -fa on \ + --ctx-size 8192 --ubatch-size 1024 -fa on \ -ngl 99 --device $device -v $cli_opts $@ \ " diff --git a/scripts/snapdragon/adb/run-tool.sh b/scripts/snapdragon/adb/run-tool.sh index 27cbb2b6d05..6d7e32b3218 100755 --- a/scripts/snapdragon/adb/run-tool.sh +++ b/scripts/snapdragon/adb/run-tool.sh @@ -42,6 +42,15 @@ ndev= hb= [ "$HB" != "" ] && hb="GGML_HEXAGON_HOSTBUF=$HB" +opbatch= +[ "$OB" != "" ] && opbatch="GGML_HEXAGON_OPBATCH=$OB" + +opqueue= +[ "$OQ" != "" ] && opqueue="GGML_HEXAGON_OPQUEUE=$OQ" + +oppoll= +[ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP" + set -x tool=$1; shift @@ -50,5 +59,5 @@ adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb ./$branch/bin/$tool $@ \ + $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll ./$branch/bin/$tool $@ \ " diff --git a/scripts/snapdragon/ggml-hexagon-profile.py b/scripts/snapdragon/ggml-hexagon-profile.py index 3edaacd2749..aa1f20dcc23 100755 --- a/scripts/snapdragon/ggml-hexagon-profile.py +++ b/scripts/snapdragon/ggml-hexagon-profile.py @@ -24,7 +24,7 @@ } op_pattern = re.compile( - r"profile-op\s+(?P[A-Z_0-9]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P\d+)\s+cycles\s+(?P\d+)(?:\s+pmu\s+\[(?P[\d,\s]+)\])?" + r"profile-op\s+(?P[A-Z_0-9+]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P\d+)\s+cycles\s+(?P\d+)(?:\s+pmu\s+\[(?P[\d,\s]+)\])?" ) logger = logging.getLogger("ggml-hexagon-profile") diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1 index 8bf6939d2c0..5ee81df6889 100644 --- a/scripts/snapdragon/windows/run-bench.ps1 +++ b/scripts/snapdragon/windows/run-bench.ps1 @@ -45,4 +45,4 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-bench.exe" ` --mmap 0 -m $basedir\..\..\gguf\$model ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --batch-size 128 -ngl 99 --device $device $cli_opts + --ubatch-size 1024 -ngl 99 --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1 index 104452f9ba7..b51149bec25 100644 --- a/scripts/snapdragon/windows/run-cli.ps1 +++ b/scripts/snapdragon/windows/run-cli.ps1 @@ -49,5 +49,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-cli.exe" ` --no-mmap -m $basedir\..\..\gguf\$model ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --ctx-size 8192 --ubatch-size 256 -fa on ` + --ctx-size 8192 --ubatch-size 1024 -fa on ` -ngl 99 --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-completion.ps1 b/scripts/snapdragon/windows/run-completion.ps1 index 5841a82fa99..ffce8184dc0 100644 --- a/scripts/snapdragon/windows/run-completion.ps1 +++ b/scripts/snapdragon/windows/run-completion.ps1 @@ -49,5 +49,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-completion.exe" ` --no-mmap -m $basedir\..\..\gguf\$model ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --ctx-size 8192 --batch-size 256 -fa on ` + --ctx-size 8192 --ubatch-size 1024 -fa on ` -ngl 99 -no-cnv --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-mtmd.ps1 b/scripts/snapdragon/windows/run-mtmd.ps1 index be817875142..b38fae35fe4 100644 --- a/scripts/snapdragon/windows/run-mtmd.ps1 +++ b/scripts/snapdragon/windows/run-mtmd.ps1 @@ -64,5 +64,5 @@ $env:ADSP_LIBRARY_PATH="$basedir\lib" --mmproj $basedir\..\..\gguf\$mmproj ` --image $basedir\..\..\gguf\$image ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --ctx-size 8192 --ubatch-size 256 -fa on ` + --ctx-size 8192 --ubatch-size 1024 -fa on ` -ngl 99 --device $device -v $cli_opts diff --git a/scripts/snapdragon/windows/setup-build.ps1 b/scripts/snapdragon/windows/setup-build.ps1 index 0f3244cc9d2..d8ef24d4413 100644 --- a/scripts/snapdragon/windows/setup-build.ps1 +++ b/scripts/snapdragon/windows/setup-build.ps1 @@ -7,10 +7,10 @@ $ErrorActionPreference = "Stop" $BaseDir = "C:\Qualcomm" # SDK 1: Hexagon -$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz" +$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz" $HexagonParent = Join-Path $BaseDir "Hexagon_SDK" -$HexagonSdkVersion = "6.4.0.2" -$HexagonToolsVersion = "19.0.04" +$HexagonSdkVersion = "6.6.0.0" +$HexagonToolsVersion = "19.0.07" $HexagonSdkTarget = Join-Path $HexagonParent $HexagonSdkVersion $HexagonToolsTarget = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion" diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 15685a0718f..538ef80bc7a 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -628249b398293fc8d2fa81a449ae2920a02c6523 +1e33fed33e87c43aa4c4078e2a9c239d4c1f1bd3 diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index 0ddef236a4c..8306cf93ec5 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -5,7 +5,7 @@ import sys import subprocess -HTTPLIB_VERSION = "refs/tags/v0.44.0" +HTTPLIB_VERSION = "refs/tags/v0.46.1" vendor = { "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", diff --git a/scripts/ui-assets.cmake b/scripts/ui-assets.cmake new file mode 100644 index 00000000000..ae7a1cc26d3 --- /dev/null +++ b/scripts/ui-assets.cmake @@ -0,0 +1,342 @@ +# Provision UI assets and generate ui.cpp/ui.h. +# +# Asset provisioning priority: +# 1. Pre-built assets in SRC_DIST_DIR (manually built by user) +# 2. If BUILD_UI=ON: npm build +# 3. If above did not produce assets and HF_ENABLED=ON: HF Bucket download + +cmake_minimum_required(VERSION 3.16) + +set(UI_SOURCE_DIR "" CACHE STRING "UI source directory (to run npm build)") +set(UI_BINARY_DIR "" CACHE STRING "UI binary directory (to store generated files)") +set(LLAMA_SOURCE_DIR "" CACHE STRING "Project source root (to resolve version from git)") +set(HF_BUCKET "" CACHE STRING "Hugging Face bucket name") +set(HF_VERSION "" CACHE STRING "Version to download (empty = resolve from git)") +set(HF_ENABLED "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)") +set(BUILD_UI "" CACHE STRING "Build UI via npm (ON/OFF)") +set(LLAMA_UI_EMBED "" CACHE STRING "Path to llama-ui-embed helper") + +set(ASSETS + bundle.css + bundle.js + index.html + loading.html +) + +set(DIST_DIR "${UI_BINARY_DIR}/dist") +set(SRC_DIST_DIR "${UI_SOURCE_DIR}/dist") +set(STAMP_FILE "${UI_BINARY_DIR}/.ui-stamp") +set(UI_CPP "${UI_BINARY_DIR}/ui.cpp") +set(UI_H "${UI_BINARY_DIR}/ui.h") + +function(assets_present out_var) + set(present TRUE) + foreach(asset ${ASSETS}) + if(NOT EXISTS "${DIST_DIR}/${asset}") + set(present FALSE) + break() + endif() + endforeach() + set(${out_var} ${present} PARENT_SCOPE) +endfunction() + +function(copy_src_dist out_var) + set(${out_var} FALSE PARENT_SCOPE) + + foreach(asset ${ASSETS}) + if(NOT EXISTS "${SRC_DIST_DIR}/${asset}") + return() + endif() + endforeach() + + file(MAKE_DIRECTORY "${DIST_DIR}") + message(STATUS "UI: using pre-built assets from ${SRC_DIST_DIR}") + foreach(asset ${ASSETS}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${SRC_DIST_DIR}/${asset}" "${DIST_DIR}/${asset}" + ) + endforeach() + set(${out_var} TRUE PARENT_SCOPE) +endfunction() + +function(npm_build_should_skip out_var) + set(${out_var} FALSE PARENT_SCOPE) + + assets_present(present) + if(NOT present) + return() + endif() + + if(EXISTS "${STAMP_FILE}") + return() + endif() + + if(NOT EXISTS "${UI_SOURCE_DIR}/sources.cmake") + return() + endif() + include("${UI_SOURCE_DIR}/sources.cmake") + + set(globs "") + foreach(g ${UI_SOURCE_GLOBS}) + list(APPEND globs "${UI_SOURCE_DIR}/${g}") + endforeach() + file(GLOB_RECURSE sources ${globs}) + foreach(f ${UI_SOURCE_FILES}) + list(APPEND sources "${UI_SOURCE_DIR}/${f}") + endforeach() + + file(TIMESTAMP "${DIST_DIR}/index.html" out_ts) + + foreach(s ${sources}) + if(NOT EXISTS "${s}") + continue() + endif() + file(TIMESTAMP "${s}" s_ts) + if(s_ts STRGREATER out_ts) + return() + endif() + endforeach() + + set(${out_var} TRUE PARENT_SCOPE) +endfunction() + +function(npm_build out_var) + set(${out_var} FALSE PARENT_SCOPE) + + if(NOT EXISTS "${UI_SOURCE_DIR}/package.json") + message(STATUS "UI: ${UI_SOURCE_DIR}/package.json not found, skipping npm") + return() + endif() + + npm_build_should_skip(skip) + if(skip) + message(STATUS "UI: npm output up-to-date, skipping build") + set(${out_var} TRUE PARENT_SCOPE) + return() + endif() + + if(CMAKE_HOST_WIN32) + find_program(NPM_EXECUTABLE NAMES npm.cmd npm.bat npm) + else() + find_program(NPM_EXECUTABLE npm) + endif() + if(NOT NPM_EXECUTABLE) + message(STATUS "UI: npm not found, skipping npm build") + return() + endif() + + if(NOT EXISTS "${UI_SOURCE_DIR}/node_modules") + message(STATUS "UI: running npm install (first time)") + execute_process( + COMMAND ${NPM_EXECUTABLE} install + WORKING_DIRECTORY "${UI_SOURCE_DIR}" + RESULT_VARIABLE rc + ERROR_VARIABLE err + ) + if(NOT rc EQUAL 0) + message(STATUS "UI: npm install failed (${rc})") + message(STATUS " stderr: ${err}") + return() + endif() + endif() + + file(MAKE_DIRECTORY "${DIST_DIR}") + + message(STATUS "UI: running npm run build, output -> ${DIST_DIR}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" + ${NPM_EXECUTABLE} run build + WORKING_DIRECTORY "${UI_SOURCE_DIR}" + RESULT_VARIABLE rc + ERROR_VARIABLE err + ) + if(NOT rc EQUAL 0) + message(STATUS "UI: npm run build failed (${rc})") + message(STATUS " stderr: ${err}") + return() + endif() + + assets_present(present) + if(NOT present) + message(STATUS "UI: npm build finished but assets missing in ${DIST_DIR}") + return() + endif() + + message(STATUS "UI: npm build succeeded") + file(REMOVE "${STAMP_FILE}") + set(${out_var} TRUE PARENT_SCOPE) +endfunction() + +function(resolve_version out_var) + if(NOT "${HF_VERSION}" STREQUAL "") + set(${out_var} "${HF_VERSION}" PARENT_SCOPE) + return() + endif() + + if(EXISTS "${LLAMA_SOURCE_DIR}/cmake/build-info.cmake") + include("${LLAMA_SOURCE_DIR}/cmake/build-info.cmake") + if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0) + set(${out_var} "b${BUILD_NUMBER}" PARENT_SCOPE) + return() + endif() + endif() + + set(${out_var} "" PARENT_SCOPE) +endfunction() + +function(hf_download version out_var out_resolved) + set(${out_var} FALSE PARENT_SCOPE) + set(${out_resolved} "" PARENT_SCOPE) + + file(MAKE_DIRECTORY "${DIST_DIR}") + + set(candidates "") + if(NOT "${version}" STREQUAL "") + list(APPEND candidates "${version}") + endif() + list(APPEND candidates "latest") + + foreach(resolved ${candidates}) + set(base "https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/${resolved}") + + message(STATUS "UI: downloading from ${resolved}: ${base}") + + set(ok TRUE) + foreach(asset ${ASSETS}) + file(DOWNLOAD "${base}/${asset}?download=true" "${DIST_DIR}/${asset}" + STATUS status TIMEOUT 60 + ) + list(GET status 0 rc) + if(NOT rc EQUAL 0) + list(GET status 1 errmsg) + message(STATUS "UI: download ${asset} from ${resolved} failed: ${errmsg}") + set(ok FALSE) + break() + endif() + message(STATUS "UI: downloaded ${asset}") + endforeach() + + if(NOT ok) + continue() + endif() + + # Best-effort checksum verification + file(DOWNLOAD "${base}/checksums.txt?download=true" "${DIST_DIR}/checksums.txt" + STATUS cs_status TIMEOUT 30 + ) + list(GET cs_status 0 cs_rc) + if(cs_rc EQUAL 0) + message(STATUS "UI: verifying checksums") + file(STRINGS "${DIST_DIR}/checksums.txt" cs_lines) + foreach(asset ${ASSETS}) + file(SHA256 "${DIST_DIR}/${asset}" h) + string(TOLOWER "${h}" h) + string(REGEX MATCH "${h}[ \t]+${asset}" m "${cs_lines}") + if(NOT m) + message(WARNING "UI: checksum verification failed for ${asset}") + set(ok FALSE) + break() + endif() + endforeach() + if(ok) + message(STATUS "UI: all checksums verified") + endif() + endif() + + if(ok) + set(${out_var} TRUE PARENT_SCOPE) + set(${out_resolved} "${resolved}" PARENT_SCOPE) + return() + endif() + endforeach() +endfunction() + +function(emit_files) + assets_present(present) + + set(args "${UI_CPP}" "${UI_H}") + if(present) + foreach(asset ${ASSETS}) + list(APPEND args "${asset}" "${DIST_DIR}/${asset}") + endforeach() + endif() + + execute_process( + COMMAND "${LLAMA_UI_EMBED}" ${args} + RESULT_VARIABLE rc + ) + if(NOT rc EQUAL 0) + message(FATAL_ERROR "UI: llama-ui-embed failed (${rc})") + endif() +endfunction() + +# --------------------------------------------------------------------------- +# 1. Priority 1: pre-built assets supplied in tools/ui/dist +# --------------------------------------------------------------------------- +copy_src_dist(SRC_OK) +if(SRC_OK) + emit_files() + return() +endif() + +# --------------------------------------------------------------------------- +# 2. Priority 2: npm build (if BUILD_UI=ON) +# --------------------------------------------------------------------------- +set(provisioned FALSE) + +if(BUILD_UI) + npm_build(NPM_OK) + if(NPM_OK) + set(provisioned TRUE) + endif() +endif() + +# --------------------------------------------------------------------------- +# 3. Priority 3: HF Bucket download (if npm did not produce assets and HF_ENABLED=ON) +# --------------------------------------------------------------------------- +if(NOT provisioned AND HF_ENABLED) + resolve_version(VERSION) + + set(stamp_ok FALSE) + if(EXISTS "${STAMP_FILE}" AND NOT "${VERSION}" STREQUAL "") + file(READ "${STAMP_FILE}" stamped) + string(STRIP "${stamped}" stamped) + if("${stamped}" STREQUAL "${VERSION}") + set(stamp_ok TRUE) + endif() + endif() + + assets_present(have_assets) + if(stamp_ok AND have_assets) + message(STATUS "UI: HF stamp '${stamped}' matches version, skipping HF fetch") + set(provisioned TRUE) + else() + hf_download("${VERSION}" HF_OK HF_RESOLVED) + if(HF_OK) + file(WRITE "${STAMP_FILE}" "${HF_RESOLVED}") + message(STATUS "UI: HF download succeeded, stamp updated (${HF_RESOLVED})") + set(provisioned TRUE) + else() + message(STATUS "UI: HF download failed") + endif() + endif() +endif() + +# --------------------------------------------------------------------------- +# 4. Fallback: warn about stale or missing assets, then emit whatever we have +# --------------------------------------------------------------------------- +if(NOT provisioned) + assets_present(have_assets) + if(have_assets) + message(WARNING "UI: provisioning failed; embedding stale assets from ${DIST_DIR}") + else() + message(WARNING "UI: no assets available - building without an embedded UI. " + "In a disconnected environment, download the pre-built UI " + "from a llama.cpp release at " + "https://github.com/ggml-org/llama.cpp/releases and " + "extract to tools/ui/dist.") + endif() +endif() + +emit_files() diff --git a/scripts/wc2wt.sh b/scripts/wc2wt.sh index 157881b458f..b6e92f86934 100755 --- a/scripts/wc2wt.sh +++ b/scripts/wc2wt.sh @@ -37,7 +37,7 @@ dir=$(basename $(pwd)) # sanitize branch name for directory name (replace / with -) dir_suffix=$(echo "$BRANCH" | tr '/' '-') -git worktree add -b "$BRANCH" "../$dir-$dir_suffix" HEAD +git worktree add "../$dir-$dir_suffix" "$BRANCH" || git worktree add -b "$BRANCH" "../$dir-$dir_suffix" HEAD og_path=$(pwd) wt_path=$(cd "../$dir-$dir_suffix" && pwd) diff --git a/scripts/webui-download.cmake b/scripts/webui-download.cmake deleted file mode 100644 index 344ba5ecf53..00000000000 --- a/scripts/webui-download.cmake +++ /dev/null @@ -1,222 +0,0 @@ -# Download webui assets from Hugging Face Bucket at build time -# Usage: cmake -DPUBLIC_DIR=... -DHF_BUCKET=... -DHF_VERSION=... -DASSETS="a;b;c" -P scripts/webui-download.cmake -# -# Asset provisioning priority: -# 1. Pre-built assets already in PUBLIC_DIR (cached from a previous run) -# 2. Local npm build (if NPM_DIR is provided and has package.json) -# 3. Hugging Face Bucket download (version-specific, then 'latest' fallback) - -cmake_minimum_required(VERSION 3.16) - -set(PUBLIC_DIR "" CACHE STRING "Directory to store/download assets") -set(HF_BUCKET "" CACHE STRING "Hugging Face bucket name") -set(HF_VERSION "" CACHE STRING "Version to download (empty = resolve from git)") -set(ASSETS "" CACHE STRING "Plus-separated list of asset filenames (+)") -set(STAMP_FILE "" CACHE STRING "Stamp file to create on success (optional)") -set(SOURCE_DIR "" CACHE STRING "Project source root (to resolve version from git)") -set(NPM_DIR "" CACHE STRING "WebUI source directory (to run npm build)") -set(HF_ENABLED "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)") - -# --------------------------------------------------------------------------- -# 1. Resolve version from git if not provided at configure time -# --------------------------------------------------------------------------- -set(RESOLVED_VERSION "${HF_VERSION}") -if("${RESOLVED_VERSION}" STREQUAL "" AND NOT "${SOURCE_DIR}" STREQUAL "") - if(EXISTS "${SOURCE_DIR}/cmake/build-info.cmake") - include("${SOURCE_DIR}/cmake/build-info.cmake") - if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0) - set(RESOLVED_VERSION "${BUILD_NUMBER}") - message(STATUS "WebUI: resolved version from git: ${RESOLVED_VERSION}") - endif() - endif() -endif() - -# Convert + back to CMake list (+ is used as separator instead of ; to -# avoid platform-specific escaping issues when passing via -D arguments) -string(REGEX REPLACE "\\+" ";" ASSETS "${ASSETS}") - -# --------------------------------------------------------------------------- -# 2. Check stamp freshness โ€” re-download if resolved version changed -# --------------------------------------------------------------------------- -set(FORCE_REBUILD FALSE) -if(NOT "${STAMP_FILE}" STREQUAL "" AND EXISTS "${STAMP_FILE}") - file(READ "${STAMP_FILE}" STAMPED_VERSION) - string(STRIP "${STAMPED_VERSION}" STAMPED_VERSION) - if(NOT "${STAMPED_VERSION}" STREQUAL "${RESOLVED_VERSION}") - message(STATUS "WebUI: version changed (${STAMPED_VERSION} -> ${RESOLVED_VERSION}), re-building") - set(FORCE_REBUILD TRUE) - endif() -endif() - -# --------------------------------------------------------------------------- -# 3. Check if assets already exist (cached from a previous run) -# --------------------------------------------------------------------------- -set(ALL_EXISTS TRUE) -foreach(asset ${ASSETS}) - if(NOT EXISTS "${PUBLIC_DIR}/${asset}") - set(ALL_EXISTS FALSE) - break() - endif() -endforeach() - -if(ALL_EXISTS AND NOT FORCE_REBUILD) - message(STATUS "WebUI: all assets already exist in ${PUBLIC_DIR}, skipping") - return() -endif() - -file(MAKE_DIRECTORY "${PUBLIC_DIR}") - -# --------------------------------------------------------------------------- -# 4. Priority 2: build from source via npm (fast path for developers) -# --------------------------------------------------------------------------- -set(PROVISION_SUCCESS FALSE) - -if(NOT PROVISION_SUCCESS AND NOT "${NPM_DIR}" STREQUAL "") - if(EXISTS "${NPM_DIR}/package.json") - # Check if npm is available before attempting npm build - find_program(NPM_EXECUTABLE npm) - if(NPM_EXECUTABLE) - message(STATUS "WebUI: building from source in ${NPM_DIR}") - - # Run npm install if node_modules is missing - if(NOT EXISTS "${NPM_DIR}/node_modules") - message(STATUS "WebUI: running npm install (first time)") - execute_process( - COMMAND npm install - WORKING_DIRECTORY "${NPM_DIR}" - RESULT_VARIABLE NPM_INSTALL_RESULT - OUTPUT_VARIABLE NPM_OUT - ERROR_VARIABLE NPM_ERR - ) - if(NOT NPM_INSTALL_RESULT EQUAL 0) - message(STATUS "WebUI: npm install failed (${NPM_INSTALL_RESULT}), falling back to download") - message(STATUS " stderr: ${NPM_ERR}") - endif() - endif() - - # Run the build - execute_process( - COMMAND npm run build - WORKING_DIRECTORY "${NPM_DIR}" - RESULT_VARIABLE NPM_BUILD_RESULT - OUTPUT_VARIABLE NPM_OUT - ERROR_VARIABLE NPM_ERR - ) - - if(NPM_BUILD_RESULT EQUAL 0) - # Verify that the expected assets were produced - set(ALL_BUILT TRUE) - foreach(asset ${ASSETS}) - if(NOT EXISTS "${PUBLIC_DIR}/${asset}") - set(ALL_BUILT FALSE) - break() - endif() - endforeach() - - if(ALL_BUILT) - message(STATUS "WebUI: local npm build succeeded") - set(PROVISION_SUCCESS TRUE) - else() - message(STATUS "WebUI: npm build completed but assets missing from ${PUBLIC_DIR}, falling back to download") - endif() - else() - message(STATUS "WebUI: npm build failed (${NPM_BUILD_RESULT}), falling back to download") - message(STATUS " stderr: ${NPM_ERR}") - endif() - else() - message(STATUS "WebUI: npm not found, skipping npm build and trying HF Bucket download") - endif() - else() - message(STATUS "WebUI: NPM_DIR (${NPM_DIR}) has no package.json, skipping npm build") - endif() -endif() - -# --------------------------------------------------------------------------- -# 5. Priority 3: download from Hugging Face Bucket (if enabled) -# --------------------------------------------------------------------------- -if(NOT PROVISION_SUCCESS AND HF_ENABLED) - # Build list of URLs to try โ€” version-specific first, then 'latest' - set(URL_ENTRIES "") - if(NOT "${RESOLVED_VERSION}" STREQUAL "") - list(APPEND URL_ENTRIES - "version:https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/${RESOLVED_VERSION}") - endif() - list(APPEND URL_ENTRIES - "latest:https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/latest") - - foreach(entry ${URL_ENTRIES}) - string(REGEX REPLACE "^([^:]+):.*$" "\\1" url_label "${entry}") - string(REGEX REPLACE "^[^:]+:(.*)$" "\\1" base_url "${entry}") - - message(STATUS "WebUI: downloading assets from ${url_label}: ${base_url}") - - # Download each asset - set(ALL_OK TRUE) - foreach(asset ${ASSETS}) - set(download_url "${base_url}/${asset}?download=true") - set(download_path "${PUBLIC_DIR}/${asset}") - file(DOWNLOAD "${download_url}" "${download_path}" - STATUS download_status TIMEOUT 60 - ) - list(GET download_status 0 download_result) - if(NOT download_result EQUAL 0) - list(GET download_status 1 error_message) - message(STATUS "WebUI: failed to download ${asset} from ${url_label}: ${error_message}") - set(ALL_OK FALSE) - break() - endif() - message(STATUS "WebUI: downloaded ${asset}") - endforeach() - - if(NOT ALL_OK) - continue() - endif() - - # Verify checksums if the server provides them - file(DOWNLOAD "${base_url}/checksums.txt?download=true" - "${PUBLIC_DIR}/checksums.txt" - STATUS checksum_status TIMEOUT 30 - ) - list(GET checksum_status 0 checksum_result) - if(checksum_result EQUAL 0) - message(STATUS "WebUI: verifying checksums...") - file(STRINGS "${PUBLIC_DIR}/checksums.txt" CHECKSUMS_CONTENT) - foreach(asset ${ASSETS}) - set(download_path "${PUBLIC_DIR}/${asset}") - file(SHA256 "${download_path}" asset_hash) - string(TOUPPER "${asset_hash}" EXPECTED_HASH_UPPER) - string(REGEX MATCH "${EXPECTED_HASH_UPPER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}") - if(NOT CHECKSUM_LINE) - message(WARNING "WebUI: checksum verification failed for ${asset}") - message(WARNING " downloaded file may not match expected checksum, but will be used") - endif() - endforeach() - if(ALL_OK) - message(STATUS "WebUI: all checksums verified") - endif() - endif() - - if(ALL_OK) - set(PROVISION_SUCCESS TRUE) - break() - endif() - endforeach() - - if(PROVISION_SUCCESS) - message(STATUS "WebUI: provisioning complete") - else() - message(WARNING "WebUI: failed to download assets from HF Bucket (${HF_BUCKET})") - endif() -endif() - -# --------------------------------------------------------------------------- -# 6. Write stamp file on success (stores resolved version for freshness check) -# --------------------------------------------------------------------------- -if(PROVISION_SUCCESS) - if(NOT "${STAMP_FILE}" STREQUAL "") - file(WRITE "${STAMP_FILE}" "${RESOLVED_VERSION}") - endif() -else() - message(WARNING "WebUI: no source available. Neither local build (${NPM_DIR}) nor HF Bucket download succeeded.") - message(WARNING "WebUI: building server without embedded WebUI. Set LLAMA_BUILD_WEBUI=OFF to suppress this warning.") -endif() diff --git a/scripts/xxd.cmake b/scripts/xxd.cmake deleted file mode 100644 index 14d2753808a..00000000000 --- a/scripts/xxd.cmake +++ /dev/null @@ -1,16 +0,0 @@ -# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}` -# Usage: cmake -DINPUT=tools/server/public/index.html -DOUTPUT=tools/server/index.html.hpp -P scripts/xxd.cmake - -SET(INPUT "" CACHE STRING "Input File") -SET(OUTPUT "" CACHE STRING "Output File") - -get_filename_component(filename "${INPUT}" NAME) -string(REGEX REPLACE "\\.|-" "_" name "${filename}") - -file(READ "${INPUT}" hex_data HEX) -string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}") - -string(LENGTH ${hex_data} hex_len) -math(EXPR len "${hex_len} / 2") - -file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b1fcfca0ad..d15ccfd99f1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,6 +24,7 @@ add_library(llama llama-io.cpp llama-kv-cache.cpp llama-kv-cache-iswa.cpp + llama-kv-cache-dsa.cpp llama-memory.cpp llama-memory-hybrid.cpp llama-memory-hybrid-iswa.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 59dde99e362..be8f73cc1ed 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -75,6 +75,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK, "deepseek" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" }, + { LLM_ARCH_DEEPSEEK32, "deepseek32" }, { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, @@ -133,6 +134,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_MAINCODER, "maincoder" }, { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, + { LLM_ARCH_TALKIE, "talkie" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -317,6 +319,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" }, + { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" }, { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, @@ -757,17 +760,19 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - // NextN/MTP tensors are currently ignored (reserved for future MTP support) - // These tensors only exist in the last layer(s) and are treated as output tensors - {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, - {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, - {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, - {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the + // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so + // the model loader doesn't fault on the block index. + {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // Nemotron 3 Super - {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU + {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} @@ -877,6 +882,16 @@ bool llm_arch_is_diffusion(const llm_arch & arch) { } } +bool llm_arch_supports_rs_rollback(const llm_arch & arch) { + switch (arch) { + case LLM_ARCH_QWEN35: + case LLM_ARCH_QWEN35MOE: + return true; + default: + return false; + } +} + bool llm_arch_supports_sm_tensor(const llm_arch & arch) { switch (arch) { case LLM_ARCH_GROK: @@ -891,6 +906,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) { case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK32: case LLM_ARCH_GLM_DSA: case LLM_ARCH_BITNET: case LLM_ARCH_T5: diff --git a/src/llama-arch.h b/src/llama-arch.h index e37d548c98e..2c71bbe8156 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -79,6 +79,7 @@ enum llm_arch { LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK2, LLM_ARCH_DEEPSEEK2OCR, + LLM_ARCH_DEEPSEEK32, LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, @@ -137,6 +138,7 @@ enum llm_arch { LLM_ARCH_LLAMA_EMBED, LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, + LLM_ARCH_TALKIE, LLM_ARCH_UNKNOWN, }; @@ -306,6 +308,7 @@ enum llm_kv { LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, LLM_KV_TOKENIZER_CHAT_TEMPLATE, + LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, LLM_KV_TOKENIZER_FIM_PRE_ID, LLM_KV_TOKENIZER_FIM_SUF_ID, LLM_KV_TOKENIZER_FIM_MID_ID, @@ -637,3 +640,4 @@ bool llm_arch_is_recurrent (const llm_arch & arch); bool llm_arch_is_hybrid (const llm_arch & arch); bool llm_arch_is_diffusion (const llm_arch & arch); bool llm_arch_supports_sm_tensor(const llm_arch & arch); +bool llm_arch_supports_rs_rollback(const llm_arch & arch); diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 6554a89b28a..6d822ec62d6 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -62,6 +62,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, { "granite", LLM_CHAT_TEMPLATE_GRANITE_3_X }, { "granite-4.0", LLM_CHAT_TEMPLATE_GRANITE_4_0 }, + { "granite-4.1", LLM_CHAT_TEMPLATE_GRANITE_4_1 }, { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, { "megrez", LLM_CHAT_TEMPLATE_MEGREZ }, { "yandex", LLM_CHAT_TEMPLATE_YANDEX }, @@ -73,7 +74,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE }, { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, - { "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR }, + { "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, @@ -194,7 +195,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_RWKV_WORLD; } else if (tmpl_contains("<|start_of_role|>")) { if (tmpl_contains("") || tmpl_contains("")) { - return LLM_CHAT_TEMPLATE_GRANITE_4_0; + if (tmpl_contains("g4_default_system_message")) { + return LLM_CHAT_TEMPLATE_GRANITE_4_0; + } + return LLM_CHAT_TEMPLATE_GRANITE_4_1; } return LLM_CHAT_TEMPLATE_GRANITE_3_X; } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) { @@ -218,7 +222,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) { return LLM_CHAT_TEMPLATE_OPENAI_MOE; } else if (tmpl_contains("<๏ฝœhy_Assistant๏ฝœ>") && tmpl_contains("<๏ฝœhy_beginโ–ofโ–sentence๏ฝœ>")) { - return LLM_CHAT_TEMPLATE_HUNYUAN_OCR; + return LLM_CHAT_TEMPLATE_HUNYUAN_VL; } else if (tmpl_contains("<๏ฝœhy_Assistant๏ฝœ>") && tmpl_contains("<๏ฝœhy_placeโ–holderโ–noโ–3๏ฝœ>")) { return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE; } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) { @@ -651,6 +655,20 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|start_of_role|>assistant<|end_of_role|>"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE_4_1) { + // IBM Granite 4.1 template + for (const auto & message : chat) { + std::string role(message->role); + if (role == "assistant_tool_call") { + ss << "<|start_of_role|>assistant<|end_of_role|><|tool_call|>"; + } else { + ss << "<|start_of_role|>" << role << "<|end_of_role|>"; + } + ss << message->content << "<|end_of_text|>\n"; + } + if (add_ass) { + ss << "<|start_of_role|>assistant<|end_of_role|>"; + } } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) { // GigaChat template bool has_system = !chat.empty() && std::string(chat[0]->role) == "system"; @@ -825,8 +843,8 @@ int32_t llm_chat_apply_template( ss << "<๏ฝœhy_User๏ฝœ>" << chat[i]->content << "<๏ฝœhy_Assistant๏ฝœ>"; } } - } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) { - // tencent/HunyuanOCR + } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) { + // tencent/HunyuanOCR & tencent/HunyuanVL ss << "<๏ฝœhy_beginโ–ofโ–sentence๏ฝœ>"; for (size_t i = 0; i < chat.size(); i++) { std::string role(chat[i]->role); diff --git a/src/llama-chat.h b/src/llama-chat.h index 13f936a946c..dc37f919a96 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -41,6 +41,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_RWKV_WORLD, LLM_CHAT_TEMPLATE_GRANITE_3_X, LLM_CHAT_TEMPLATE_GRANITE_4_0, + LLM_CHAT_TEMPLATE_GRANITE_4_1, LLM_CHAT_TEMPLATE_GIGACHAT, LLM_CHAT_TEMPLATE_MEGREZ, LLM_CHAT_TEMPLATE_YANDEX, @@ -53,7 +54,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_HUNYUAN_MOE, LLM_CHAT_TEMPLATE_OPENAI_MOE, LLM_CHAT_TEMPLATE_HUNYUAN_DENSE, - LLM_CHAT_TEMPLATE_HUNYUAN_OCR, + LLM_CHAT_TEMPLATE_HUNYUAN_VL, LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_GROK_2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bf745baa833..7985315b68e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2,6 +2,7 @@ #include "ggml.h" #include "llama-arch.h" +#include "llama-graph.h" #include "llama-impl.h" #include "llama-batch.h" #include "llama-io.h" @@ -22,6 +23,14 @@ // llama_context // +static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) { + switch (ctx_type) { + case LLAMA_CONTEXT_TYPE_DEFAULT: return LLM_GRAPH_TYPE_DEFAULT; + case LLAMA_CONTEXT_TYPE_MTP : return LLM_GRAPH_TYPE_DECODER_MTP; + } + throw std::runtime_error("Unsupported ctx type"); +} + llama_context::llama_context( const llama_model & model, llama_context_params params) : @@ -43,13 +52,22 @@ llama_context::llama_context( throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ)); } + cparams.n_rs_seq = params.n_rs_seq; + if (cparams.n_rs_seq > 0 && !llm_arch_supports_rs_rollback(model.arch)) { + LLAMA_LOG_DEBUG("%s: n_rs_seq=%u requested but model arch does not support recurrent partial rollback; clamping to 0\n", + __func__, cparams.n_rs_seq); + cparams.n_rs_seq = 0; + } + cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor; cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor; cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow; - cparams.embeddings = params.embeddings; + cparams.embeddings = params.embeddings; + cparams.embeddings_pre_norm = false; + cparams.embeddings_pre_norm_masked = false; cparams.offload_kqv = params.offload_kqv; cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; @@ -66,6 +84,8 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; + cparams.ctx_type = params.ctx_type; + // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. @@ -163,6 +183,8 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max; + cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; @@ -207,6 +229,8 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false"); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + LLAMA_LOG_INFO("%s: n_rs_seq = %u\n", __func__, cparams.n_rs_seq); + LLAMA_LOG_INFO("%s: n_outputs_max = %u\n", __func__, cparams.n_outputs_max); if (cparams.n_ctx_seq < hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", @@ -279,6 +303,7 @@ llama_context::llama_context( /*.type_k =*/ params.type_k, /*.type_v =*/ params.type_v, /*.swa_full =*/ params.swa_full, + /*.ctx_type= */ cparams.ctx_type, }; memory.reset(model.create_memory(params_mem, cparams)); @@ -510,7 +535,7 @@ void llama_context::sched_reserve() { // note: n_outputs must match n_tokens for embedding models with mean/rank pooling, // because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies // it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens, - // the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553). + // the ggml_mul_mat assertion fails. const uint32_t n_tokens_ch = 16*n_seqs; auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true); if (!gf) { @@ -556,16 +581,18 @@ void llama_context::sched_reserve() { int n_splits_tg = -1; int n_nodes_tg = -1; + const uint32_t n_outputs_pp = std::min(n_tokens, cparams.n_outputs_max); + // reserve pp (prompt processing) graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), + auto * gf = graph_reserve(n_tokens, n_seqs, n_outputs_pp, mctx.get(), model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr); if (!gf) { if (cparams.pipeline_parallel) { LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__); cparams.pipeline_parallel = false; sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload)); - gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + gf = graph_reserve(n_tokens, n_seqs, n_outputs_pp, mctx.get()); } if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); @@ -593,7 +620,7 @@ void llama_context::sched_reserve() { // // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get()); // - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc); + auto * gf = graph_reserve(n_tokens, n_seqs, n_outputs_pp, mctx.get(), model.hparams.no_alloc); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -753,7 +780,9 @@ bool llama_context::memory_update(bool optimize) { const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + const uint32_t n_outputs_max = std::min(n_tokens, cparams.n_outputs_max); + + auto * gf = graph_reserve(n_tokens, n_seqs, n_outputs_max, mctx.get()); if (!gf) { LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__); } @@ -861,6 +890,42 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } +float * llama_context::get_embeddings_pre_norm() { + output_reorder(); + + return embd_pre_norm.data; +} + +float * llama_context::get_embeddings_pre_norm_ith(int32_t i) { + output_reorder(); + + try { + if (embd_pre_norm.data == nullptr) { + throw std::runtime_error("no pre-norm embeddings"); + } + + const uint32_t n_embd = model.hparams.n_embd; + + if (!cparams.embeddings_pre_norm_masked) { + // unmasked: pre-norm rows are stored densely, indexed by raw token position. + if (i < 0 || (size_t)(i + 1) * n_embd > embd_pre_norm.size) { + throw std::runtime_error(format("out of range [0, %zu)", embd_pre_norm.size / n_embd)); + } + return embd_pre_norm.data + (size_t) i * n_embd; + } + + const int64_t j = output_resolve_row(i); + return embd_pre_norm.data + j*n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid pre-norm embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + llama_token llama_context::get_sampled_token_ith(int32_t idx) { output_reorder(); @@ -1041,6 +1106,13 @@ void llama_context::set_embeddings(bool value) { //sched_need_reserve = true; } +void llama_context::set_embeddings_pre_norm(bool value, bool masked) { + LLAMA_LOG_DEBUG("%s: value = %d, masked = %d\n", __func__, value, masked); + + cparams.embeddings_pre_norm = value; + cparams.embeddings_pre_norm_masked = masked; +} + void llama_context::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); @@ -1073,6 +1145,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) { LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler); + if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) { + static bool warned = false; + if (!warned) { + LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__); + warned = true; + } + if (sampling.samplers.count(seq_id) > 0) { + sched_need_reserve = true; + } + sampling.samplers.erase(seq_id); + return false; + } + const bool can_offload = sampler && sampler->iface->backend_init && @@ -1242,7 +1327,9 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll } int llama_context::encode(const llama_batch & batch_inp) { - GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT + // MTP hook batches carry both token (next-token id) and embd (h_pre_norm row), + // so accept either present rather than requiring exactly one. + GGML_ASSERT(batch_inp.token || batch_inp.embd); if (batch_inp.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); @@ -1313,8 +1400,9 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - auto * t_logits = res->get_logits(); - auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); + auto * t_logits = res->get_logits(); + auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); + auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr; // extract logits if (logits.data && t_logits) { @@ -1380,6 +1468,16 @@ int llama_context::encode(const llama_batch & batch_inp) { } } + // extract pre-norm embeddings (hidden state before the final output norm) + if (embd_pre_norm.data && t_h_pre_norm && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); + GGML_ASSERT(backend_h != nullptr); + + const uint32_t n_embd = hparams.n_embd; + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_pre_norm.size); + ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm.data, 0, n_tokens*n_embd*sizeof(float)); + } + // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { //cross.t_embd = t_embd; @@ -1532,7 +1630,9 @@ static bool needs_raw_logits(const llama_ubatch & ubatch, const std::mapget_ubatch(); @@ -1690,7 +1791,8 @@ int llama_context::decode(const llama_batch & batch_inp) { } ggml_status status; - const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status); + + const auto * res = process_ubatch(ubatch, ctx_type_to_graph_type(cparams.ctx_type), mctx.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module @@ -1728,8 +1830,9 @@ int llama_context::decode(const llama_batch & batch_inp) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = res->get_logits(); - auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_logits = res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr; if (t_embd && res->get_embd_pooled()) { t_embd = res->get_embd_pooled(); @@ -1810,6 +1913,25 @@ int llama_context::decode(const llama_batch & batch_inp) { } } + // extract pre-norm embeddings (hidden state before the final output norm) + // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. + { + const bool masked = cparams.embeddings_pre_norm_masked; + const int64_t n_rows = masked ? n_outputs : (int64_t) ubatch.n_tokens; + const int64_t offset = masked ? n_outputs_prev : n_tokens_prev; + + if (embd_pre_norm.data && t_h_pre_norm && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); + GGML_ASSERT(backend_h != nullptr); + + const uint32_t n_embd = hparams.n_embd; + float * embd_pre_norm_out = embd_pre_norm.data + offset*n_embd; + + GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_pre_norm.size); + ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_rows*n_embd*sizeof(float)); + } + } + // Copy backend sampling output if this ubatch produced any sampling tensors. if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) { const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev); @@ -1824,6 +1946,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } n_outputs_prev += n_outputs; + n_tokens_prev += ubatch.n_tokens; } while (mctx->next()); // set to total number of outputs in the batch, for use in llama_get_logits_ith @@ -1894,10 +2017,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); - bool has_logits = true; - bool has_embd = cparams.embeddings; + bool has_logits = true; + bool has_embd = cparams.embeddings; + bool has_embd_pre_norm = cparams.embeddings_pre_norm; // TODO: hacky enc-dec support if (model.arch == LLM_ARCH_T5) { @@ -1909,8 +2034,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { size_t backend_float_count = 0; size_t backend_token_count = 0; - logits.size = has_logits ? n_vocab*n_outputs_max : 0; - embd.size = has_embd ? n_embd_out*n_outputs_max : 0; + logits.size = has_logits ? n_vocab*n_outputs_max : 0; + embd.size = has_embd ? n_embd_out*n_outputs_max : 0; + embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max : 0; + + if (has_embd_pre_norm && !cparams.embeddings_pre_norm_masked) { + // unmasked: pre-norm row exists for every token in the batch, not just + // those flagged via batch.logits[i] -> size by token count instead. + embd_pre_norm.size = (size_t) n_embd * n_batch; + } // Allocate backend sampling output buffers if there are backend samplers configured. const bool has_sampling = !sampling.samplers.empty(); @@ -1926,8 +2058,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = - (logits.size + embd.size + backend_float_count) * sizeof(float) + - ( backend_token_count) * sizeof(llama_token); + (logits.size + embd.size + embd_pre_norm.size + backend_float_count) * sizeof(float) + + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -1943,6 +2075,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { buf_output = nullptr; logits.data = nullptr; embd.data = nullptr; + embd_pre_norm.data = nullptr; } auto * buft = ggml_backend_cpu_buffer_type(); @@ -1971,6 +2104,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd = has_embd ? buffer_view{(float *) (base + offset), embd.size} : buffer_view{nullptr, 0}; offset += embd.size * sizeof(float); + embd_pre_norm = has_embd_pre_norm ? buffer_view{(float *) (base + offset), embd_pre_norm.size} : buffer_view{nullptr, 0}; + offset += embd_pre_norm.size * sizeof(float); + if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; offset += sampling.logits.size * sizeof(float); @@ -2012,6 +2148,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { this->n_outputs = 0; + GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max); + return n_outputs_max; } @@ -2035,6 +2173,12 @@ void llama_context::output_reorder() { } } + if (embd_pre_norm.size > 0) { + for (uint64_t k = 0; k < n_embd; k++) { + std::swap(embd_pre_norm.data[i0*n_embd + k], embd_pre_norm.data[i1*n_embd + k]); + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); @@ -2092,8 +2236,6 @@ ggml_cgraph * llama_context::graph_reserve( if (n_tokens % n_seqs != 0) { n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs - n_outputs = std::max(n_outputs, n_tokens); - LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); } @@ -2122,7 +2264,7 @@ ggml_cgraph * llama_context::graph_reserve( auto * res = gf_res_reserve.get(); - const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT); + const auto gparams = graph_params(res, ubatch, mctx, ctx_type_to_graph_type(cparams.ctx_type)); res->reset(); @@ -3101,7 +3243,7 @@ void llama_context::opt_epoch_iter( auto * res = gf_res_prev.get(); - const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT); + const auto gparams = graph_params(res, ubatch, mctx.get(), ctx_type_to_graph_type(cparams.ctx_type)); res->reset(); @@ -3202,8 +3344,11 @@ llama_context_params llama_context_default_params() { /*.n_batch =*/ 2048, /*.n_ubatch =*/ 512, /*.n_seq_max =*/ 1, + /*.n_rs_seq =*/ 0, + /*.n_outputs_max =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT, /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED, /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED, @@ -3267,10 +3412,6 @@ llama_context * llama_init_from_model( LLAMA_LOG_ERROR("%s: SPLIT_MODE_TENSOR requires flash_attn to be enabled\n", __func__); return nullptr; } - if (ggml_is_quantized(params.type_k) || ggml_is_quantized(params.type_v)) { - LLAMA_LOG_ERROR("%s: simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented\n", __func__); - return nullptr; - } } if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) { @@ -3307,6 +3448,13 @@ llama_context * llama_init_from_model( model->hparams.pooling_type, params.pooling_type); } + if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && + model->hparams.nextn_predict_layers == 0) { + LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); + return nullptr; + } + + try { auto * ctx = new llama_context(*model, params); return ctx; @@ -3348,6 +3496,10 @@ uint32_t llama_n_seq_max(const llama_context * ctx) { return ctx->n_seq_max(); } +uint32_t llama_n_rs_seq(const llama_context * ctx) { + return ctx->get_cparams().n_rs_seq; +} + const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->get_model(); } @@ -3437,6 +3589,22 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) { return ctx->get_embeddings_seq(seq_id); } +void llama_set_embeddings_pre_norm(llama_context * ctx, bool value, bool masked) { + ctx->set_embeddings_pre_norm(value, masked); +} + +float * llama_get_embeddings_pre_norm(llama_context * ctx) { + ctx->synchronize(); + + return ctx->get_embeddings_pre_norm(); +} + +float * llama_get_embeddings_pre_norm_ith(llama_context * ctx, int32_t i) { + ctx->synchronize(); + + return ctx->get_embeddings_pre_norm_ith(i); +} + bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { return ctx->set_sampler(seq_id, smpl); } diff --git a/src/llama-context.h b/src/llama-context.h index 92d1b0cf95a..d03f681d4a1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -84,6 +84,9 @@ struct llama_context { float * get_embeddings_ith(int32_t i); float * get_embeddings_seq(llama_seq_id seq_id); + float * get_embeddings_pre_norm(); + float * get_embeddings_pre_norm_ith(int32_t i); + llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -107,6 +110,7 @@ struct llama_context { void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); void set_embeddings (bool value); + void set_embeddings_pre_norm(bool value, bool masked); void set_causal_attn(bool value); void set_warmup(bool value); @@ -278,6 +282,11 @@ struct llama_context { // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE buffer_view embd = {nullptr, 0}; + // hidden state before the final output norm (2-dimensional array: [n_outputs][n_embd]) + // populated only when cparams.embeddings_pre_norm is enabled and the model graph + // sets llm_graph_result::t_h_pre_norm + buffer_view embd_pre_norm = {nullptr, 0}; + struct sampling_info { // !samplers.empty() to check if any samplers are active std::map samplers; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 9d359474132..ba4951a09a2 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -12,6 +12,8 @@ struct llama_cparams { uint32_t n_batch; uint32_t n_ubatch; uint32_t n_seq_max; + uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback + uint32_t n_outputs_max; // max outputs supported by the context int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing @@ -27,6 +29,8 @@ struct llama_cparams { float yarn_beta_slow; bool embeddings; + bool embeddings_pre_norm; // also extract the hidden state before the final output norm + bool embeddings_pre_norm_masked; // extract for only rows where batch.logits != 0 bool causal_attn; bool offload_kqv; bool flash_attn; @@ -40,6 +44,7 @@ struct llama_cparams { bool kv_unified; bool pipeline_parallel; + enum llama_context_type ctx_type; enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; diff --git a/src/llama-ext.h b/src/llama-ext.h index 8ce29d217cb..edfa71c207c 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -88,3 +88,19 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model); LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i); LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx); + +// +// pre-norm embeddings (hidden state before the final output norm) +// + +// Set whether the context outputs pre-norm embeddings or not +// If masked == true, output the embeddings only for the tokens with batch.logits != 0 +// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits +LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value, bool masked); + +// mirrors: +// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); +LLAMA_API float * llama_get_embeddings_pre_norm (struct llama_context * ctx); + +// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); +LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index fe155c92dea..e6ec3054daf 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -7,6 +7,7 @@ #include "llama-kv-cache.h" #include "llama-kv-cache-iswa.h" +#include "llama-kv-cache-dsa.h" #include "llama-memory-hybrid.h" #include "llama-memory-hybrid-iswa.h" #include "llama-memory-recurrent.h" @@ -29,7 +30,10 @@ static ggml_tensor * build_attn_inp_kq_mask( const auto n_tokens = ubatch.n_tokens; const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq; - ggml_tensor * res = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream); + // flash attention requires an f16 mask + const auto type = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32; + + ggml_tensor * res = ggml_new_tensor_4d(ctx, type, n_kv, n_tokens/n_stream, 1, n_stream); ggml_set_input(res); ggml_set_name(res, "attn_inp_kq_mask"); @@ -102,6 +106,39 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_embd_h::set_input(const llama_ubatch * ubatch) { + const int64_t n_tokens = ubatch->n_tokens; + + if (ubatch->token) { + ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); + } else { + // note: mtmd embedding input goes through here + GGML_ASSERT(ubatch->embd); + GGML_ASSERT(n_embd == embd->ne[0]); + + ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h)); + } + + // TODO: extend llama_ubatch to differentiate between token embeddings and hidden states + // for now, we assume that the hidden state is always provided as an embedding + // ref: https://github.com/ggml-org/llama.cpp/pull/23643 + if (ubatch->embd) { + GGML_ASSERT(n_embd == h->ne[0]); + + ggml_backend_tensor_set(h, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h)); + } +} + +bool llm_graph_input_embd_h::can_reuse(const llm_graph_params & params) { + bool res = true; + + res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); + res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens); + res &= (!params.ubatch.embd) || (h && h->ne[1] == params.ubatch.n_tokens); + + return res; +} + void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; @@ -348,7 +385,8 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { } } -static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) { +template +static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) { LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__); const char * swa_type_str = "unknown"; @@ -372,7 +410,7 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64 for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) { LLAMA_LOG_DEBUG(" %2d ", i); for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) { - float val = data[i * n_kv + j]; + float val = llama_cast(data[i * n_kv + j]); if (val == -INFINITY) { LLAMA_LOG_DEBUG(" โˆž"); } else { @@ -387,7 +425,10 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = ubatch->n_tokens; const int64_t n_tokens = ubatch->n_tokens; - const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) { + const auto fill_mask = [&](auto * data, int64_t ne, int n_swa, llama_swa_type swa_type) { + using T = std::remove_reference_t; + std::fill(data, data + ne, llama_cast(-INFINITY)); + for (int i1 = 0; i1 < n_tokens; ++i1) { const llama_seq_id s1 = ubatch->seq_id[i1][0]; const llama_pos p1 = ubatch->pos[i1]; @@ -413,38 +454,30 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { continue; } - data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f; + data[idst + i0] = llama_cast(hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f); } } - }; - - { - GGML_ASSERT(self_kq_mask); - GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); - - float * data = (float *) self_kq_mask->data; - - std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY); - - fill_mask(data, 0, LLAMA_SWA_TYPE_NONE); if (debug) { - print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE); + print_mask(data, n_tokens, n_kv, n_swa, swa_type); } + }; + + GGML_ASSERT(self_kq_mask); + GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer)); + if (self_kq_mask->type == GGML_TYPE_F16) { + fill_mask((ggml_fp16_t *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); + } else { + fill_mask((float *) self_kq_mask->data, ggml_nelements(self_kq_mask), 0, LLAMA_SWA_TYPE_NONE); } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(self_kq_mask_swa); GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer)); - - float * data = (float *) self_kq_mask_swa->data; - - std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY); - - fill_mask(data, hparams.n_swa, hparams.swa_type); - - if (debug) { - print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type); + if (self_kq_mask_swa->type == GGML_TYPE_F16) { + fill_mask((ggml_fp16_t *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type); + } else { + fill_mask((float *) self_kq_mask_swa->data, ggml_nelements(self_kq_mask_swa), hparams.n_swa, hparams.swa_type); } } } @@ -499,16 +532,50 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_attn_k_dsa::set_input(const llama_ubatch * ubatch) { + mctx->get_mla()->set_input_k_idxs(self_k_idxs_mla, ubatch); + + mctx->get_mla()->set_input_kq_mask(self_kq_mask_mla, ubatch, cparams.causal_attn); + + mctx->get_lid()->set_input_k_idxs(self_k_idxs_lid, ubatch); + + mctx->get_lid()->set_input_kq_mask(self_kq_mask_lid, ubatch, cparams.causal_attn); + + mctx->get_lid()->set_input_k_rot(self_k_rot_lid); +} + +bool llm_graph_input_attn_k_dsa::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= self_k_idxs_mla->ne[0] == params.ubatch.n_tokens; + res &= self_k_idxs_lid->ne[0] == params.ubatch.n_tokens; + + res &= can_reuse_kq_mask(self_kq_mask_mla, mctx->get_mla(), params.ubatch, params.cparams); + res &= can_reuse_kq_mask(self_kq_mask_lid, mctx->get_lid(), params.ubatch, params.cparams); + + return res; +} + void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { - mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); - mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); + mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } - mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); - mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); + mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + } if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); @@ -534,14 +601,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { bool res = true; - res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + } - res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + } return res; } @@ -555,23 +629,30 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer)); GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing - float * data = (float *) cross_kq_mask->data; + const auto fill_mask = [&](auto * data) { + using T = std::remove_reference_t; + for (int i = 0; i < n_tokens; ++i) { + GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first"); + for (int j = 0; j < n_enc; ++j) { + float f = -INFINITY; - for (int i = 0; i < n_tokens; ++i) { - GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first"); - for (int j = 0; j < n_enc; ++j) { - float f = -INFINITY; + for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[i][s]; - for (int s = 0; s < ubatch->n_seq_id[i]; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[i][s]; - - if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) { - f = 0.0f; + if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) { + f = 0.0f; + } } - } - data[i*n_enc + j] = f; + data[i*n_enc + j] = llama_cast(f); + } } + }; + + if (cross_kq_mask->type == GGML_TYPE_F16) { + fill_mask((ggml_fp16_t *) cross_kq_mask->data); + } else { + fill_mask((float *) cross_kq_mask->data); } } @@ -848,6 +929,9 @@ void llm_graph_result::set_outputs() { if (t_embd_pooled != nullptr) { ggml_set_output(t_embd_pooled); } + if (t_h_pre_norm != nullptr) { + ggml_set_output(t_h_pre_norm); + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); @@ -2072,17 +2156,20 @@ ggml_tensor * llm_graph_context::build_attn_mha( llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() const { auto inp = std::make_unique(hparams, cparams); + // flash attention requires an f16 mask + const auto type_mask = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32; + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1); + inp->self_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1); ggml_set_input(inp->self_kq_mask); - inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; + inp->self_kq_mask_cnv = inp->self_kq_mask; if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { - inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1); + inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, type_mask, n_tokens, n_tokens, 1, 1); ggml_set_input(inp->self_kq_mask_swa); - inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; + inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa; } else { inp->self_kq_mask_swa = nullptr; inp->self_kq_mask_swa_cnv = nullptr; @@ -2159,7 +2246,7 @@ static std::unique_ptr build_attn_inp_kv_impl( inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams); - inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; + inp->self_kq_mask_cnv = inp->self_kq_mask; } inp->self_k_rot = mctx_cur->build_input_k_rot(ctx0); @@ -2266,7 +2353,7 @@ static std::unique_ptr build_attn_inp_k_impl( inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch); inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams); - inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; + inp->self_kq_mask_cnv = inp->self_kq_mask; } return inp; @@ -2338,6 +2425,82 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } +ggml_tensor * llm_graph_context::build_attn( + llm_graph_input_attn_k_dsa * inp, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * wo_s, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + ggml_tensor * sinks, + ggml_tensor * v_mla, + ggml_tensor * top_k, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + // expand k later to enable rope fusion which directly writes into k-v cache + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, v_cur); + ggml_build_forward_expand(gf, k_cur); + + const auto * mctx_cur = inp->mctx->get_mla(); + + // store to KV cache + { + const auto & k_idxs = inp->get_k_idxs_mla(); + + ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il)); + } + + const auto & kq_mask = inp->get_kq_mask_mla(); + + // prepare new kq mask - starts filled with -INFINITY + ggml_tensor * kq_mask_all = ggml_fill(ctx0, kq_mask, -INFINITY); + + // reshape KQ mask into tensor with rows of size 1: + // [n_kv, n_batch, 1, n_stream] -> [1, n_kv, n_batch, n_stream] + kq_mask_all = ggml_view_4d(ctx0, kq_mask_all, 1, kq_mask_all->ne[0], kq_mask_all->ne[1], kq_mask_all->ne[3], kq_mask_all->nb[0], kq_mask_all->nb[1], kq_mask_all->nb[2], 0); + + // reshape top_k indices: [n_top_k, n_batch, 1, n_stream] -> [n_top_k, n_batch, n_stream, 1] + ggml_tensor * top_k_3d = ggml_view_4d(ctx0, top_k, top_k->ne[0], top_k->ne[1], top_k->ne[3], 1, top_k->nb[1], top_k->nb[2], top_k->ne[3]*top_k->nb[3], 0); + + // prepare zero-filled tensor with rows of size 1: [1, n_top_k, n_batch, n_stream] + // this will be our source of zero values for unmasking top k mask elements + ggml_tensor * zeros = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 1, top_k_3d->ne[0], top_k_3d->ne[1], top_k_3d->ne[2]); + zeros = ggml_fill(ctx0, zeros, 0.0f); + + // modify KQ mask by unmasking elements that are in top_k indices + // ggml_set_rows([1, n_kv, n_batch, n_stream], [1, n_top_k, n_batch, n_stream], [n_top_k, n_batch, n_stream, 1]) + ggml_tensor * kq_mask_top_k = ggml_set_rows(ctx0, kq_mask_all, zeros, top_k_3d); + + // reshape to restore the original shape of KQ mask: + // [1, n_kv, n_batch, n_stream] -> [n_kv, n_batch, 1, n_stream] + kq_mask_top_k = ggml_view_4d(ctx0, kq_mask_top_k, kq_mask_top_k->ne[1], kq_mask_top_k->ne[2], 1, kq_mask_top_k->ne[3], kq_mask_top_k->nb[2], kq_mask_top_k->nb[3], kq_mask_top_k->nb[3], 0); + + // combine with the original kq mask + kq_mask_top_k = ggml_add(ctx0, kq_mask_top_k, kq_mask); + + ggml_tensor * q = q_cur; + ggml_tensor * k = mctx_cur->get_k(ctx0, il); + ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0); + + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_top_k, sinks, v_mla, kq_scale, il); + cb(cur, "kqv_out", il); + + if (wo) { + cur = build_lora_mm(wo, cur, wo_s); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_kv_iswa * inp, ggml_tensor * wo, @@ -2430,10 +2593,13 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; - inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1); + // flash attention requires an f16 mask + const auto type_mask = cparams.flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32; + + inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, type_mask, n_enc, n_tokens, 1, 1); ggml_set_input(inp->cross_kq_mask); - inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; + inp->cross_kq_mask_cnv = inp->cross_kq_mask; return (llm_graph_input_attn_cross *) res->add_input(std::move(inp)); } @@ -2481,6 +2647,34 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } +llm_graph_input_attn_k_dsa * llm_graph_context::build_attn_inp_k_dsa() const { + const auto * mctx_cur = static_cast(mctx); + + auto inp = std::make_unique(hparams, cparams, mctx_cur); + + { + inp->self_k_idxs_mla = mctx_cur->get_mla()->build_input_k_idxs(ctx0, ubatch); + + inp->self_kq_mask_mla = build_attn_inp_kq_mask(ctx0, mctx_cur->get_mla(), ubatch, cparams); + inp->self_kq_mask_mla_cnv = inp->self_kq_mask_mla; + } + + { + inp->self_k_idxs_lid = mctx_cur->get_lid()->build_input_k_idxs(ctx0, ubatch); + + // ensure F32 mask + auto cparams_copy = cparams; + cparams_copy.flash_attn = false; + + inp->self_kq_mask_lid = build_attn_inp_kq_mask(ctx0, mctx_cur->get_lid(), ubatch, cparams_copy); + inp->self_kq_mask_lid_cnv = inp->self_kq_mask_lid; + + inp->self_k_rot_lid = mctx_cur->get_lid()->build_input_k_rot(ctx0); + } + + return (llm_graph_input_attn_k_dsa *) res->add_input(std::move(inp)); +} + // TODO: maybe separate the inner implementation into a separate function // like with the non-sliding window equivalent // once sliding-window hybrid caches are a thing. @@ -2494,7 +2688,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams); - inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; + inp->self_kq_mask_cnv = inp->self_kq_mask; } { @@ -2504,7 +2698,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch); inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams); - inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; + inp->self_kq_mask_swa_cnv = inp->self_kq_mask_swa; } inp->self_k_rot = mctx_cur->get_base()->build_input_k_rot(ctx0); @@ -2528,7 +2722,8 @@ ggml_tensor * llm_graph_context::build_rs( int32_t rs_zero, const llm_graph_get_rows_fn & get_state_rows) const { - ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size); + GGML_UNUSED(rs_size); + ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, s->ne[1]); // Clear a single state which will then be copied to the other cleared states. // Note that this is a no-op when the view is zero-sized. @@ -2672,7 +2867,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch); inp_attn->self_kq_mask = build_attn_inp_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams); - inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask; + inp_attn->self_kq_mask_cnv = inp_attn->self_kq_mask; } { @@ -2680,7 +2875,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch); inp_attn->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams); - inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa; + inp_attn->self_kq_mask_swa_cnv = inp_attn->self_kq_mask_swa; } auto inp = std::make_unique(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur); diff --git a/src/llama-graph.h b/src/llama-graph.h index 5cb1756c6a9..eab82bd0d70 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -22,6 +22,7 @@ struct llama_layer; struct llama_memory_context_i; class llama_kv_cache_context; +class llama_kv_cache_dsa_context; class llama_kv_cache_iswa_context; class llama_memory_recurrent_context; class llama_memory_hybrid_context; @@ -32,6 +33,7 @@ enum llm_graph_type { LLM_GRAPH_TYPE_DEFAULT, LLM_GRAPH_TYPE_ENCODER, LLM_GRAPH_TYPE_DECODER, + LLM_GRAPH_TYPE_DECODER_MTP, }; enum llm_ffn_op_type { @@ -120,6 +122,23 @@ class llm_graph_input_embd : public llm_graph_input_i { const int64_t n_embd = 0; }; +// similar to llm_graph_input_embd but with an additional hidden state input +class llm_graph_input_embd_h : public llm_graph_input_i { +public: + llm_graph_input_embd_h(int64_t n_embd) : n_embd(n_embd) {} + virtual ~llm_graph_input_embd_h() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tokens = nullptr; // I32 [n_batch] + ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] + ggml_tensor * h = nullptr; // F32 [n_embd, n_batch] + + const int64_t n_embd = 0; +}; + class llm_graph_input_pos : public llm_graph_input_i { public: llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} @@ -273,10 +292,10 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i { ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; } // n_tokens == n_batch - ggml_tensor * self_kq_mask = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_swa = nullptr; // F32/F16 [n_tokens, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream] const llama_hparams hparams; const llama_cparams cparams; @@ -306,8 +325,8 @@ class llm_graph_input_attn_kv : public llm_graph_input_i { ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] // note: assumes v_rot^2 == I ggml_tensor * self_k_rot = nullptr; @@ -346,8 +365,8 @@ class llm_graph_input_attn_k : public llm_graph_input_i { ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch] - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] const llama_hparams hparams; const llama_cparams cparams; @@ -355,6 +374,44 @@ class llm_graph_input_attn_k : public llm_graph_input_i { const llama_kv_cache_context * mctx; }; +class llm_graph_input_attn_k_dsa : public llm_graph_input_i { +public: + llm_graph_input_attn_k_dsa( + const llama_hparams & hparams, + const llama_cparams & cparams, + const llama_kv_cache_dsa_context * mctx) : + hparams(hparams), + cparams(cparams), + mctx(mctx) { + } + ~llm_graph_input_attn_k_dsa() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * get_k_idxs_mla() const { return self_k_idxs_mla; } + ggml_tensor * get_k_idxs_lid() const { return self_k_idxs_lid; } + + ggml_tensor * get_kq_mask_mla() const { return self_kq_mask_mla_cnv; } + ggml_tensor * get_kq_mask_lid() const { return self_kq_mask_lid; } + + ggml_tensor * self_k_idxs_mla = nullptr; // I64 [n_batch] + ggml_tensor * self_k_idxs_lid = nullptr; // I64 [n_batch] + + ggml_tensor * self_kq_mask_mla = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_mla_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_lid = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_lid_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + + ggml_tensor * self_k_rot_lid = nullptr; + + const llama_hparams hparams; + const llama_cparams cparams; + + const llama_kv_cache_dsa_context * mctx; +}; + class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { public: llm_graph_input_attn_kv_iswa( @@ -384,10 +441,10 @@ class llm_graph_input_attn_kv_iswa : public llm_graph_input_i { ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch] ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa] - ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream] - ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_swa = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream] + ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream] ggml_tensor * self_k_rot = nullptr; ggml_tensor * self_v_rot = nullptr; @@ -410,8 +467,8 @@ class llm_graph_input_attn_cross : public llm_graph_input_i { ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; } - ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1] - ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1] + ggml_tensor * cross_kq_mask = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1] + ggml_tensor * cross_kq_mask_cnv = nullptr; // F32/F16 [n_outputs_enc, n_batch, 1, 1] const llama_cross * cross = nullptr; }; @@ -580,7 +637,8 @@ struct llm_graph_params { ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && ( (!ubatch.token && !other.ubatch.token) || - (!ubatch.embd && !other.ubatch.embd) + (!ubatch.embd && !other.ubatch.embd) || + (ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd) ); // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same @@ -644,6 +702,7 @@ class llm_graph_result { ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } + ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; } ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -672,6 +731,7 @@ class llm_graph_result { ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; + ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm std::map t_sampled_logits; std::map t_candidates; @@ -952,6 +1012,23 @@ struct llm_graph_context { float kq_scale, int il) const; + llm_graph_input_attn_k_dsa * build_attn_inp_k_dsa() const; + + ggml_tensor * build_attn( + llm_graph_input_attn_k_dsa * inp, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * wo_s, + ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] + ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] + ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] + ggml_tensor * kq_b, + ggml_tensor * sinks, // [n_head_q] + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] + ggml_tensor * top_k, // [n_indexer_top_k, n_tokens] + float kq_scale, + int il) const; + llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const; // note: if k_cur or v_cur are not provided, they will not be stored in the memory diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 002d15d415f..2239309c8fb 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -229,6 +229,12 @@ uint32_t llama_hparams::n_embd_head_v_mla() const { } bool llama_hparams::has_kv(uint32_t il) const { + if (kv_only_nextn) { + // MTP head: only the trailing nextn_predict_layers blocks own a KV cache; + // the leading trunk blocks are not executed in this graph. + return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers); + } + if (n_layer_kv_from_start >= 0) { if (il < (uint32_t) n_layer_kv_from_start) { return true; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 0160a89caa2..e2d051edc6c 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -92,6 +92,8 @@ struct llama_hparams { uint32_t moe_latent_size = 0; uint32_t nextn_predict_layers = 0; + bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches) + float f_norm_eps; float f_norm_rms_eps; float f_norm_group_eps; diff --git a/src/llama-impl.h b/src/llama-impl.h index e4f35c8e53d..7923c3f7ed5 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -3,6 +3,7 @@ #include "ggml.h" // for ggml_log_level #include +#include #include #ifdef __GNUC__ @@ -40,6 +41,19 @@ struct no_init { no_init() = default; }; +template +static inline dst_t llama_cast(src_t v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v && std::is_same_v) { + return ggml_fp16_to_fp32(v); + } else if constexpr (std::is_same_v && std::is_same_v) { + return ggml_fp32_to_fp16(v); + } else { + static_assert(std::is_same_v, "unsupported type combination"); + } +} + struct time_meas { time_meas(int64_t & t_acc, bool disable = false); ~time_meas(); diff --git a/src/llama-kv-cache-dsa.cpp b/src/llama-kv-cache-dsa.cpp new file mode 100644 index 00000000000..e44004b5586 --- /dev/null +++ b/src/llama-kv-cache-dsa.cpp @@ -0,0 +1,261 @@ +#include "llama-kv-cache-dsa.h" + +#include "llama-impl.h" +#include "llama-batch.h" +#include "llama-model.h" + +#include +#include + +// +// llama_kv_cache_dsa +// + +llama_kv_cache_dsa::llama_kv_cache_dsa( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool v_trans, + bool offload, + bool unified, + uint32_t kv_size, + uint32_t n_seq_max, + uint32_t n_pad, + uint32_t n_swa, + llama_swa_type swa_type, + const layer_filter_cb & filter, + const layer_reuse_cb & reuse) : + hparams_lid(model.hparams), n_stream(unified ? 1 : n_seq_max) { + + LLAMA_LOG_INFO("%s: creating main KV cache, size = %u cells\n", __func__, kv_size); + + kv_mla = std::make_unique( + model, model.hparams, type_k, type_v, + v_trans, offload, unified, kv_size, n_seq_max, n_pad, + n_swa, swa_type, filter, reuse); + + // we use llama_kv_cache for caching indexer keys + // by hand-tweaking some hparams we fool it to create + // indexer key cache tensors with correct dimensions + // https://github.com/ggml-org/llama.cpp/pull/21149#discussion_r3015940823 + + // DSA lightning indexer uses MQA with single key head + std::fill(hparams_lid.n_head_kv_arr.begin(), hparams_lid.n_head_kv_arr.end(), 1); + hparams_lid.n_embd_head_k_full = model.hparams.indexer_head_size; + hparams_lid.rope_type = LLAMA_ROPE_TYPE_NEOX; + + LLAMA_LOG_INFO("%s: creating indexer KV cache, size = %u cells\n", __func__, kv_size); + + kv_lid = std::make_unique( + model, hparams_lid, type_k, type_v, + v_trans, offload, unified, kv_size, n_seq_max, n_pad, + n_swa, swa_type, filter, reuse); +} + +void llama_kv_cache_dsa::clear(bool data) { + kv_mla->clear(data); + kv_lid->clear(data); +} + +bool llama_kv_cache_dsa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + bool res = true; + + res = res & kv_mla->seq_rm(seq_id, p0, p1); + res = res & kv_lid->seq_rm(seq_id, p0, p1); + + return res; +} + +void llama_kv_cache_dsa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + kv_mla->seq_cp(seq_id_src, seq_id_dst, p0, p1); + kv_lid->seq_cp(seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_cache_dsa::seq_keep(llama_seq_id seq_id) { + kv_mla->seq_keep(seq_id); + kv_lid->seq_keep(seq_id); +} + +void llama_kv_cache_dsa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + kv_mla->seq_add(seq_id, p0, p1, shift); + kv_lid->seq_add(seq_id, p0, p1, shift); +} + +void llama_kv_cache_dsa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + kv_mla->seq_div(seq_id, p0, p1, d); + kv_lid->seq_div(seq_id, p0, p1, d); +} + +llama_pos llama_kv_cache_dsa::seq_pos_min(llama_seq_id seq_id) const { + return kv_mla->seq_pos_min(seq_id); +} + +llama_pos llama_kv_cache_dsa::seq_pos_max(llama_seq_id seq_id) const { + return kv_mla->seq_pos_max(seq_id); +} + +std::map llama_kv_cache_dsa::memory_breakdown() const { + std::map mb = kv_mla->memory_breakdown(); + for (const auto & buft_size : kv_lid->memory_breakdown()) { + mb[buft_size.first] += buft_size.second; + } + return mb; +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_batch( + llama_batch_allocr & balloc, + uint32_t n_ubatch, + bool embd_all) { + GGML_UNUSED(embd_all); + + do { + balloc.split_reset(); + + std::vector ubatches; + while (true) { + auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true); + + if (ubatch.n_tokens == 0) { + break; + } + + ubatches.push_back(std::move(ubatch)); // NOLINT + } + + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split + break; + } + + auto sinfos_mla = kv_mla->prepare(ubatches); + if (sinfos_mla.empty()) { + break; + } + + auto sinfos_lid = kv_lid->prepare(ubatches); + if (sinfos_lid.empty()) { + break; + } + + assert(sinfos_mla.size() == sinfos_lid.size()); + + return std::make_unique( + this, std::move(sinfos_mla), std::move(sinfos_lid), std::move(ubatches)); + } while (false); + + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_full() { + return std::make_unique(this); +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_update(llama_context * lctx, bool optimize) { + return std::make_unique(this, lctx, optimize); +} + +bool llama_kv_cache_dsa::get_can_shift() const { + return kv_mla->get_can_shift() && + kv_lid->get_can_shift() && + kv_mla->get_size() == kv_lid->get_size(); +} + +void llama_kv_cache_dsa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { + kv_mla->state_write(io, seq_id, flags); + kv_lid->state_write(io, seq_id, flags); +} + +void llama_kv_cache_dsa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { + kv_mla->state_read(io, seq_id, flags); + kv_lid->state_read(io, seq_id, flags); +} + +llama_kv_cache * llama_kv_cache_dsa::get_mla() const { + return kv_mla.get(); +} + +llama_kv_cache * llama_kv_cache_dsa::get_lid() const { + return kv_lid.get(); +} + +// +// llama_kv_cache_dsa_context +// + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(llama_memory_status status) : status(status) {} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv) : + ctx_mla(kv->get_mla()->init_full()), + ctx_lid(kv->get_lid()->init_full()), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + llama_context * lctx, + bool optimize) : + ctx_mla(kv->get_mla()->init_update(lctx, optimize)), + ctx_lid(kv->get_lid()->init_update(lctx, optimize)), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + slot_info_vec_t sinfos_mla, + slot_info_vec_t sinfos_lid, + std::vector ubatches) : + ubatches(std::move(ubatches)), + // note: here we copy the ubatches. not sure if this is ideal + ctx_mla(new llama_kv_cache_context(kv->get_mla(), std::move(sinfos_mla), this->ubatches)), + ctx_lid(new llama_kv_cache_context(kv->get_lid(), std::move(sinfos_lid), this->ubatches)), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context:: ~llama_kv_cache_dsa_context() = default; + +bool llama_kv_cache_dsa_context::next() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + ctx_mla->next(); + ctx_lid->next(); + + if (++i_next >= ubatches.size()) { + return false; + } + + return true; +} + +bool llama_kv_cache_dsa_context::apply() { + assert(!llama_memory_status_is_fail(status)); + + bool res = true; + + res = res & ctx_mla->apply(); + res = res & ctx_lid->apply(); + + return res; +} + +llama_memory_status llama_kv_cache_dsa_context::get_status() const { + return status; +} + +const llama_ubatch & llama_kv_cache_dsa_context::get_ubatch() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return ubatches[i_next]; +} + +const llama_kv_cache_context * llama_kv_cache_dsa_context::get_mla() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return static_cast(ctx_mla.get()); +} + +const llama_kv_cache_context * llama_kv_cache_dsa_context::get_lid() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return static_cast(ctx_lid.get()); +} diff --git a/src/llama-kv-cache-dsa.h b/src/llama-kv-cache-dsa.h new file mode 100644 index 00000000000..e2b330993b8 --- /dev/null +++ b/src/llama-kv-cache-dsa.h @@ -0,0 +1,138 @@ +#pragma once + +#include "llama-kv-cache.h" + +#include + +// +// llama_kv_cache_dsa +// + +// utilizes two instances of llama_kv_cache: +// - the first instance is for caching key tensors of the model, +// - the second instance is for caching lightning indexer key tensors + +class llama_kv_cache_dsa : public llama_memory_i { +public: + llama_kv_cache_dsa( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool v_trans, + bool offload, + bool unified, + uint32_t kv_size, + uint32_t n_seq_max, + uint32_t n_pad, + uint32_t n_swa, + llama_swa_type swa_type, + const layer_filter_cb & filter, + const layer_reuse_cb & reuse); + + ~llama_kv_cache_dsa() = default; + + // + // llama_memory_i + // + + llama_memory_context_ptr init_batch( + llama_batch_allocr & balloc, + uint32_t n_ubatch, + bool embd_all) override; + + llama_memory_context_ptr init_full() override; + + llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override; + + bool get_can_shift() const override; + + void clear(bool data) override; + + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; + + llama_pos seq_pos_min(llama_seq_id seq_id) const override; + llama_pos seq_pos_max(llama_seq_id seq_id) const override; + + std::map memory_breakdown() const override; + + // state write/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override; + + // + // llama_kv_cache_dsa specific API + // + + llama_kv_cache * get_mla() const; + llama_kv_cache * get_lid() const; + +private: + // we keep indexer KV cache hparams instance here as llama_kv_cache stores only reference to it + llama_hparams hparams_lid; + const uint32_t n_stream = 1; + + std::unique_ptr kv_mla; + std::unique_ptr kv_lid; +}; + +class llama_kv_cache_dsa_context : public llama_memory_context_i { +public: + using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; + + // used for errors + llama_kv_cache_dsa_context(llama_memory_status status); + + // used to create a full-cache context + llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv); + + // used to create an update context + llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + llama_context * lctx, + bool optimize); + + // used to create a batch processing context from a batch + llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + slot_info_vec_t sinfos_base, + slot_info_vec_t sinfos_ik, + std::vector ubatches); + + virtual ~llama_kv_cache_dsa_context(); + + // + // llama_memory_context_i + // + + bool next() override; + bool apply() override; + + llama_memory_status get_status() const override; + const llama_ubatch & get_ubatch() const override; + + // + // llama_kv_cache_dsa_context specific API + // + + const llama_kv_cache_context * get_mla() const; + const llama_kv_cache_context * get_lid() const; + +private: + //llama_kv_cache_dsa * kv; + + // the index of the next ubatch to process + size_t i_next = 0; + + std::vector ubatches; + + const llama_memory_context_ptr ctx_mla; + const llama_memory_context_ptr ctx_lid; + + const llama_memory_status status; +}; diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 26e2cb4270b..9b9f1790363 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); kv_base = std::make_unique( - model, type_k, type_v, + model, hparams, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( - model, type_k, type_v, + model, hparams, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, hparams.n_swa, hparams.swa_type, filter_swa, reuse); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index a49a055a630..ac11f96c22d 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux( llama_kv_cache::llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache( llama_swa_type swa_type, const layer_filter_cb & filter, const layer_reuse_cb & reuse) : - model(model), hparams(model.hparams), v_trans(v_trans), + model(model), hparams(hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { GGML_ASSERT(kv_size % n_pad == 0); @@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache( // allocate tensors and initialize the buffers to avoid NaNs in the padding for (auto & [buft, ctx] : ctx_map) { ggml_backend_buffer_t buf; - if (model.hparams.no_alloc) { + if (hparams.no_alloc) { buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it @@ -293,6 +294,11 @@ llama_kv_cache::llama_kv_cache( ggml_is_quantized(type_k) && hparams.n_embd_head_k() % 64 == 0; + // always create Hadamard rotation tensors for DeepSeek V3.2 DSA lightning indexer + if (model.arch == LLM_ARCH_DEEPSEEK32 && hparams.n_embd_head_k_full == hparams.indexer_head_size) { + attn_rot_k = true; + } + attn_rot_v = !attn_rot_disable && n_embd_head_v_all > 0 && @@ -1430,8 +1436,8 @@ struct args_set_input_kq_mask { int64_t n_tps; }; -template -static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) { +template +static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) { //const auto & hparams = args.hparams; const auto & ubatch = args.ubatch; @@ -1445,6 +1451,9 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * const int64_t n_stream = args.n_stream; const int64_t n_tps = args.n_tps; + const T mask_keep = llama_cast(0.0f); + const T mask_drop = llama_cast(-INFINITY); + // the min position in the batch for each sequence llama_pos seq_pos_min[LLAMA_MAX_SEQ]; std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX); @@ -1563,46 +1572,55 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * } if (alibi) { - data[idst + j] = -std::abs(p0 - p1); + data[idst + j] = llama_cast(static_cast(-std::abs(p0 - p1))); } else { - data[idst + j] = 0.0f; + data[idst + j] = mask_keep; } continue; skip: - data[idst + j] = -INFINITY; + data[idst + j] = mask_drop; } } } } -template -static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) { +template +static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) { const bool alibi = args.hparams.use_alibi; if (alibi) { - set_input_kq_mask_impl (args, data); + set_input_kq_mask_impl (args, data); } else { - set_input_kq_mask_impl(args, data); + set_input_kq_mask_impl(args, data); } } -template -static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) { +template +static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) { const bool is_2d = args.ubatch->is_pos_2d(); if (is_2d) { - set_input_kq_mask_impl (args, data); + set_input_kq_mask_impl (args, data); } else { - set_input_kq_mask_impl(args, data); + set_input_kq_mask_impl(args, data); } } -template -static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) { +template +static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data) { const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE; if (swa) { - set_input_kq_mask_impl (args, data); + set_input_kq_mask_impl (args, data); + } else { + set_input_kq_mask_impl(args, data); + } +} + +template +static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data, bool causal_attn) { + if (causal_attn) { + set_input_kq_mask_impl (args, data); } else { - set_input_kq_mask_impl(args, data); + set_input_kq_mask_impl(args, data); } } @@ -1610,7 +1628,6 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u const uint32_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - float * data = (float *) dst->data; const int64_t n_kv = dst->ne[0]; const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch @@ -1634,10 +1651,10 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u /*.n_tps =*/ n_tps, }; - if (causal_attn) { - set_input_kq_mask_impl (args, data); + if (dst->type == GGML_TYPE_F16) { + set_input_kq_mask_impl(args, (ggml_fp16_t *) dst->data, causal_attn); } else { - set_input_kq_mask_impl(args, data); + set_input_kq_mask_impl(args, (float *) dst->data, causal_attn); } //const int64_t t_end = ggml_time_us(); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0b62dc7b232..649269af6dd 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -93,8 +93,12 @@ class llama_kv_cache : public llama_memory_i { using slot_info_vec_t = std::vector; + // TODO: refactor the memory instances to not depend on `llama_model` + // instead pass all necessary info (e.g. hparams, dev layers, arch, etc.) directly + // likely through `struct llama_memory_params` llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index 10e6b459797..72f5c2fea72 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -24,6 +24,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( uint32_t rs_size, /* common */ uint32_t n_seq_max, + uint32_t n_rs_seq, bool offload, bool unified, /* layer filters */ @@ -54,6 +55,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( offload, rs_size, n_seq_max, + n_rs_seq, filter_recr == nullptr ? [&](int32_t il) { return hparams.is_recurrent(il); } : filter_recr @@ -73,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_base()->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_base()->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-hybrid-iswa.h b/src/llama-memory-hybrid-iswa.h index 807c8aac96c..c9d3f9f57c5 100644 --- a/src/llama-memory-hybrid-iswa.h +++ b/src/llama-memory-hybrid-iswa.h @@ -34,6 +34,7 @@ class llama_memory_hybrid_iswa : public llama_memory_i { uint32_t rs_size, /* common */ uint32_t n_seq_max, + uint32_t n_rs_seq, bool offload, bool unified, /* layer filters */ diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 4ce1af592c1..6bd2ec18ce3 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -24,6 +24,7 @@ llama_memory_hybrid::llama_memory_hybrid( uint32_t rs_size, /* common */ uint32_t n_seq_max, + uint32_t n_rs_seq, bool offload, bool unified, /* layer filters */ @@ -32,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid( hparams(model.hparams), mem_attn(new llama_kv_cache( model, + model.hparams, type_k, type_v, v_trans, @@ -54,6 +56,7 @@ llama_memory_hybrid::llama_memory_hybrid( offload, rs_size, n_seq_max, + n_rs_seq, filter_recr == nullptr ? [&](int32_t il) { return hparams.is_recurrent(il); } : filter_recr @@ -73,9 +76,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h index 558cafdf984..484eafb7499 100644 --- a/src/llama-memory-hybrid.h +++ b/src/llama-memory-hybrid.h @@ -34,6 +34,7 @@ class llama_memory_hybrid : public llama_memory_i { uint32_t rs_size, /* common */ uint32_t n_seq_max, + uint32_t n_rs_seq, bool offload, bool unified, /* layer filters */ diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index c07f1d969cb..ec5dc5835dd 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -24,6 +24,7 @@ llama_memory_recurrent::llama_memory_recurrent( bool offload, uint32_t mem_size, uint32_t n_seq_max, + uint32_t n_rs_seq, const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) { const int32_t n_layer = hparams.n_layer; @@ -31,6 +32,9 @@ llama_memory_recurrent::llama_memory_recurrent( size = mem_size; used = 0; + this->n_rs_seq = n_rs_seq; + rs_idx.assign(n_seq_max, 0); + cells.clear(); cells.resize(mem_size); @@ -92,8 +96,9 @@ llama_memory_recurrent::llama_memory_recurrent( throw std::runtime_error("failed to create ggml context for rs cache"); } - ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), mem_size); - ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), mem_size); + const uint32_t n_rows = mem_size * (1 + n_rs_seq); + ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), n_rows); + ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), n_rows); ggml_format_name(r, "cache_r_l%d", i); ggml_format_name(s, "cache_s_l%d", i); r_l[i] = r; @@ -115,8 +120,8 @@ llama_memory_recurrent::llama_memory_recurrent( const size_t memory_size_r = size_r_bytes(); const size_t memory_size_s = size_s_bytes(); - LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__, - (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max, + LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs %2u rs_seq), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__, + (float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max, n_rs_seq, ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f), ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f)); } @@ -138,10 +143,11 @@ void llama_memory_recurrent::clear(bool data) { ggml_backend_buffer_clear(buf.get(), 0); } } + + std::fill(rs_idx.begin(), rs_idx.end(), 0); } bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1); uint32_t new_head = size; if (p0 < 0) { @@ -152,6 +158,15 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1 = std::numeric_limits::max(); } + const bool rm_all = p0 == 0 && p1 == std::numeric_limits::max(); + if (rm_all) { + if (seq_id >= 0) { + set_rs_idx(seq_id, 0); + } else { + std::fill(rs_idx.begin(), rs_idx.end(), 0); + } + } + // models like Mamba or RWKV can't have a state partially erased at the end // of the sequence because their state isn't preserved for previous tokens if (seq_id >= (int64_t) size) { @@ -161,10 +176,16 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos if (0 <= seq_id) { int32_t & tail_id = cells[seq_id].tail; if (tail_id >= 0) { - const auto & cell = cells[tail_id]; - // partial intersection is invalid if it includes the final pos + auto & cell = cells[tail_id]; + + // partial rollback via per-token snapshot index (bounded by n_rs_seq) if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) { - //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1); + const llama_pos rollback = cell.pos - (p0 - 1); + if (rollback >= 1 && rollback <= (llama_pos) n_rs_seq) { + set_rs_idx(seq_id, (uint32_t) rollback); + cell.pos = p0 - 1; + return true; + } return false; } // invalidate tails which will be cleared @@ -368,6 +389,13 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const { return result; } +void llama_memory_recurrent::set_rs_idx(llama_seq_id seq_id, uint32_t idx) { + if (seq_id < 0 || (size_t) seq_id >= rs_idx.size()) { + return; + } + rs_idx[seq_id] = (idx > n_rs_seq) ? n_rs_seq : idx; +} + std::map llama_memory_recurrent::memory_breakdown() const { std::map ret; for (const auto & [_, buf] : ctxs_bufs) { @@ -388,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // TODO: non-sequential equal split can be done if using unified KV cache - // for simplicity, we always use sequential equal split for now - ubatch = balloc.split_equal(n_ubatch, true); + if (n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // TODO: non-sequential equal split can be done if using unified KV cache + // for simplicity, we always use sequential equal split for now + ubatch = balloc.split_equal(n_ubatch, true); + } } if (ubatch.n_tokens == 0) { @@ -703,6 +737,7 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq GGML_UNUSED(flags); std::vector> cell_ranges; // ranges, from inclusive, to exclusive + std::vector> cell_ranges_data; // logical source row ranges uint32_t cell_count = 0; // Count the number of cells with the specified seq_id @@ -712,6 +747,35 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq const auto & cell = cells[i]; if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { ++cell_count; + uint32_t rs_idx_cur = 0; + + if (n_rs_seq != 0) { + if (seq_id != -1) { + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < rs_idx.size()); + rs_idx_cur = rs_idx[seq_id]; + } else { + bool has_rs_idx = false; + for (const llama_seq_id cell_seq_id : cell.seq_id) { + GGML_ASSERT(cell_seq_id >= 0 && (size_t) cell_seq_id < rs_idx.size()); + + const uint32_t seq_rs_idx = rs_idx[cell_seq_id]; + if (!has_rs_idx) { + rs_idx_cur = seq_rs_idx; + has_rs_idx = true; + } else if (rs_idx_cur != seq_rs_idx) { + GGML_ABORT("cannot write shared recurrent state with different rollback indices"); + } + } + } + } + + const uint32_t cell_id = rs_idx_cur * size + (cell.src >= 0 ? cell.src : (int32_t) i); + if (cell_ranges_data.empty() || cell_ranges_data.back().second != cell_id) { + cell_ranges_data.emplace_back(cell_id, cell_id + 1); + } else { + cell_ranges_data.back().second++; + } + if (cell_range_begin == size) { cell_range_begin = i; } @@ -726,7 +790,7 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq cell_ranges.emplace_back(cell_range_begin, size); } - if (flags % LLAMA_STATE_SEQ_FLAGS_ON_DEVICE && cell_ranges.size() > 1) { + if ((flags & LLAMA_STATE_SEQ_FLAGS_ON_DEVICE) && cell_ranges.size() > 1) { GGML_ABORT("cannot save/load multiple ranges of cells to/from device memory\n"); } @@ -737,10 +801,16 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq } GGML_ASSERT(cell_count == cell_count_check); + cell_count_check = 0; + for (const auto & range : cell_ranges_data) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + io.write(&cell_count, sizeof(cell_count)); state_write_meta(io, cell_ranges, seq_id); - state_write_data(io, cell_ranges); + state_write_data(io, cell_ranges_data); } void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { @@ -762,6 +832,14 @@ void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_i } throw std::runtime_error("failed to restore kv cache"); } + + if (n_rs_seq != 0) { + if (seq_id == -1) { + std::fill(rs_idx.begin(), rs_idx.end(), 0); + } else { + set_rs_idx(seq_id, 0); + } + } } void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { @@ -804,7 +882,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r()); io.write(&r_size_row, sizeof(r_size_row)); - // Write each range of cells of r_size_row length + // Write each logical cell row range. With pending recurrent rollback, + // the logical current state may live in a rollback snapshot plane. for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * r_size_row; @@ -825,7 +904,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s()); io.write(&s_size_row, sizeof(s_size_row)); - // Write each range of S tensor rows + // Write each logical cell row range. With pending recurrent rollback, + // the logical current state may live in a rollback snapshot plane. for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * s_size_row; @@ -852,9 +932,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // Write GQA embedding size io.write(&n_embd_s, sizeof(n_embd_s)); - // For each row, we get the element values of each cell + // For each row, we get the element values of each logical cell for (uint32_t j = 0; j < n_embd_s; ++j) { - // Write each range of cells of s_size_el length for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * mem_size) * s_size_el; @@ -1163,5 +1242,21 @@ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const { } int32_t llama_memory_recurrent_context::s_copy(int i) const { - return mem->cells[i + mem->head].src0; + const uint32_t cell_idx = i + mem->head; + const int32_t src0 = mem->cells[cell_idx].src0; + + if (mem->n_rs_seq == 0) { + return src0; + } + + uint32_t idx = 0; + if (!mem->cells[cell_idx].seq_id.empty()) { + const llama_seq_id seq = *mem->cells[cell_idx].seq_id.begin(); + if (seq >= 0 && (size_t) seq < mem->rs_idx.size()) { + idx = mem->rs_idx[seq]; + // reset rollback idx + mem->rs_idx[seq] = 0; + } + } + return (int32_t)(idx * mem->size) + src0; } diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index 47f01d73912..b13b7b748f5 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -23,6 +23,7 @@ class llama_memory_recurrent : public llama_memory_i { bool offload, uint32_t mem_size, uint32_t n_seq_max, + uint32_t n_rs_seq, const layer_filter_cb & filter); ~llama_memory_recurrent() = default; @@ -69,6 +70,14 @@ class llama_memory_recurrent : public llama_memory_i { uint32_t size = 0; // total number of cells, shared across all sequences uint32_t used = 0; // used cells (i.e. at least one seq_id) + // number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups + uint32_t n_rs_seq = 0; + + // per-seq rollback index + std::vector rs_idx; + + void set_rs_idx(llama_seq_id seq_id, uint32_t idx); + // computed before each graph build uint32_t n = 0; diff --git a/src/llama-memory.h b/src/llama-memory.h index 4a157b91fdb..4ad1612e45b 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -1,6 +1,7 @@ #pragma once #include "llama.h" +#include "llama-graph.h" #include #include @@ -20,6 +21,8 @@ struct llama_memory_params { // use full-size SWA cache bool swa_full; + + llama_context_type ctx_type; }; enum llama_memory_status { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 4e65a45a50d..c645d0785ab 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1312,9 +1312,16 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte return tensor; } -void llama_model_loader::done_getting_tensors() const { - if (n_created != n_tensors) { - throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); +void llama_model_loader::done_getting_tensors(bool partial) const { + if (n_created > n_tensors) { + throw std::runtime_error(format("%s: too many tensors created; expected %d, got %d", __func__, n_tensors, n_created)); + } + if (n_created < n_tensors) { + if (!partial) { + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + } + LLAMA_LOG_INFO("%s: partial load โ€” used %d of %d tensors in the file (rest belong to a sibling model on the same .gguf)\n", + __func__, n_created, n_tensors); } if (n_tensors_moved > 0) { LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n", diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 7b3d6703c03..c476026d3e5 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -184,7 +184,7 @@ struct llama_model_loader { struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list & ne, size_t offset, bool required = true); - void done_getting_tensors() const; + void done_getting_tensors(bool partial = false) const; void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr); diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index e83056557bf..528e4c9c069 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -393,6 +393,8 @@ void llama_model_saver::add_tensors_from_model() { add_tensor(model->output); add_tensor(model->output_b); add_tensor(model->output_norm_enc); + add_tensor(model->output_s); + add_tensor(model->output_in_s); add_tensor(model->cls); add_tensor(model->cls_b); add_tensor(model->cls_out); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ff30a2ae7a6..3e236f8c17d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -10,6 +10,7 @@ #include "llama-kv-cache.h" #include "llama-kv-cache-iswa.h" +#include "llama-kv-cache-dsa.h" #include "llama-memory-hybrid.h" #include "llama-memory-hybrid-iswa.h" #include "llama-memory-recurrent.h" @@ -44,6 +45,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_llama_embed(params); case LLM_ARCH_MAINCODER: return new llama_model_maincoder(params); + case LLM_ARCH_TALKIE: + return new llama_model_talkie(params); case LLM_ARCH_DECI: return new llama_model_deci(params); case LLM_ARCH_BAICHUAN: @@ -170,6 +173,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_deepseek2(params); case LLM_ARCH_DEEPSEEK2OCR: return new llama_model_deepseek2ocr(params); + case LLM_ARCH_DEEPSEEK32: + return new llama_model_deepseek32(params); case LLM_ARCH_GLM_DSA: return new llama_model_glm_dsa(params); case LLM_ARCH_MISTRAL4: @@ -405,16 +410,16 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str auto get_tensor_config = [&]() -> tensor_config { // standard attention if (std::regex_match(tensor_name, pattern_q_weight) || std::regex_match(tensor_name, pattern_kv_weight)) { - return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight"); + return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight"); } if (std::regex_match(tensor_name, pattern_q_bias) || std::regex_match(tensor_name, pattern_kv_bias)) { - return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight"); + return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight"); } if (std::regex_match(tensor_name, pattern_qkv_weight)) { - return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1); + return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight"); } if ( std::regex_match(tensor_name, pattern_qkv_bias)) { - return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0); + return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "attn_output.weight", "ssm_out.weight"); } if (std::regex_match(tensor_name, pattern_qk_norm)) { return get_tensor_config_impl(tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight"); @@ -430,7 +435,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str } if (std::regex_match(tensor_name, pattern_attn_gate_weight)) { - return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1); + return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_1, "attn_output.weight", "ssm_out.weight"); } if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a)) { return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_0, "ssm_out.weight"); @@ -483,7 +488,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str return get_tensor_config_impl(GGML_BACKEND_SPLIT_AXIS_MIRRORED); }; - auto get_split_segments = [&](int axis, uint32_t il) -> std::vector { + auto get_split_segments = [&](int axis, uint32_t il) -> std::vector> { if (ud->model->arch == LLM_ARCH_QWEN3NEXT || ud->model->arch == LLM_ARCH_QWEN35 || ud->model->arch == LLM_ARCH_QWEN35MOE) { const int64_t head_k_dim = hparams.ssm_d_state; const int64_t head_v_dim = hparams.ssm_d_state; @@ -498,26 +503,26 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str if (ud->model->arch == LLM_ARCH_QWEN3NEXT) { if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) { GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim); - return {key_dim, key_dim, value_dim}; + return {{key_dim, 2}, {value_dim, 1}}; } } else { const int64_t head_ratio = n_v_heads / n_k_heads; if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) { GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim); - return std::vector(2 + head_ratio, key_dim); + return {{key_dim, 2 + head_ratio}}; } if (std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_out_weight)) { - return std::vector(head_ratio, key_dim); + return {{key_dim, head_ratio}}; } if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) || std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) { - return std::vector(head_ratio, n_k_heads); + return {{n_k_heads, head_ratio}}; } if (std::regex_match(tensor_name, pattern_r_cache)) { - return std::vector(2 + head_ratio, key_dim * (hparams.ssm_d_conv - 1)); + return {{key_dim * (hparams.ssm_d_conv - 1), 2 + head_ratio}}; } if (std::regex_match(tensor_name, pattern_s_cache)) { - return std::vector(head_ratio, n_k_heads * head_v_dim * head_v_dim); + return {{n_k_heads * head_v_dim * head_v_dim, head_ratio}}; } } @@ -525,9 +530,9 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) { const int64_t n_ff_exp = hparams.n_ff_exp; GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp); - return {n_ff_exp, n_ff_exp}; + return {{n_ff_exp, 2}}; } - return {tensor->ne[axis]}; + return {{tensor->ne[axis], 1}}; } if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) { @@ -535,17 +540,17 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str const int64_t n_embd_gqa = hparams.n_embd_v_gqa(il); GGML_ASSERT(hparams.n_embd_k_gqa() == n_embd_gqa); GGML_ASSERT(tensor->ne[axis] == n_embd + 2*n_embd_gqa); - return {n_embd, n_embd_gqa, n_embd_gqa}; + return {{n_embd, 1}, {n_embd_gqa, 2}}; } if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) { const int64_t n_ff_exp = hparams.n_ff_exp; GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp); - return {n_ff_exp, n_ff_exp}; + return {{n_ff_exp, 2}}; } - return {tensor->ne[axis]}; + return {{tensor->ne[axis], 1}}; }; - auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector & segments) -> std::vector { + auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector> & segments) -> std::vector { if (hparams.is_recurrent(il)) { // linear attention const int64_t head_dim = hparams.ssm_d_state; @@ -598,16 +603,16 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str return {granularity_kv}; } if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_qkv_bias)) { - GGML_ASSERT(segments.size() == 3); - return {granularity_q, granularity_kv, granularity_kv}; + GGML_ASSERT(segments.size() == 2); + return {granularity_q, granularity_kv}; } } // FFN if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) || std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) { - GGML_ASSERT(segments.size() <= 2); - return std::vector(segments.size(), blck_size); + GGML_ASSERT(segments.size() == 1); + return {blck_size}; } // everything else @@ -631,11 +636,12 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str tensor_split_scan[j] += tensor_split_scan[j - 1]; } } - const std::vector segments = get_split_segments(split_state.axis, tc.il); + const std::vector> segments = get_split_segments(split_state.axis, tc.il); const std::vector granularity = get_split_granularity(blck_size, tc.il, segments); for (size_t is = 0; is < segments.size(); is++) { - const int64_t ne_s = segments[is]; - const int64_t g_s = granularity[is]; + const int64_t ne_s = segments[is].first; + const uint32_t nr_s = segments[is].second; + const int64_t g_s = granularity[is]; GGML_ASSERT(ne_full % g_s == 0); int64_t low = 0; size_t j = 0; @@ -649,10 +655,12 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str low = high; } split_state.ne[is*ud->n_devices + (j + tc.rotation) % ud->n_devices] = ne_s - low; + split_state.nr[is] = nr_s; } split_state.n_segments = segments.size(); } else { memset(split_state.ne, 0, sizeof(split_state.ne)); + split_state.nr[0] = 1; split_state.n_segments = 1; } return split_state; @@ -777,6 +785,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_310B_A15B: return "310B.A15B"; case LLM_TYPE_355B_A32B: return "355B.A32B"; case LLM_TYPE_397B_A17B: return "397B.A17B"; + case LLM_TYPE_685B_A37B: return "685B.A37B"; case LLM_TYPE_744B_A40B: return "744B.A40B"; case LLM_TYPE_E2B: return "E2B"; case LLM_TYPE_E4B: return "E4B"; @@ -1334,6 +1343,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_s && layer.ssm_beta) { layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } // input scales if (!layer.wq_in_s && layer.wq) { @@ -1393,11 +1408,30 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_in_s && layer.ssm_beta) { layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + } + // output scales + if (output && output->type == GGML_TYPE_NVFP4) { + // weight scale + if (!output_s) { + output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED); + } + // input scale + if (!output_in_s) { + output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED); + } } } - ml.done_getting_tensors(); + GGML_ASSERT(!(output && tok_embd && + strcmp(output->name, tok_embd->name) == 0 && + output->type == GGML_TYPE_NVFP4)); // populate tensors_by_name for (auto & [_, ctx_ptr] : ml.ctx_map) { for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) { @@ -1742,7 +1776,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); } - if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { + if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); @@ -1930,10 +1964,33 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, { res = nullptr; } break; + case LLM_ARCH_DEEPSEEK32: + { + res = new llama_kv_cache_dsa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + 1, + hparams.n_swa, + hparams.swa_type, + nullptr, + nullptr); + } break; // Models that need standard caching should rely on recurrent/hybrid // checks default: { + // The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain + // attention KV cache for the MTP context instead of the hybrid wrapper. + const bool mtp_on_hybrid_qwen35 = + params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && + (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE); + if (llm_arch_is_recurrent(arch)) { res = new llama_memory_recurrent( *this, @@ -1942,8 +1999,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.offload_kqv, std::max((uint32_t) 1, cparams.n_seq_max), cparams.n_seq_max, + cparams.n_rs_seq, nullptr); - } else if (llm_arch_is_hybrid(arch)) { + } else if (llm_arch_is_hybrid(arch) && !mtp_on_hybrid_qwen35) { // The main difference between hybrid architectures is the // layer filters, so pick the right one here llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; @@ -1958,6 +2016,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, filter_recr = [&](int32_t il) { return hparams.is_recurrent(il) && hparams.n_ff(il) == 0; }; + } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + filter_attn = [&, n_main](int32_t il) { + return (uint32_t)il < n_main && !hparams.is_recurrent(il); + }; + filter_recr = [&, n_main](int32_t il) { + return (uint32_t)il < n_main && hparams.is_recurrent(il); + }; } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { @@ -1975,6 +2041,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* recurrent_type_s */ GGML_TYPE_F32, /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max), /* n_seq_max */ cparams.n_seq_max, + /* n_rs_seq */ cparams.n_rs_seq, /* offload */ cparams.offload_kqv, /* unified */ cparams.kv_unified, /* filter_attn */ std::move(filter_attn), @@ -1993,6 +2060,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* recurrent_type_v */ GGML_TYPE_F32, /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max), /* n_seq_max */ cparams.n_seq_max, + /* n_rs_seq */ cparams.n_rs_seq, /* offload */ cparams.offload_kqv, /* unified */ cparams.kv_unified, /* filter_attn */ std::move(filter_attn), @@ -2000,6 +2068,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } } else { llama_memory_i::layer_reuse_cb reuse = nullptr; + llama_kv_cache::layer_filter_cb filter = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { reuse = [&](int32_t il) { @@ -2011,6 +2080,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, }; } + if (mtp_on_hybrid_qwen35) { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + } + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); @@ -2026,13 +2100,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_seq_max, cparams.n_ubatch, 1, - nullptr, + filter, reuse); } else { GGML_ASSERT(!hparams.is_swa_any()); res = new llama_kv_cache( *this, + hparams, params.type_k, params.type_v, !cparams.flash_attn, @@ -2043,7 +2118,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, 1, hparams.n_swa, hparams.swa_type, - nullptr, + filter, nullptr); } } @@ -2146,6 +2221,7 @@ int32_t llama_model_n_swa(const llama_model * model) { return model->hparams.n_swa; } + uint32_t llama_model_n_cls_out(const struct llama_model * model) { return model->hparams.n_cls_out; } @@ -2221,6 +2297,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_DEEPSEEK2OCR: + case LLM_ARCH_DEEPSEEK32: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GRANITE: @@ -2304,6 +2381,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: + case LLM_ARCH_TALKIE: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/src/llama-model.h b/src/llama-model.h index d63c689185a..743feb970d9 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -137,6 +137,7 @@ enum llm_type { LLM_TYPE_310B_A15B, // /MiMo-V2-Flash LLM_TYPE_355B_A32B, // GLM-4.5 LLM_TYPE_397B_A17B, // Qwen3.5 + LLM_TYPE_685B_A37B, // DeepSeek V3.2 LLM_TYPE_744B_A40B, // GLM-5 LLM_TYPE_E2B, LLM_TYPE_E4B, @@ -202,12 +203,16 @@ struct llama_layer_shortconv { }; struct llama_layer_nextn { - struct ggml_tensor * eh_proj = nullptr; - struct ggml_tensor * embed_tokens = nullptr; - struct ggml_tensor * enorm = nullptr; - struct ggml_tensor * hnorm = nullptr; - struct ggml_tensor * shared_head_head = nullptr; - struct ggml_tensor * shared_head_norm = nullptr; + struct ggml_tensor * eh_proj = nullptr; + struct ggml_tensor * eh_proj_s = nullptr; + struct ggml_tensor * eh_proj_in_s = nullptr; + struct ggml_tensor * embed_tokens = nullptr; + struct ggml_tensor * enorm = nullptr; + struct ggml_tensor * hnorm = nullptr; + struct ggml_tensor * shared_head_head = nullptr; + struct ggml_tensor * shared_head_head_s = nullptr; + struct ggml_tensor * shared_head_head_in_s = nullptr; + struct ggml_tensor * shared_head_norm = nullptr; }; struct llama_layer { @@ -484,7 +489,7 @@ struct llama_layer { struct ggml_tensor * indexer_attn_k = nullptr; struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias - // gemma4 layer output scale + // gemma4 layer output scale, reused for talkie embedding skip scale struct ggml_tensor * out_scale = nullptr; struct llama_layer_posnet posnet; @@ -533,6 +538,11 @@ struct llama_model { struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; + + // NVFP4 per-tensor scale2, input_scale for LM head + struct ggml_tensor * output_s = nullptr; + struct ggml_tensor * output_in_s = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 4b5439b3ab6..9c32cf6a85e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -542,6 +542,21 @@ struct llm_tokenizer_bpe : llm_tokenizer { }; byte_encode = false; break; + case LLAMA_VOCAB_PRE_TYPE_MINICPM5: + regex_exprs = { + // original regex from tokenizer.json (openbmb/MiniCPM5-1B) + "\\p{N}{1,3}", + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; + case LLAMA_VOCAB_PRE_TYPE_WHITESPACE: + // whitespace pre-tokenizer (jinaai/jina-embeddings-v2-base-zh) + regex_exprs = { + "\\S+", + }; + byte_encode = false; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -563,7 +578,11 @@ struct llm_tokenizer_bpe_session { vocab(vocab), tokenizer(tokenizer) {} - static void append(const llama_token token_id, std::vector & output) { output.push_back(token_id); } + virtual ~llm_tokenizer_bpe_session() = default; + + static void append(const llama_token token_id, std::vector & output) { + output.push_back(token_id); + } bool append_bos(std::vector & output) const { if (vocab.get_add_bos()) { @@ -600,9 +619,9 @@ struct llm_tokenizer_bpe_session { } } - void tokenize(const std::string & text, std::vector & output) { - int final_prev_index = -1; - const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode); + virtual void tokenize(const std::string & text, std::vector & output) { + int final_prev_index = -1; + const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode); symbols_final.clear(); auto tok_pre = vocab.get_pre_type(); @@ -770,7 +789,7 @@ struct llm_tokenizer_wpm_session { void tokenize(const std::string & text, std::vector & output) { // normalize and split by whitespace - std::vector words = preprocess(text); + std::vector words = preprocess(text, vocab.get_normalizer_lowercase()); // bos token prepended already // find the longest tokens that form the words @@ -815,7 +834,7 @@ struct llm_tokenizer_wpm_session { } // TODO: reduce string copies by using cpts_offs array - static std::vector preprocess(const std::string & text) { + static std::vector preprocess(const std::string & text, bool lowercase) { const std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); std::vector words(1, ""); @@ -834,8 +853,8 @@ struct llm_tokenizer_wpm_session { continue; } - const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt)); - if (flags.is_punctuation || (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt)) { + const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt); + if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { if (words.back().size()) { // finish previous word if any words.emplace_back(); } @@ -1614,6 +1633,117 @@ struct llm_tokenizer_plamo2_session { const llm_tokenizer_plamo2 & tokenizer; }; +// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical +// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token +// text at load +static const std::string dna_kmer_marker = "\xee\x80\x80"; + +struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session { + llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} + + void tokenize(const std::string & text, std::vector & output) override { + static const std::string open_tag = ""; + static const std::string close_tag = ""; + + const auto dna_begin_id = vocab.text_to_token(open_tag); + const auto dna_end_id = vocab.text_to_token(close_tag); + const auto dna_oov_id = vocab.text_to_token(""); + + // Fall back to plain BPE if the DNA pieces aren't in the vocab. + if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) { + llm_tokenizer_bpe_session::tokenize(text, output); + return; + } + + const size_t k = 6; + size_t pos = 0; + + while (pos < text.size()) { + const size_t start = text.find(open_tag, pos); + if (start == std::string::npos) { + if (pos < text.size()) { + llm_tokenizer_bpe_session::tokenize(text.substr(pos), output); + } + break; + } + if (start > pos) { + llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output); + } + output.push_back(dna_begin_id); + + const size_t content_start = start + open_tag.size(); + const size_t end = text.find(close_tag, content_start); + const size_t content_end = (end == std::string::npos) ? text.size() : end; + + emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output); + + if (end == std::string::npos) { + break; + } + output.push_back(dna_end_id); + pos = end + close_tag.size(); + } + } + +private: + void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector & output) { + std::string seq = raw; + for (char & c : seq) { + if (c >= 'a' && c <= 'z') { + c = char(c - 32); + } + } + + // k-mers carry the reserved marker suffix; a non-ACGT k-mer simply + // isn't in the vocab and falls back to + auto kmer_token = [&](const std::string & kmer) { + const auto tok = vocab.text_to_token(kmer + dna_kmer_marker); + return tok != LLAMA_TOKEN_NULL ? tok : oov_id; + }; + + size_t i = 0; + for (; i + k <= seq.size(); i += k) { + output.push_back(kmer_token(seq.substr(i, k))); + } + if (i < seq.size()) { + std::string kmer = seq.substr(i); + kmer.append(k - kmer.size(), 'A'); + output.push_back(kmer_token(kmer)); + } + } + + const llama_vocab & vocab; +}; + +struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session { + llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} + + void tokenize(const std::string & text, std::vector & output) override { + const bool lowercase = vocab.get_normalizer_lowercase(); + + std::string segment; + auto flush = [&]() { + if (!segment.empty()) { + llm_tokenizer_bpe_session::tokenize(segment, output); + segment.clear(); + } + }; + + for (uint32_t cpt : unicode_cpts_from_utf8(text)) { + // drop whitespace + if (unicode_cpt_flags_from_cpt(cpt).is_whitespace) { + flush(); + } else { + segment += unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt); + } + } + flush(); + } + +private: + const llama_vocab & vocab; +}; + // // impl // @@ -1692,6 +1822,7 @@ struct llama_vocab::impl { bool remove_extra_whitespaces = false; bool escape_whitespaces = true; bool treat_whitespace_as_suffix = false; + bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json) std::unordered_map token_to_id; std::vector id_to_token; @@ -1827,7 +1958,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { special_mask_id = 103; add_sep = true; - } else if (tokenizer_model == "gpt2") { + } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna" || tokenizer_model == "whitespace") { type = LLAMA_VOCAB_TYPE_BPE; // read bpe merges and populate bpe ranks @@ -1975,10 +2106,20 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if (tokenizer_pre == "default") { pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - } else if (tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe" || - tokenizer_pre == "falcon3" || tokenizer_pre == "falcon-h1" || tokenizer_pre == "pixtral" || - tokenizer_pre == "midm-2.0" || tokenizer_pre == "lfm2" || tokenizer_pre == "jina-v5-nano") { - pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; + } else if (tokenizer_pre == "minicpm5") { + pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5; + ignore_merges = true; + } else if ( + tokenizer_pre == "llama3" || + tokenizer_pre == "llama-v3" || + tokenizer_pre == "llama-bpe"|| + tokenizer_pre == "falcon3" || + tokenizer_pre == "falcon-h1" || + tokenizer_pre == "pixtral" || + tokenizer_pre == "midm-2.0" || + tokenizer_pre == "lfm2" || + tokenizer_pre == "jina-v5-nano") { + pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; } else if (tokenizer_pre == "deepseek-llm") { @@ -2020,8 +2161,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "jina-v2-code" || tokenizer_pre == "roberta-bpe") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; - add_sep = true; - } else if (tokenizer_pre == "refact") { + add_sep = true; + } else if ( + tokenizer_pre == "whitespace") { + pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE; + normalizer_lowercase = false; + } else if ( + tokenizer_pre == "refact") { pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT; } else if (tokenizer_pre == "command-r") { pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; @@ -2080,8 +2226,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA; } else if (tokenizer_pre == "megrez") { pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; - } else if (tokenizer_pre == "gpt-4o" || tokenizer_pre == "llama4" || tokenizer_pre == "kanana2") { - pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; + } else if ( + tokenizer_pre == "gpt-4o" || + tokenizer_pre == "llama4" || + tokenizer_pre == "kanana2" || + tokenizer_pre == "talkie") { + pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; clean_spaces = false; } else if (tokenizer_pre == "tiny_aya") { pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA; @@ -2237,6 +2387,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } GGML_ASSERT(id_to_token.size() == token_to_id.size()); + // hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase + // it from id_to_token so the k-mers detokenize to the bare DNA sequence. The + // k-mers are the block right after , so only scan from there. + if (tokenizer_model == "hybriddna") { + const auto idx = token_to_id.find(""); + if (idx != token_to_id.end()) { + auto it = id_to_token.begin() + idx->second + 1; + for (; it != id_to_token.end(); ++it) { + std::string & text = it->text; + if (text.size() > dna_kmer_marker.size() + && text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) { + text.erase(text.size() - dna_kmer_marker.size()); + } + } + } + } + init_tokenizer(type); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' @@ -2330,6 +2497,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } } + // Lowercase normalizer flag (consulted by WPM / whitespace BPE) + ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); + // auto-detect special tokens by text // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_... // for now, we apply this workaround to find the tokens based on their text @@ -3128,11 +3298,21 @@ std::vector llama_vocab::impl::tokenize(const std::string & raw_tex break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe_session session(vocab, *static_cast(tokenizer.get())); // it calls some other methods that are not exist in llm_tokenizer, // here just cast it to bpe tokenizer object + const llm_tokenizer_bpe * tok_bpe = static_cast(tokenizer.get()); + + std::unique_ptr session; + if (vocab.get_tokenizer_model() == "hybriddna") { + session = std::make_unique(vocab, *tok_bpe); + } else if (vocab.get_tokenizer_model() == "whitespace") { + session = std::make_unique(vocab, *tok_bpe); + } else { + session = std::make_unique(vocab, *tok_bpe); + } + if (add_special) { - session.append_bos(output); + session->append_bos(output); } for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -3146,15 +3326,15 @@ std::vector llama_vocab::impl::tokenize(const std::string & raw_tex LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - session.tokenize(text, output); - } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - session.append(fragment.token, output); + session->tokenize(text, output); + } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) + session->append(fragment.token, output); } } if (add_special) { - session.append_eos(output); - session.check_double_bos_eos(output); + session->append_eos(output); + session->check_double_bos_eos(output); } } break; @@ -3820,6 +4000,10 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const { return pimpl->treat_whitespace_as_suffix; } +bool llama_vocab::get_normalizer_lowercase() const { + return pimpl->normalizer_lowercase; +} + int llama_vocab::max_token_len() const { return pimpl->max_token_len; } diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 8b040b912e2..093e5d02cda 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -60,6 +60,8 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51, + LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52, + LLAMA_VOCAB_PRE_TYPE_WHITESPACE = 53, }; struct LLM_KV; @@ -137,6 +139,7 @@ struct llama_vocab { bool get_remove_extra_whitespaces () const; bool get_escape_whitespaces () const; bool get_treat_whitespace_as_suffix() const; + bool get_normalizer_lowercase () const; int max_token_len() const; diff --git a/src/llama.cpp b/src/llama.cpp index dfe30ce8f61..a67fa8039a4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -225,7 +225,9 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama } case GGML_BACKEND_DEVICE_TYPE_IGPU: - igpus.push_back({false, dev}); + if (igpus.empty()) { + igpus.push_back({false, dev}); + } break; case GGML_BACKEND_DEVICE_TYPE_META: GGML_ABORT("fatal error"); @@ -239,8 +241,9 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama // add GPUs model->devices.insert(model->devices.end(), gpus.begin(), gpus.end()); - // add integrated GPUs only if no other devices were found - if (model->devices.empty()) { + // add integrated GPUs only if no discrete GPUs were found + // (RPC servers do not count, otherwise the local iGPU would be dropped on iGPU+RPC setups) + if (gpus.empty()) { model->devices.insert(model->devices.end(), igpus.begin(), igpus.end()); } } diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index 602e3176afd..a7c77ee5d28 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -277,7 +277,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index 136ff702957..bec7136521c 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index 70e86d41130..d086c4717ff 100644 --- a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -148,7 +148,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index d8653a44639..27deadffeb7 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -171,7 +171,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp index 79aa8c90899..9bd04127b25 100644 --- a/src/models/arwkv7.cpp +++ b/src/models/arwkv7.cpp @@ -193,7 +193,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index 4e55290e4e5..4d26081cd5d 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -146,7 +146,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index 030dd4f42a4..fe1ae10864b 100644 --- a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -171,7 +171,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index e7fe3d5b45a..2f0d44a6259 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -210,7 +210,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index b600fb0c954..30b0f3d07d0 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -142,7 +142,7 @@ llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 8510b9e29f8..4bceaefd63b 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -181,7 +181,7 @@ llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index e898eff7939..6766fa71c15 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -151,7 +151,7 @@ llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index e9e85d96713..274dd3342a7 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -143,7 +143,7 @@ llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 79236121bd5..2e231bb3f93 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -150,7 +150,7 @@ llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; ggml_build_forward_expand(gf, cur); diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index 12edbae1094..a514cf88fc6 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -146,7 +146,7 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index decb89f547b..adf7fcaa20f 100644 --- a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -131,7 +131,7 @@ llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index bce6b04bcf9..af71c775365 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -145,7 +145,7 @@ llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 9f1a959c32c..567e3535276 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -181,7 +181,7 @@ llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/deepseek.cpp b/src/models/deepseek.cpp index c7946059662..f52ec9518b6 100644 --- a/src/models/deepseek.cpp +++ b/src/models/deepseek.cpp @@ -185,7 +185,7 @@ llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/deepseek32.cpp b/src/models/deepseek32.cpp new file mode 100644 index 00000000000..c92ab60d166 --- /dev/null +++ b/src/models/deepseek32.cpp @@ -0,0 +1,503 @@ +#include "models.h" + +#include "llama-kv-cache.h" +#include "llama-kv-cache-dsa.h" + +void llama_model_deepseek32::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + hparams.f_norm_eps = 1e-6; // eps for layer norm + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); + + // MoE parameters + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + + // deepseek MLA parameters + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + + // DSA parameters + ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head); + ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); + ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); + + // Expert gating function + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + + if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) { + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // cancel the factor from the convert script + hparams.rope_yarn_log_mul /= 0.1f; + } + + // NextN/MTP parameters + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + + // TODO: when MTP is implemented, this should probably be updated if needed + hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + + switch (hparams.n_layer) { + case 62: type = LLM_TYPE_685B_A37B; break; + default: type = LLM_TYPE_UNKNOWN; + } +} + +void llama_model_deepseek32::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + const bool is_mla = hparams.is_mla(); + if (!is_mla) { + throw std::runtime_error("DEEPSEEK32 architecture requires MLA"); + } + + // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); + + const int64_t n_embd_head_qk_rope = hparams.n_rot(); + const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; + + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + + const int64_t n_ff_exp = hparams.n_ff_exp; + const int64_t n_expert_shared = hparams.n_expert_shared; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + // try to load output.weight, if not found, use token_embd (tied embeddings) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (!output) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + int flags = 0; + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // skip all tensors in the NextN layers + // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later + flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; + } + + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); + layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags); + layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags); + + layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags); + + layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags); + + // note: only old legacy GGUF files will have the unsplit wkv_b tensor in + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); + + // DSA indexer + layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags); + layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags); + layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags); + layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags); + layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags); + if (i < (int) hparams.n_layer_dense_lead) { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); + } else { + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + + if (n_expert == 0) { + throw std::runtime_error("n_expert must be > 0"); + } + if (n_expert_used == 0) { + throw std::runtime_error("n_expert_used must be > 0"); + } + + // MoE branch + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + + // Shared expert branch + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); + } + + // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); + + // Optional tensors + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED); + } + } +} + +std::unique_ptr llama_model_deepseek32::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_deepseek32::graph::graph(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + const bool is_mla = hparams.is_mla(); + GGML_ASSERT(is_mla); + + // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA + const int64_t n_embd_head_k = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v = hparams.n_embd_head_v_mla(); + GGML_UNUSED(n_embd_head_v); + + const int64_t n_embd_head_qk_rope = hparams.n_rot(); + const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope; + + const int64_t n_indexer_head = hparams.indexer_n_head; + const int64_t n_embd_indexer_head = hparams.indexer_head_size; + const int64_t n_embd_indexer_head_rope = hparams.n_rot(); + const int64_t n_embd_indexer_head_nope = n_embd_indexer_head - n_embd_indexer_head_rope; + const uint32_t n_indexer_top_k = hparams.indexer_top_k; + + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. + // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation. + // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + + // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor + GGML_ASSERT(ext_factor >= 0.0f); + const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale)); + + // use the original attn_factor to pre-scale the kq_scale + const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); + const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k)); + + ggml_tensor * cur; + ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + llm_graph_input_attn_k_dsa * inp_attn_dsa = build_attn_inp_k_dsa(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; + for (int il = 0; il < effective_n_layers; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + ggml_tensor * qr = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(qr, "qr", il); + + qr = build_norm(qr, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il); + cb(qr, "qr", il); + + ggml_tensor * top_k = nullptr; + + // lightning indexer + { + ggml_tensor * indexer_q = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_q_b, qr); + cb(indexer_q, "indexer_q", il); + + // split into {n_embd_indexer_head_rope, n_indexer_head, n_tokens} + ggml_tensor * indexer_q_pe = + ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_rope, n_indexer_head, n_tokens, + ggml_row_size(indexer_q->type, n_embd_indexer_head), + ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head, 0); + cb(indexer_q_pe, "indexer_q_pe", il); + + // and {n_embd_indexer_head_nope, n_indexer_head, n_tokens} + ggml_tensor * indexer_q_nope = + ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_nope, n_indexer_head, n_tokens, + ggml_row_size(indexer_q->type, n_embd_indexer_head), + ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head, + ggml_row_size(indexer_q->type, n_embd_indexer_head_nope)); + cb(indexer_q_nope, "indexer_q_nope", il); + + indexer_q_pe = ggml_rope_ext(ctx0, indexer_q_pe, inp_pos, nullptr, n_rot, + LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(indexer_q_pe, "indexer_q_pe", il); + + // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, n_head, n_tokens} + indexer_q = ggml_concat(ctx0, indexer_q_pe, indexer_q_nope, 0); + cb(indexer_q, "indexer_q", il); + + ggml_tensor * indexer_k = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_k, cur); + cb(indexer_k, "indexer_k", il); + + indexer_k = build_norm(indexer_k, model.layers[il].indexer_k_norm, model.layers[il].indexer_k_norm_b, LLM_NORM, il); + cb(indexer_k, "indexer_k", il); + + // split into {n_embd_indexer_head_rope, 1, n_tokens} + ggml_tensor * indexer_k_pe = + ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_rope, 1, n_tokens, + ggml_row_size(indexer_k->type, n_embd_indexer_head), + ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1, 0); + cb(indexer_k_pe, "indexer_k_pe", il); + + // and {n_embd_indexer_head_nope, 1, n_tokens} + ggml_tensor * indexer_k_nope = + ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_nope, 1, n_tokens, + ggml_row_size(indexer_k->type, n_embd_indexer_head), + ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1, + ggml_row_size(indexer_k->type, n_embd_indexer_head_nope)); + cb(indexer_k_nope, "indexer_k_nope", il); + + indexer_k_pe = ggml_rope_ext(ctx0, indexer_k_pe, inp_pos, nullptr, n_rot, + LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(indexer_k_pe, "indexer_k_pe", il); + + // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, 1, n_tokens} + indexer_k = ggml_concat(ctx0, indexer_k_pe, indexer_k_nope, 0); + cb(indexer_k, "indexer_k", il); + + // perform Hadamard transform on indexer q and k + indexer_q = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_q); + cb(indexer_q, "indexer_q", il); + indexer_k = ggml_mul_mat(ctx0, inp_attn_dsa->self_k_rot_lid, indexer_k); + cb(indexer_k, "indexer_k", il); + + // store indexer keys to KV cache + const auto * mctx_lid = inp_attn_dsa->mctx->get_lid(); + const auto & k_idxs_lid = inp_attn_dsa->get_k_idxs_lid(); + ggml_build_forward_expand(gf, mctx_lid->cpy_k(ctx0, indexer_k, k_idxs_lid, il)); + + // prepare indexer weights + ggml_tensor * indexer_weights = ggml_mul_mat(ctx0, model.layers[il].indexer_proj, cur); + cb(indexer_weights, "indexer_weights", il); + + // get cached indexer keys + indexer_k = mctx_lid->get_k(ctx0, il); + + // split the batch into streams if needed + const auto n_stream = indexer_k->ne[3]; + indexer_q = ggml_view_4d(ctx0, indexer_q, indexer_q->ne[0], indexer_q->ne[1], indexer_q->ne[2]/n_stream, n_stream, indexer_q->nb[1], indexer_q->nb[2], indexer_q->nb[3]/n_stream, 0); + indexer_weights = ggml_view_4d(ctx0, indexer_weights, indexer_weights->ne[0], indexer_weights->ne[1]/n_stream, indexer_weights->ne[2], n_stream, indexer_weights->nb[1], indexer_weights->nb[2]/n_stream, indexer_weights->nb[3]/n_stream, 0); + + // calculate indexer kq + indexer_q = ggml_permute(ctx0, indexer_q, 0, 2, 1, 3); + cb(indexer_q, "indexer_q", il); + indexer_k = ggml_permute(ctx0, indexer_k, 0, 2, 1, 3); + cb(indexer_k, "indexer_k", il); + + ggml_tensor * indexer_kq = ggml_mul_mat(ctx0, indexer_k, indexer_q); + cb(indexer_kq, "indexer_kq", il); + + // ReLU requires contiguous tensors + indexer_kq = ggml_cont(ctx0, ggml_permute(ctx0, indexer_kq, 2, 1, 0, 3)); + cb(indexer_kq, "indexer_kq", il); + + // apply ReLU + ggml_tensor * indexer_score = ggml_relu(ctx0, indexer_kq); + cb(indexer_score, "indexer_score", il); + + // pre-scale weights to avoid scaling operations on huge indexer_score tensor + indexer_weights = ggml_scale(ctx0, indexer_weights, 1.0f / sqrtf(float(n_embd_indexer_head * n_indexer_head))); + cb(indexer_weights, "indexer_weights", il); + + // multiply scores by indexer weights + indexer_score = ggml_mul(ctx0, indexer_score, indexer_weights); + cb(indexer_score, "indexer_score", il); + + // sum by q n_indexer_head dimension + indexer_score = ggml_sum_rows(ctx0, indexer_score); + cb(indexer_score, "indexer_score", il); + + // permute result to match KQ mask + indexer_score = ggml_cont(ctx0, ggml_permute(ctx0, indexer_score, 2, 1, 0, 3)); + cb(indexer_score, "indexer_score", il); + + // mask indexer scores + ggml_tensor * indexer_kq_mask = inp_attn_dsa->get_kq_mask_lid(); + indexer_score = ggml_add(ctx0, indexer_score, indexer_kq_mask); + cb(indexer_score, "indexer_score", il); + + // get indices of top k indexer scores + uint32_t n_top_k = indexer_score->ne[0] < n_indexer_top_k ? indexer_score->ne[0] : n_indexer_top_k; + top_k = ggml_cont(ctx0, ggml_top_k(ctx0, indexer_score, n_top_k)); + cb(top_k, "top_k", il); + } + + ggml_tensor * q = ggml_mul_mat(ctx0, model.layers[il].wq_b, qr); + cb(q, "q", il); + + // split into {n_embd_head_qk_nope, n_head, n_tokens} + ggml_tensor * q_nope = + ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k), + ggml_row_size(q->type, n_embd_head_k) * n_head, 0); + cb(q_nope, "q_nope", il); + + // and {n_embd_head_qk_rope, n_head, n_tokens} + ggml_tensor * q_pe = ggml_view_3d( + ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k), + ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_cmpr_pe, "kv_cmpr_pe", il); + + // split into {kv_lora_rank, n_tokens} + ggml_tensor * kv_cmpr = + ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0); + cb(kv_cmpr, "kv_cmpr", il); + + // and {n_embd_head_qk_rope, 1, n_tokens} + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(q_pe, "q_pe", il); + + k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(k_pe, "k_pe", il); + + kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); + cb(kv_cmpr, "kv_cmpr", il); + + // MLA attention + { + // {n_embd_head_qk_nope, n_tokens, n_head} + q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); + cb(q_nope, "q_nope_perm", il); + + // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head} + ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope); + cb(q_nope_absorbed, "q_nope_absorbed", il); + + // {kv_lora_rank, n_head, n_tokens} + q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); + cb(q_nope_absorbed, "q_nope_absorbed_perm", il); + + // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} + // note: rope must go first for in-place context shifting in build_rope_shift() + ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0); + cb(Qcur, "Qcur", il); + + kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); + cb(kv_cmpr, "kv_cmpr_reshape", il); + + // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} + ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0); + cb(Kcur, "Kcur", il); + + // {kv_lora_rank, 1, n_tokens} + ggml_tensor * Vcur = kv_cmpr; + cb(Vcur, "Vcur", il); + + // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group) + cur = build_attn(inp_attn_dsa, + model.layers[il].wo, NULL, model.layers[il].wo_s, + Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il); + } + } + if (il == effective_n_layers - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s, + model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + ggml_tensor * moe_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il, + nullptr, + model.layers[il].ffn_gate_up_exps, + model.layers[il].ffn_up_exps_s, + model.layers[il].ffn_gate_exps_s, + model.layers[il].ffn_down_exps_s); + cb(moe_out, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * ffn_shexp = + build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s, + model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s, + model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp index 6bc989c9509..4f4c7cac7a8 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -1,6 +1,7 @@ #include "models.h" #include "llama-impl.h" +#include "llama-memory-recurrent.h" // utility to get one slice from the third dimension // input dim: [x, y, c, b] @@ -397,7 +398,9 @@ std::pair llm_build_delta_net_base::build_delta_ne GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs); GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs); - ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s); + // K=1 (final state only): reshape to 3D (S_v*S_v*H_v, 1, n_seqs) for ggml_gated_delta_net. + ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, S_v * S_v * H_v, 1, n_seqs); + ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d); if (n_tokens == 1) { cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il); } else { @@ -443,3 +446,162 @@ std::pair llm_build_delta_net_base::build_delta_ne return build_delta_net_chunking(q, k, v, g, b, s, il); } + +ggml_tensor * llm_build_delta_net_base::build_conv_state( + llm_graph_input_rs * inp, + ggml_tensor * conv_states_all, + ggml_tensor * qkv_mixed, + int64_t conv_kernel_size, + int64_t conv_channels, + int il) { + const auto * mctx_cur = inp->mctx; + + const auto kv_head = mctx_cur->get_head(); + const auto mem_size = mctx_cur->get_size(); + + const int64_t n_seqs = ubatch.n_seqs; + + ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); + cb(conv_states, "conv_states", il); + + conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); + cb(conv_states, "conv_states_reshaped", il); + + qkv_mixed = ggml_transpose(ctx0, qkv_mixed); + cb(qkv_mixed, "qkv_mixed_transposed", il); + + ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); + cb(conv_input, "conv_input", il); + + const int64_t row_count = (conv_kernel_size - 1) * conv_channels; + + const size_t row_size = ggml_row_size(conv_states_all->type, row_count); + + if (cparams.n_rs_seq == 0) { + const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0]; + const int64_t s_slot = 0; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + cb(conv_state_last, "conv_state_last", il); + + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, conv_states_all, + row_count, n_seqs, conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + cb(conv_state_update, "conv_state_update", il); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); + } else { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are + // inside the same ubatch. currently with `split_equal()` this is not correct + + const int64_t K = (int64_t) cparams.n_rs_seq + 1; + + for (int64_t t = 1; t <= K; ++t) { + const int64_t s_idx = std::max(0, conv_input->ne[0] - conv_states->ne[0] - K + t); + const int64_t s_slot = K - t; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, + conv_states_all, row_count, n_seqs, + conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); + } + } + + return conv_input; +} + +ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( + llm_graph_input_rs * inp, + ggml_tensor * ssm_states_all, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * b, + ggml_tensor * s, + int il) { + const auto * mctx_cur = inp->mctx; + const auto kv_head = mctx_cur->get_head(); + const uint32_t mem_size = mctx_cur->get_size(); + + const int64_t S_v = s->ne[0]; + const int64_t H_v = s->ne[2]; + const int64_t n_seqs = s->ne[3]; + const int64_t n_seq_tokens = q->ne[2]; + + const bool keep = cparams.n_rs_seq > 0; + + if (!keep) { + auto attn_out = build_delta_net(q, k, v, g, b, s, il); + ggml_tensor * output = attn_out.first; + ggml_tensor * new_state = attn_out.second; + cb(output, "attn_output", il); + cb(new_state, "new_state", il); + + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, new_state, + ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], + kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + + return output; + } + + const int64_t D = S_v * S_v * H_v; + const int64_t K = cparams.n_rs_seq + 1; + + // TODO: remove pad + simplify + ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); + ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0); + + ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad); + if (n_seq_tokens > 1) { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); + } else { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il); + } + + const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs; + const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs; + + ggml_tensor * output = ggml_view_4d(ctx0, gdn_out, + S_v, H_v, n_seq_tokens, n_seqs, + ggml_row_size(gdn_out->type, S_v), + ggml_row_size(gdn_out->type, S_v * H_v), + ggml_row_size(gdn_out->type, S_v * H_v * n_seq_tokens), + 0); + cb(output, "attn_output", il); + + const size_t row_size = hparams.n_embd_s() * ggml_element_size(ssm_states_all); + for (int64_t k_i = 0; k_i < K; ++k_i) { + const uint32_t cache_slot = (uint32_t) (K - 1 - k_i); + ggml_tensor * src = ggml_view_4d(ctx0, gdn_out, + S_v, S_v, H_v, n_seqs, + ggml_row_size(gdn_out->type, S_v), + ggml_row_size(gdn_out->type, S_v * S_v), + ggml_row_size(gdn_out->type, S_v * S_v * H_v), + ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap)); + + ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all, + hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], + ((size_t) cache_slot * mem_size + kv_head) * row_size); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); + } + + return output; +} diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 93cbcf9d931..435d27281c6 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -183,7 +183,7 @@ llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 60a3f0ec285..12ac6f1ce88 100644 --- a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -128,7 +128,7 @@ llama_model_dream::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/ernie4-5-moe.cpp b/src/models/ernie4-5-moe.cpp index 2bd01a2c512..8d9ff138676 100644 --- a/src/models/ernie4-5-moe.cpp +++ b/src/models/ernie4-5-moe.cpp @@ -124,7 +124,7 @@ llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index fa989fe92cd..9b39c605e35 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -155,7 +155,7 @@ llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index 54bb3ca86b3..76d91982fc5 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -237,7 +237,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index 75d5f60631c..c7e9960d718 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -127,7 +127,7 @@ llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index 5506e76424d..b5030eb0545 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -15,6 +15,9 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; switch (hparams.n_layer) { case 30: type = LLM_TYPE_1_2B; break; @@ -38,21 +41,37 @@ void llama_model_exaone4::load_arch_tensors(llama_model_loader &) { } for (int i = 0; i < n_layer; ++i) { + const bool is_nextn = hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers; + int flags = 0; + if (is_nextn) { + // NextN/MTP layers are preserved in GGUF but are not executed yet. + flags |= TENSOR_SKIP; + } + auto & layer = layers[i]; - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, flags); + + if (!is_nextn) { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + } - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + if (is_nextn) { + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED); + } } } @@ -90,7 +109,11 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra } ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { + // MTP / NextN tail blocks are loaded for compatibility but not executed (same as exaone-moe). + const int n_layer_main = int(n_layer) - int(hparams.nextn_predict_layers); + GGML_ASSERT(n_layer_main > 0); + + for (int il = 0; il < n_layer_main; ++il) { ggml_tensor * inpSA = inpL; // use RoPE for SWA layers or non-SWA models @@ -126,7 +149,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer_main - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -163,7 +186,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index d353befdb8e..94b65a3c7c9 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -200,7 +200,7 @@ llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index 75f2cfef560..ad546ef2db5 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -152,7 +152,7 @@ llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 06731670007..1519682fdf6 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -130,7 +130,7 @@ llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp index 6255bf740fc..ae3f9ffb530 100644 --- a/src/models/gemma2.cpp +++ b/src/models/gemma2.cpp @@ -163,7 +163,7 @@ llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index ee510fe38b0..63a2b380e71 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -207,7 +207,7 @@ llama_model_gemma3::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (hparams.f_final_logit_softcapping) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp index 881499b0ca7..6ec3a006081 100644 --- a/src/models/gemma3n.cpp +++ b/src/models/gemma3n.cpp @@ -296,7 +296,7 @@ llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); { // final logit soft-capping diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index f45ae4cad59..4f9d8b18bc7 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -380,7 +380,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (hparams.f_final_logit_softcapping) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 45886b51ac1..27654b8cba3 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -275,7 +275,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index d6ef76e26d6..7c242fed298 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -185,7 +185,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // Output projection - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index ba49c31b56b..e2dcc8b1521 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -138,7 +138,7 @@ llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 33ebe2d8800..443e35addf2 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -209,7 +209,7 @@ llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 12e4790ae24..27f6706ea10 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -186,7 +186,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // For Granite architectures - scale logits if (hparams.f_logit_scale) { diff --git a/src/models/granite.cpp b/src/models/granite.cpp index 5e7c7b68181..cda4aa231fa 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -145,7 +145,7 @@ llama_model_granite::graph::graph( res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // For Granite architectures - scale logits cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 0bc49d00206..7c46ec1c0f2 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -206,7 +206,7 @@ llama_model_grok::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index feef815165b..1cab75adc7f 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -184,7 +184,7 @@ llama_model_grovemoe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index 44af42412f7..deb3c9671f3 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -179,7 +179,7 @@ llama_model_hunyuan_moe::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/hunyuan-vl.cpp b/src/models/hunyuan-vl.cpp index 5fb9154bec0..da9bb74de7e 100644 --- a/src/models/hunyuan-vl.cpp +++ b/src/models/hunyuan-vl.cpp @@ -181,7 +181,7 @@ llama_model_hunyuan_vl::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index f0c5580a6f4..f9ee37a24b6 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -129,7 +129,7 @@ llama_model_internlm2::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index a6451dca095..2ba162605f1 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -123,7 +123,7 @@ llama_model_jais::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index ad59b953e8d..8966131441c 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -152,7 +152,7 @@ llama_model_jais2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // Output projection - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index e1b8d137e38..84ea63c3136 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -189,7 +189,7 @@ llama_model_jamba::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index df6a8028736..29081344b24 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -262,7 +262,7 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index b60f67f6c4b..9722dde9f17 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -153,7 +153,7 @@ llama_model_llada_moe::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llada.cpp b/src/models/llada.cpp index fa21c5fe32c..58b2c466e17 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -147,7 +147,7 @@ llama_model_llada::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 8ddb5936820..cef66d054b0 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -235,7 +235,7 @@ llama_model_llama::graph::graph(const llama_model & model, const llm_grap if constexpr (!embed) { // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 899611d53f6..0ff5376d571 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -260,7 +260,7 @@ llama_model_llama4::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index 3dbd82fd362..84cfe399027 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -141,7 +141,7 @@ llama_model_maincoder::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp index b7708d7fdd1..887a1fa509a 100644 --- a/src/models/mamba.cpp +++ b/src/models/mamba.cpp @@ -128,7 +128,7 @@ llama_model_mamba::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index 71996616611..d0295ec116f 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -231,7 +231,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index ff5eb6ffa5f..1ffc54fa7c6 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -251,7 +251,7 @@ llama_model_minicpm3::graph::graph(const llama_model & model, const llm_graph_pa cb(cur, "lmhead_scaling", -1); // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index 0dee8934692..22e291d73a3 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -158,7 +158,7 @@ llama_model_minimax_m2::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 708da49af1f..1ac5a95ccdc 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -177,9 +177,9 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa cb(cur, "ffn_norm", il); cur = build_ffn(cur, - model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, - model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, - model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); @@ -200,7 +200,11 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa LLM_FFN_SILU, true, hparams.expert_weights_scale, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - il); + il, + nullptr, nullptr, + model.layers[il].ffn_up_exps_s, + model.layers[il].ffn_gate_exps_s, + model.layers[il].ffn_down_exps_s); cb(cur, "ffn_moe_out", il); } cur = ggml_add(ctx0, cur, ffn_inp); @@ -222,7 +226,7 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/models.h b/src/models/models.h index 6d5f18a8e20..5251e2d8280 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -46,7 +46,7 @@ struct llm_build_delta_net_base : public llm_graph_context { ggml_tensor * s, int il); - // use the ggml_gated_delta_net fused operator + // use the ggml_gated_delta_net fused operator (K=1; state has shape (D, 1, n_seqs)) std::pair build_delta_net_fused( ggml_tensor * q, ggml_tensor * k, @@ -65,6 +65,29 @@ struct llm_build_delta_net_base : public llm_graph_context { ggml_tensor * b, ggml_tensor * s, int il); + + // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token) + // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs) + ggml_tensor * build_conv_state( + llm_graph_input_rs * inp, + ggml_tensor * conv_states_all, + ggml_tensor * qkv_mixed, + int64_t conv_kernel_size, + int64_t conv_channels, + int il); + + // run delta-net attention and write the new recurrent state(s) back to ssm_states_all + // s: (head_v_dim, head_v_dim, num_v_heads, n_seqs); returns output: (head_v_dim, num_v_heads, n_seq_tokens, n_seqs) + ggml_tensor * build_recurrent_attn( + llm_graph_input_rs * inp, + ggml_tensor * ssm_states_all, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * b, + ggml_tensor * s, + int il); }; struct llm_build_rwkv6_base : public llm_graph_context { @@ -163,6 +186,19 @@ struct llama_model_maincoder : public llama_model_base { }; +struct llama_model_talkie : public llama_model_base { + llama_model_talkie(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_deci : public llama_model_base { llama_model_deci(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; @@ -994,6 +1030,19 @@ struct llama_model_deepseek2 : public llama_model_base { }; +struct llama_model_deepseek32 : public llama_model_base { + llama_model_deepseek32(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_deepseek2ocr : public llama_model_base { llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; @@ -1739,6 +1788,10 @@ struct llama_model_qwen35 : public llama_model_base { const llama_model & model; }; + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); + }; + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; @@ -1781,6 +1834,10 @@ struct llama_model_qwen35moe : public llama_model_base { const llama_model & model; }; + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); + }; + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index cfc60e8de29..0229d20ed36 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -161,7 +161,7 @@ llama_model_mpt::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index 865461f61db..a82f9c170b4 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -174,7 +174,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp index 0c72ed297aa..5d4a3b5c69e 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -140,7 +140,7 @@ llama_model_nemotron::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index 161035e72bc..cfcf17bcb03 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -133,7 +133,7 @@ llama_model_olmo::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 9633f269965..7cc262f5504 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -198,7 +198,7 @@ llama_model_olmo2::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index 4bb9013054c..7976ae44a51 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -164,7 +164,7 @@ llama_model_olmoe::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp index 13a590ce646..15b6c8c1205 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -160,7 +160,7 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index b4128e116e7..9f76350fd4d 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -162,7 +162,7 @@ llama_model_openelm::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/orion.cpp b/src/models/orion.cpp index 7ace0a5139d..bcb4bbba4b1 100644 --- a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -132,7 +132,7 @@ llama_model_orion::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/paddleocr.cpp b/src/models/paddleocr.cpp index 1c0eadefa98..d39220bd778 100644 --- a/src/models/paddleocr.cpp +++ b/src/models/paddleocr.cpp @@ -98,7 +98,7 @@ llama_model_paddleocr::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp index 41b7e2ac23e..7593f879b24 100644 --- a/src/models/pangu-embed.cpp +++ b/src/models/pangu-embed.cpp @@ -148,7 +148,7 @@ llama_model_pangu_embed::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index a333602c72d..8f3ed5f7b7d 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -130,7 +130,7 @@ llama_model_phi2::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index 0a65e91fefa..f8a4a4d5aa5 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -179,7 +179,7 @@ llama_model_phi3::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index 4c16c20a0d4..c7ed1211c31 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -127,7 +127,7 @@ llama_model_plamo::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index 29c8702606a..b713889fe72 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -185,7 +185,7 @@ llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); // Explicitly mark as output tensor to ensure proper backend assignment diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 849f1579e63..29f3e803d68 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -186,7 +186,7 @@ llama_model_plamo3::graph::graph(const llama_model & model, const llm_grap cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); res->t_logits = cur; ggml_build_forward_expand(gf, cur); diff --git a/src/models/plm.cpp b/src/models/plm.cpp index 57f5995103b..ce050919e6a 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -204,7 +204,7 @@ llama_model_plm::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index cdc076cdf77..00467dbad7d 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -131,7 +131,7 @@ llama_model_qwen::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index 6320458a13b..a5147460bae 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -141,7 +141,7 @@ llama_model_qwen2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 7587c802c68..7cb03859deb 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -184,7 +184,7 @@ llama_model_qwen2moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen2vl.cpp b/src/models/qwen2vl.cpp index 1a40fa89be4..d79db682cd4 100644 --- a/src/models/qwen2vl.cpp +++ b/src/models/qwen2vl.cpp @@ -134,7 +134,7 @@ llama_model_qwen2vl::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index fa656c84ea0..41b97fed956 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -147,7 +147,7 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index f276be61ba8..ba63ae441df 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -12,16 +12,22 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - // Mark recurrent layers (linear attention layers) + // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + + // Mark recurrent layers (linear attention layers). MTP layers are dense + // attention-only and must be flagged non-recurrent. { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0); + hparams.recurrent_layer_arr[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer) { + switch (hparams.n_layer - hparams.nextn_predict_layers) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break; case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break; case 64: type = LLM_TYPE_27B; break; @@ -29,9 +35,14 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { } } -void llama_model_qwen35::load_arch_tensors(llama_model_loader &) { +void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; + const uint32_t n_main = n_layer - hparams.nextn_predict_layers; + const bool mtp_only = (hparams.nextn_predict_layers > 0) && + (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); // output @@ -43,50 +54,85 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; + auto load_block_trunk = [&](int il, int flags) { + auto & layer = layers[il]; - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recurrent(il)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); // Q/K normalization for attention layers - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, flags); + }; + + auto load_block_mtp = [&](int il) { + auto & layer = layers[il]; + + // MTP block looks like a full-attention Qwen3.5 decoder block. + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0); + + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, 0); + + // NextN-specific tensors that define the MTP block. + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < (int) n_main; ++i) { + load_block_trunk(i, trunk_flags); + } + for (int i = (int) n_main; i < n_layer; ++i) { + load_block_mtp(i); } } std::unique_ptr llama_model_qwen35::build_arch_graph(const llm_graph_params & params) const { + if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { + return std::make_unique(*this, params); + } return std::make_unique(*this, params); } @@ -111,7 +157,9 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { + // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. + const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; + for (int il = 0; il < n_transformer_layers; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -128,7 +176,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -160,6 +208,13 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para } cur = inpL; + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + if (!cparams.embeddings_pre_norm_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + // Final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); @@ -167,7 +222,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; @@ -297,8 +352,6 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear( const int64_t head_v_dim = d_inner / num_v_heads; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const auto kv_head = mctx_cur->get_head(); - GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); @@ -328,41 +381,14 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear( gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); - // Get convolution states from cache ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - // Build the convolution states tensor - ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); - cb(conv_states, "conv_states", il); - - // Calculate convolution kernel size ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; const int64_t conv_kernel_size = conv_kernel->ne[0]; const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; - conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); - cb(conv_states, "conv_states_reshaped", il); - - qkv_mixed = ggml_transpose(ctx0, qkv_mixed); - cb(qkv_mixed, "qkv_mixed_transposed", il); - - ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); - cb(conv_input, "conv_input", il); - - // Update convolution state cache - // Extract the last (conv_kernel_size - 1) states from conv_input - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); - - ggml_tensor * state_update_target = - ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1], - kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); @@ -413,7 +439,7 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear( //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); // if head keys and value keys are different, repeat to force tensors into matching shapes - // note: need explicit repeat only if we are not using the fused GDN + // note: need explicit repeat only if we are not using the fused GDN. if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) { GGML_ASSERT(num_v_heads % num_k_heads == 0); q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); @@ -424,18 +450,7 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il); - - ggml_tensor * output = attn_out.first; - ggml_tensor * new_state = attn_out.second; - cb(output, "attn_output", il); - cb(new_state, "new_state", il); - - // Update the recurrent states - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, new_state, - ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], - kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, state, il); // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); @@ -471,3 +486,164 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons return cur; } + +// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series +llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0"); + GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block"); + + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + + // hparams.n_layer includes both main model layers and MTP layers. The MTP + // layer is stored immediately after the main layers in model.layers[]. + const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const auto & layer = model.layers[il]; + + GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); + GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); + GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); + ggml_set_input(inp->embd); + + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } + cb(tok_embd, "mtp_tok_embd", il); + + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpSA = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_norm", il); + + ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); + cb(Qcur_full, "mtp_Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + 0); + Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "mtp_Qcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "mtp_gate", il); + + ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "mtp_Kcur_normed", il); + + ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + cb(Vcur, "mtp_Vcur", il); + + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float kq_scale = hparams.f_attention_scale == 0.0f + ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp_attn, + nullptr, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "mtp_attn_pregate", il); + + cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); + cur = build_lora_mm(layer.wo, cur, layer.wo_s); + cb(cur, "mtp_attn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "mtp_attn_residual", il); + + ggml_tensor * ffn_residual = cur; + cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_post_norm", il); + + cur = build_ffn(cur, + layer.ffn_up, nullptr, layer.ffn_up_s, + layer.ffn_gate, nullptr, layer.ffn_gate_s, + layer.ffn_down, nullptr, layer.ffn_down_s, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "mtp_ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "mtp_post_ffn", il); + + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. + // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.) + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; + GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur, head_s); + cb(cur, "result_output", -1); + + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index cf05dc9d61c..4f87d55d911 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -15,16 +15,22 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - // Mark recurrent layers (linear attention layers) + // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + + // Mark recurrent layers (linear attention layers). MTP layers are dense + // attention-only and must be flagged non-recurrent. { + const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0); + hparams.recurrent_layer_arr[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer) { + switch (hparams.n_layer - hparams.nextn_predict_layers) { case 40: type = LLM_TYPE_35B_A3B; break; case 48: type = LLM_TYPE_122B_A10B; break; case 60: type = LLM_TYPE_397B_A17B; break; @@ -32,9 +38,14 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { } } -void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) { +void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; + const uint32_t n_main = n_layer - hparams.nextn_predict_layers; + const bool mtp_only = (hparams.nextn_predict_layers > 0) && + (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); // output @@ -46,60 +57,105 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); } - const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + auto load_block_trunk = [&](int il, int flags) { + auto & layer = layers[il]; - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recurrent(il)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); // Q/K normalization for attention layers - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags); } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags); + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags); } - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); + // Routed experts + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, flags); + create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, flags); // Shared experts + layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, flags); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, flags); + }; + + auto load_block_mtp = [&](int il) { + auto & layer = layers[il]; + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; - layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); - layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0); - layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0); - layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0); + // MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN. + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0); + + create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0); + + // Routed experts + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, 0); + create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, 0); + + // Shared experts + layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, 0); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, 0); + + // NextN-specific tensors that define the MTP block. + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < (int) n_main; ++i) { + load_block_trunk(i, trunk_flags); + } + for (int i = (int) n_main; i < n_layer; ++i) { + load_block_mtp(i); } } std::unique_ptr llama_model_qwen35moe::build_arch_graph(const llm_graph_params & params) const { + if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { + return std::make_unique(*this, params); + } return std::make_unique(*this, params); } @@ -124,7 +180,9 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { + // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. + const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; + for (int il = 0; il < n_transformer_layers; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -141,7 +199,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -173,6 +231,13 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p } cur = inpL; + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + if (!cparams.embeddings_pre_norm_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + // Final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); @@ -180,7 +245,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; @@ -310,8 +375,6 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear( const int64_t head_v_dim = d_inner / num_v_heads; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const auto kv_head = mctx_cur->get_head(); - GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); @@ -341,41 +404,14 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear( gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); - // Get convolution states from cache ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - // Build the convolution states tensor - ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); - cb(conv_states, "conv_states", il); - - // Calculate convolution kernel size ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; const int64_t conv_kernel_size = conv_kernel->ne[0]; const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; - conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); - cb(conv_states, "conv_states_reshaped", il); - - qkv_mixed = ggml_transpose(ctx0, qkv_mixed); - cb(qkv_mixed, "qkv_mixed_transposed", il); - - ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); - cb(conv_input, "conv_input", il); - - // Update convolution state cache - // Extract the last (conv_kernel_size - 1) states from conv_input - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); - - ggml_tensor * state_update_target = - ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1], - kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); @@ -426,7 +462,7 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear( //v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); // if head keys and value keys are different, repeat to force tensors into matching shapes - // note: need explicit repeat only if we are not using the fused GDN + // note: need explicit repeat only if we are not using the fused GDN. if (num_k_heads != num_v_heads && (!cparams.fused_gdn_ar || !cparams.fused_gdn_ch)) { GGML_ASSERT(num_v_heads % num_k_heads == 0); q_conv = ggml_repeat_4d(ctx0, q_conv, head_k_dim, num_v_heads, n_seq_tokens, n_seqs); @@ -437,18 +473,7 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il); - - ggml_tensor * output = attn_out.first; - ggml_tensor * new_state = attn_out.second; - cb(output, "attn_output", il); - cb(new_state, "new_state", il); - - // Update the recurrent states - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, new_state, - ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], - kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, state, il); // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); @@ -525,3 +550,195 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c return cur; } + +// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE +llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0"); + GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); + + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + + const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const auto & layer = model.layers[il]; + + GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); + GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); + GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); + GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp"); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); + ggml_set_input(inp->embd); + + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } + cb(tok_embd, "mtp_tok_embd", il); + + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpSA = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_norm", il); + + ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s); + cb(Qcur_full, "mtp_Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + 0); + Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "mtp_Qcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, + n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "mtp_gate", il); + + ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "mtp_Kcur_normed", il); + + ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + cb(Vcur, "mtp_Vcur", il); + + Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float kq_scale = hparams.f_attention_scale == 0.0f + ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp_attn, + nullptr, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "mtp_attn_pregate", il); + + cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)); + cur = build_lora_mm(layer.wo, cur, layer.wo_s); + cb(cur, "mtp_attn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "mtp_attn_residual", il); + + ggml_tensor * ffn_residual = cur; + cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_post_norm", il); + + // MoE FFN โ€” routed experts plus gated shared expert (mirrors qwen35moe). + ggml_tensor * moe_out = + build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + hparams.expert_weights_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, layer.ffn_gate_up_exps, + layer.ffn_up_exps_s, + layer.ffn_gate_exps_s, + layer.ffn_down_exps_s); + cb(moe_out, "mtp_ffn_moe_out", il); + + if (layer.ffn_up_shexp != nullptr) { + ggml_tensor * ffn_shexp = + build_ffn(cur, + layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, + layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, + layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "mtp_ffn_shexp", il); + + ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur); + shared_gate = ggml_sigmoid(ctx0, shared_gate); + cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il); + + ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); + cb(ffn_shexp, "mtp_ffn_shexp_gated", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + } else { + cur = moe_out; + } + cb(cur, "mtp_ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "mtp_post_ffn", il); + + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. + cb(cur, "h_pre_norm", -1); + res->t_h_pre_norm = cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; + GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur, head_s); + cb(cur, "result_output", -1); + + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 4440b83aa45..a4f8e1379c9 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -168,7 +168,7 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index cb1b4814caf..1d873427db5 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -176,7 +176,7 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; @@ -378,8 +378,6 @@ ggml_tensor * llama_model_qwen3next::graph::build_layer_attn_linear( const int64_t head_v_dim = d_inner / num_v_heads; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const auto kv_head = mctx_cur->get_head(); - GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); @@ -429,41 +427,14 @@ ggml_tensor * llama_model_qwen3next::graph::build_layer_attn_linear( beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); - // Get convolution states from cache ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - // Build the convolution states tensor - ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); - cb(conv_states, "conv_states", il); - - // Calculate convolution kernel size ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; const int64_t conv_kernel_size = conv_kernel->ne[0]; const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; - conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); - cb(conv_states, "conv_states_reshaped", il); - - qkv_mixed = ggml_transpose(ctx0, qkv_mixed); - cb(qkv_mixed, "qkv_mixed_transposed", il); - - ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); - cb(conv_input, "conv_input", il); - - // Update convolution state cache - // Extract the last (conv_kernel_size - 1) states from conv_input - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); - - ggml_tensor * state_update_target = - ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1], - kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + ggml_tensor * conv_input = build_conv_state(inp, conv_states_all, qkv_mixed, conv_kernel_size, conv_channels, il); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); @@ -540,18 +511,7 @@ ggml_tensor * llama_model_qwen3next::graph::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - auto attn_out = build_delta_net(q_conv, k_conv, v_conv, gate, beta, state, il); - - ggml_tensor * output = attn_out.first; - ggml_tensor * new_state = attn_out.second; - cb(output, "attn_output", il); - cb(new_state, "new_state", il); - - // Update the recurrent states - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, new_state, - ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], - kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + ggml_tensor * output = build_recurrent_attn(inp, ssm_states_all, q_conv, k_conv, v_conv, gate, beta, state, il); // z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim] ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 7871f8f7952..5defd893944 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -163,7 +163,7 @@ llama_model_qwen3vl::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp index b99143c8908..5b77df57122 100644 --- a/src/models/qwen3vlmoe.cpp +++ b/src/models/qwen3vlmoe.cpp @@ -180,7 +180,7 @@ llama_model_qwen3vlmoe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index f14f10917ff..bf3949a9092 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -150,7 +150,7 @@ llama_model_refact::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index 325ee73ba5c..ca8e009615e 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -167,7 +167,7 @@ llama_model_rnd1::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp index 2944711acec..ba2a9dfa0db 100644 --- a/src/models/rwkv6.cpp +++ b/src/models/rwkv6.cpp @@ -176,7 +176,7 @@ llama_model_rwkv6::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp index 6f7d1f5722f..566b8cdcb54 100644 --- a/src/models/rwkv6qwen2.cpp +++ b/src/models/rwkv6qwen2.cpp @@ -158,7 +158,7 @@ llama_model_rwkv6qwen2::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp index b205e3935e1..7574b252621 100644 --- a/src/models/rwkv7.cpp +++ b/src/models/rwkv7.cpp @@ -202,7 +202,7 @@ llama_model_rwkv7::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 83e114740b6..806cba574be 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -141,7 +141,7 @@ llama_model_seed_oss::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 3214e7cbad3..4231cccc666 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -178,7 +178,7 @@ llama_model_smallthinker::graph::graph(const llama_model & model, const ll res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 7adaf34c534..90e7d473eaf 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -143,7 +143,7 @@ llama_model_smollm3::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index 8f613e55947..4da7f7aefcf 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -163,7 +163,7 @@ llama_model_stablelm::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index 58cf0ac0edc..e131af058bc 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -135,7 +135,7 @@ llama_model_starcoder::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index 45dae0602d4..9c207c02885 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -148,7 +148,7 @@ llama_model_starcoder2::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index c4789752d21..3b68e68707a 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -261,7 +261,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/t5.cpp b/src/models/t5.cpp index 27a0711ba41..73e32741406 100644 --- a/src/models/t5.cpp +++ b/src/models/t5.cpp @@ -265,7 +265,7 @@ llama_model_t5::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp new file mode 100644 index 00000000000..1258eeb19b6 --- /dev/null +++ b/src/models/talkie.cpp @@ -0,0 +1,149 @@ +#include "models.h" + +void llama_model_talkie::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + + switch (hparams.n_layer) { + case 40: type = LLM_TYPE_13B; break; + default: type = LLM_TYPE_UNKNOWN; + } +} + +void llama_model_talkie::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + // no k gain + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {1, n_head}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1}, 0); + } +} + +std::unique_ptr llama_model_talkie::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_talkie::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_k(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_v()); + GGML_ASSERT(n_embd_head == n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + inpL = build_norm(inpL, nullptr, nullptr, LLM_NORM_RMS, -1); + cb(inpL, "inp_norm", -1); + + ggml_tensor * embd_skip = inpL; + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + ggml_tensor * inp_skip = embd_skip; + + cur = build_norm(inpL, nullptr, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, + n_embd_head, n_head, n_head_kv, il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + // reference applies qknorm after rope + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_norm", il); + + Kcur = build_norm(Kcur, nullptr, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_norm", il); + + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, nullptr, model.layers[il].wo_s, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + inp_skip = ggml_get_rows(ctx0, inp_skip, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, nullptr, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, model.layers[il].ffn_down_s, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + ggml_tensor * skip = ggml_mul(ctx0, inp_skip, model.layers[il].out_scale); + cb(skip, "embd_skip", il); + + cur = ggml_add(ctx0, cur, skip); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, nullptr, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); + cb(cur, "result_output", -1); + + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/wavtokenizer-dec.cpp b/src/models/wavtokenizer-dec.cpp index a873e5d2e8f..214fed99bad 100644 --- a/src/models/wavtokenizer-dec.cpp +++ b/src/models/wavtokenizer-dec.cpp @@ -253,7 +253,7 @@ llama_model_wavtokenizer_dec::graph::graph(const llama_model & model, const llm_ LLM_NORM, -1); // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index e4d111e622a..d6d1c7a2e5d 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -126,7 +126,7 @@ llama_model_xverse::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3ee535224d9..33ae3b303cf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -252,6 +252,13 @@ llama_build_and_test(test-backend-sampler.cpp LABEL "model") llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -m "${MODEL_DEST}") set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED test-download-model) +llama_build_and_test(test-recurrent-state-rollback.cpp LABEL "model" ARGS -m "${MODEL_DEST}") +set_tests_properties(test-recurrent-state-rollback PROPERTIES FIXTURES_REQUIRED test-download-model) + +# Test state save/load functionality +llama_build_and_test(test-save-load-state.cpp LABEL "model" ARGS -m "${MODEL_DEST}") +set_tests_properties(test-save-load-state PROPERTIES FIXTURES_REQUIRED test-download-model) + if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading llama_build_and_test(test-barrier.cpp) diff --git a/tests/gguf-model-data.cpp b/tests/gguf-model-data.cpp index d277173c466..fe8b4ca76e7 100644 --- a/tests/gguf-model-data.cpp +++ b/tests/gguf-model-data.cpp @@ -630,10 +630,11 @@ std::optional gguf_fetch_model_meta( } for (int i = 2; i <= model.n_split; i++) { - char num_buf[6], total_buf[6]; - snprintf(num_buf, sizeof(num_buf), "%05d", i); - snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split); - std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf"; + char buf_num[32]; + char buf_tot[32]; + snprintf(buf_num, sizeof(buf_num), "%05d", i); + snprintf(buf_tot, sizeof(buf_tot), "%05d", (int)model.n_split); + std::string shard_name = split_prefix + "-" + buf_num + "-of-" + buf_tot + ".gguf"; auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose); if (!shard.has_value()) { @@ -704,10 +705,11 @@ gguf_context_ptr gguf_fetch_gguf_ctx( } for (int i = 2; i <= model.n_split; i++) { - char num_buf[6], total_buf[6]; - snprintf(num_buf, sizeof(num_buf), "%05d", i); - snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split); - std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf"; + char buf_num[32]; + char buf_tot[32]; + snprintf(buf_num, sizeof(buf_num), "%05d", i); + snprintf(buf_tot, sizeof(buf_tot), "%05d", (int)model.n_split); + std::string shard_name = split_prefix + "-" + buf_num + "-of-" + buf_tot + ".gguf"; auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose); if (!shard.has_value()) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index c60f6d2c2b0..58c5fdd10db 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -55,33 +57,24 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m { // parallel initialization static const size_t n_threads = N_THREADS; - // static RNG initialization (revisit if n_threads stops being constant) - static std::vector generators = []() { - std::random_device rd; - std::vector vec; - vec.reserve(n_threads); - //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed - for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } - return vec; - }(); - - auto init_thread = [&](size_t ith, size_t start, size_t end) { + + auto init_thread = [&](size_t start, size_t end) { + thread_local std::default_random_engine gen(std::random_device{}()); std::uniform_real_distribution distribution(min, max); - auto & gen = generators[ith]; for (size_t i = start; i < end; i++) { data[i] = distribution(gen); } }; if (n_threads == 1) { - init_thread(0, 0, nels); + init_thread(0, nels); } else { std::vector> tasks; tasks.reserve(n_threads); for (size_t i = 0; i < n_threads; i++) { size_t start = i*nels/n_threads; size_t end = (i+1)*nels/n_threads; - tasks.push_back(std::async(std::launch::async, init_thread, i, start, end)); + tasks.push_back(std::async(std::launch::async, init_thread, start, end)); } for (auto & t : tasks) { t.get(); @@ -516,6 +509,25 @@ static bool output_format_from_str(const std::string & s, output_formats & forma return true; } +static std::string test_time_now() { + time_t t = time(NULL); + struct tm tm_buf; +#ifdef _WIN32 + if (gmtime_s(&tm_buf, &t) != 0) { + return ""; + } +#else + if (gmtime_r(&t, &tm_buf) == nullptr) { + return ""; + } +#endif + char buf[32]; + if (std::strftime(buf, sizeof(buf), "%FT%TZ", &tm_buf) == 0) { + return ""; + } + return buf; +} + // Test result structure for SQL output struct test_result { std::string test_time; @@ -545,11 +557,7 @@ struct test_result { supported = false; passed = false; - // Set test time - time_t t = time(NULL); - char buf[32]; - std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); - test_time = buf; + test_time = test_time_now(); // Set build info build_commit = ggml_commit(); @@ -573,11 +581,7 @@ struct test_result { n_runs(n_runs), device_description(device_description), backend_reg_name(backend_reg_name) { - // Set test time - time_t t = time(NULL); - char buf[32]; - std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); - test_time = buf; + test_time = test_time_now(); // Set build info build_commit = ggml_commit(); @@ -1110,6 +1114,17 @@ static std::unique_ptr create_printer(output_formats format) { GGML_ABORT("invalid output format"); } +static std::mutex g_test_output_mutex; + +static void print_test_result_locked(printer * output_printer, const test_result & result) { + if (output_printer == nullptr) { + return; + } + + std::lock_guard guard(g_test_output_mutex); + output_printer->print_test_result(result); +} + struct test_case { virtual ~test_case() {} @@ -1338,9 +1353,7 @@ struct test_case { test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", false, false, "not supported"); - if (output_printer) { - output_printer->print_test_result(result); - } + print_test_result_locked(output_printer, result); ggml_free(ctx); return test_status_t::NOT_SUPPORTED; @@ -1462,9 +1475,7 @@ struct test_case { test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", supported, test_passed, error_msg); - if (output_printer) { - output_printer->print_test_result(result); - } + print_test_result_locked(output_printer, result); return test_passed ? test_status_t::OK : test_status_t::FAIL; } @@ -2404,6 +2415,15 @@ struct test_set_rows : public test_case { } return 1e-7; } + + // See dicussion here: https://github.com/ggml-org/llama.cpp/pull/23760#issuecomment-4566312209 + double max_nmse_err(ggml_backend_t backend) override { + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend)); + if (type == GGML_TYPE_Q8_0 && strcmp(ggml_backend_reg_name(reg), "WebGPU") == 0) { + return std::max(test_case::max_nmse_err(backend), 2e-7); + } + return test_case::max_nmse_err(backend); + } }; // GGML_OP_ROPE + GGML_OP_VIEW + GGML_OP_SET_ROWS @@ -2866,15 +2886,24 @@ struct test_set : public test_case { struct test_cpy : public test_case { const ggml_type type_src; const ggml_type type_dst; - const std::array ne; + const std::array ne_src; + const std::array ne_dst; const std::array permute_src; const std::array permute_dst; bool _src_use_permute; bool _dst_use_permute; bool _src_transpose; + bool _use_dst_shape; std::string vars() override { - return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose); + if (_use_dst_shape) { + return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose); + } + return VARS_TO_STR6(type_src, type_dst, ne_src, permute_src, permute_dst, _src_transpose); + } + + int64_t total_elements() const { + return ne_src[0] * ne_src[1] * ne_src[2] * ne_src[3]; } double max_nmse_err() override { @@ -2899,7 +2928,7 @@ struct test_cpy : public test_case { err_estimate /= 8.0f; } err_estimate *= err_estimate; - err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]); + err_estimate /= (150.0f*150.0f*0.25f)*float(total_elements()); return err_estimate; } return 1e-6; @@ -2910,17 +2939,19 @@ struct test_cpy : public test_case { } test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}, + std::array ne_src = {10, 10, 10, 1}, + std::array ne_dst = {-1, -1, -1, -1}, std::array permute_src = {0, 0, 0, 0}, std::array permute_dst = {0, 0, 0, 0}, bool transpose_src = false) - : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), + : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst), _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), - _src_transpose(transpose_src){} + _src_transpose(transpose_src), + _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data()); ggml_set_param(src); ggml_set_name(src, "src"); @@ -2934,7 +2965,8 @@ struct test_cpy : public test_case { ggml_set_name(src, "src_transposed"); } - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); + std::array dst_ne = _use_dst_shape ? ne_dst : std::array{src->ne[0], src->ne[1], src->ne[2], src->ne[3]}; + ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); ggml_set_name(dst, "dst"); if (_dst_use_permute) { @@ -3832,16 +3864,17 @@ struct test_gated_delta_net : public test_case { const int v_repeat; const bool permuted; const bool kda; + const int64_t K; // snapshot slot count: 1 = final-only, >1 = last K states std::string vars() override { - return VARS_TO_STR8(type, head_count, head_size, n_seq_tokens, n_seqs, v_repeat, permuted, kda); + return VARS_TO_STR9(type, head_count, head_size, n_seq_tokens, n_seqs, v_repeat, permuted, kda, K); } test_gated_delta_net(ggml_type type = GGML_TYPE_F32, int64_t head_count = 4, int64_t head_size = 16, int64_t n_seq_tokens = 1, int64_t n_seqs = 1, - int v_repeat = 1, bool permuted = false, bool kda = false) + int v_repeat = 1, bool permuted = false, bool kda = false, int64_t K = 1) : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs), - v_repeat(v_repeat), permuted(permuted), kda(kda) {} + v_repeat(v_repeat), permuted(permuted), kda(kda), K(K) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * q; @@ -3863,7 +3896,7 @@ struct test_gated_delta_net : public test_case { const int64_t g_ne0 = kda ? head_size : 1; ggml_tensor * g = ggml_new_tensor_4d(ctx, type, g_ne0, head_count * v_repeat, n_seq_tokens, n_seqs); ggml_tensor * beta = ggml_new_tensor_4d(ctx, type, 1, head_count * v_repeat, n_seq_tokens, n_seqs); - ggml_tensor * state = ggml_new_tensor_2d(ctx, type, head_size * v_repeat * head_size * head_count, n_seqs); + ggml_tensor * state = ggml_new_tensor_3d(ctx, type, head_size * v_repeat * head_size * head_count, K, n_seqs); ggml_set_name(g, "g"); ggml_set_name(beta, "beta"); ggml_set_name(state, "state"); @@ -4850,6 +4883,21 @@ struct test_rope : public test_case { a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); ggml_set_name(a, "view_of_a"); + } else if (v == 2) { + // second-half slice along dim 0 (mimics build_rope_2d in clip.cpp). + // The non-zero view offset (ne_a[0] * elem_size) often produces a + // non-aligned buffer offset, which exercises backends' alignment paths. + auto ne = ne_a; ne[0] *= 2; + a = ggml_new_tensor(ctx, type, 4, ne.data()); + if (forward) { + ggml_set_param(a); + } + ggml_set_name(a, "a"); + + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], + a->nb[1], a->nb[2], a->nb[3], + ne_a[0] * ggml_element_size(a)); + ggml_set_name(a, "view_of_a"); } else { a = ggml_new_tensor(ctx, type, 4, ne_a.data()); if (forward) { @@ -4912,8 +4960,6 @@ struct test_rope : public test_case { } else { out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); } - - // TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp } ggml_set_name(out, "out"); @@ -7766,6 +7812,8 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {5, 5, 1, 32}, {3, 4, 1, 32}, 1, 1, 0, 0, 1, 1, true)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 1536, 729}, {2, 2, 1536, 4096}, 1, 1, 0, 0, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 1, 2}, {32, 33, 1, 2}, 1, 1, 1, 1, 1, 1, true)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {128, 128, 2, 1}, {33, 34, 2, 1}, 1, 1, 1, 1, 1, 1, true)); // im2col 3D test_cases.emplace_back(new test_im2col_3d(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32)); @@ -8026,42 +8074,72 @@ static std::vector> make_test_cases_eval() { for (int k = 1; k < 4; ++k) { test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 3, 1, 2}, {0, 2, 1, 3})); } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : all_types) { for (ggml_type type_dst : {GGML_TYPE_F32}) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); // cpy not-contiguous } } test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + + // CPY - different src/dst shapes (reshaping via CPY) + // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360. + // Each src permutation is tested against canonical sorted and reverse dst (skip self). + { + std::array dims = {3, 5, 7, 32}; + std::sort(dims.begin(), dims.end()); + std::array canonical = dims; + std::array reversed = {32, 7, 5, 3}; + for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + std::array cur = dims; + do { + if (cur != canonical) { + test_cases.emplace_back(new test_cpy(type, type, cur, canonical)); + } + if (cur != reversed) { + test_cases.emplace_back(new test_cpy(type, type, cur, reversed)); + } + if (cur[0] == 32 && type == GGML_TYPE_F32) { + if (canonical[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, canonical)); + } + if (reversed[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, reversed)); + } + } + std::next_permutation(cur.begin(), cur.end()); + } while (cur != canonical); + } + } for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) { for (bool use_view_slice : { true, false }) { @@ -8251,7 +8329,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 128, 1, 128)); test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 64, 1, 64)); test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 256, 1, 256)); + test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 512, 1, 512)); test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 128, 32, 128)); + test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 128, 4, 128, {2, 3})); #if 0 // > 4GB A matrix. Too slow to be enabled by default. @@ -8337,6 +8417,18 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {1536, 1}, {1, 1})); } } + + // BF16 is absent from base_types: add the 3 standard non-contig permutations explicitly + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + for (ggml_type type_a : other_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { if (ggml_blck_size(type_a) != 256) { @@ -8674,6 +8766,13 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B) } + + // build_rope_2d-style: ROPE on a non-contiguous view + // that starts at a non-zero offset along dim 0 + // (e.g. gemma4v vision second-half view). + for (int rmode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION }) { + test_cases.emplace_back(new test_rope(type, { 36, 16, 2457, 1}, 36, rmode, 512, fs, ef, af, ff, 2, fw)); + } } all = false; @@ -8797,9 +8896,24 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3)); + test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular test_cases.emplace_back(new test_pad_ext()); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false)); + test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); test_cases.emplace_back(new test_roll()); @@ -9022,6 +9136,18 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 33, 1, 1, false, true)); test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 100, 1, 1, false, true)); + // K > 1: output keeps the last min(n_tokens, K) per-token snapshots in the trailing K-token region. + // exact-match cases (K == n_seq_tokens): + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 16, 2, 1, 1, false, false, /*K=*/2)); + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 32, 4, 1, 1, false, false, /*K=*/4)); + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 4, 2, 1, false, false, /*K=*/4)); + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 8, 128, 4, 1, 1, false, false, /*K=*/4)); + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 4, 2, 1, false, true, /*K=*/4)); + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 8, 32, 4, 2, 2, false, true, /*K=*/4)); + // overflow: n_tokens > K โ€” only the last K snapshots kept. + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 32, 8, 1, 1, false, false, /*K=*/3)); + test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 16, 2, 1, false, false, /*K=*/4)); + #if 0 // these tests are disabled to save execution time, sbut they can be handy for debugging test_cases.emplace_back(new test_llama(2, true)); @@ -9087,22 +9213,21 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); @@ -9292,6 +9417,7 @@ static std::vector> make_test_cases_perf() { // Examples from granite-4.0-h-1b/ggml-model-Q8_0.gguf test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1})); // prefill + test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {937, 8192, 1, 1}, {4, 8192, 1, 1})); // prefill test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 3328, 1, 1}, {4, 3328, 1, 1})); // generate test_cases.emplace_back(new test_ssm_conv_bias_silu(GGML_TYPE_F32, {515, 3328, 1, 1}, {4, 3328, 1, 1}, true)); // prefill test_cases.emplace_back(new test_ssm_conv_bias_silu(GGML_TYPE_F32, {4, 3328, 1, 1}, {4, 3328, 1, 1}, true)); // generate @@ -9390,8 +9516,8 @@ static std::vector> make_test_cases_from_file(const c return test_cases; } -static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter, - printer * output_printer, const char * test_file_path) { +static bool test_backend(ggml_backend_t backend, ggml_backend_dev_t dev, test_mode mode, const char * op_names_filter, const char * params_filter, + printer * output_printer, const char * test_file_path, int parallel_workers) { auto filter_test_cases = [](std::vector> & test_cases, const char * params_filter) { if (params_filter == nullptr) { return; @@ -9444,21 +9570,90 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op set_use_ref(backend_cpu, true); } - size_t n_ok = 0; - size_t tests_run = 0; + std::atomic n_ok = 0; + std::atomic tests_run = 0; std::vector failed_tests; - for (auto & test : test_cases) { - test_status_t status = test->eval(backend, backend_cpu, op_names_filter, output_printer); - if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) { - continue; + std::mutex failed_tests_mutex; + + // Each worker grabs a chunk of cases at a time. The chunk shrinks as we + // run out of work so that a few slow tests at the tail get spread across + // workers instead of landing on one unlucky thread. + constexpr size_t MAX_TESTS_PER_ITER = 100; + std::atomic test_idx = 0; + + const auto & next_chunk = [&](size_t & my_begin, size_t & my_end) { + const size_t cur = test_idx.load(std::memory_order_relaxed); + const size_t remaining = cur < test_cases.size() ? test_cases.size() - cur : 0; + const size_t chunk = std::max(1, std::min(MAX_TESTS_PER_ITER, remaining / parallel_workers)); + my_begin = test_idx.fetch_add(chunk); + my_end = std::min(my_begin + chunk, test_cases.size()); + }; + + const auto & run_tests = [&](ggml_backend_t b, ggml_backend_t b_cpu) { + size_t my_begin, my_end; + next_chunk(my_begin, my_end); + while (my_begin < test_cases.size()) { + for (size_t i = my_begin; i < my_end; ++i) { + auto & test = test_cases[i]; + test_status_t status = test->eval(b, b_cpu, op_names_filter, output_printer); + if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) { + continue; + } + tests_run++; + if (status == test_status_t::OK) { + n_ok++; + } else if (status == test_status_t::FAIL) { + std::lock_guard guard(failed_tests_mutex); + failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")"); + } + } + next_chunk(my_begin, my_end); } - tests_run++; - if (status == test_status_t::OK) { - n_ok++; - } else if (status == test_status_t::FAIL) { - failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")"); + }; + + if (parallel_workers <= 1) { + // Reuse the outer backend / backend_cpu so we don't pay an + // extra CPU backend init. + run_tests(backend, backend_cpu); + } else { + std::atomic workers_started = 0; + + const auto & eval_worker = [&]() { + ggml_backend_t b = ggml_backend_dev_init(dev, NULL); + if (b == NULL) { + return; + } + + ggml_backend_t b_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (b_cpu == NULL) { + ggml_backend_free(b); + return; + } + + if (set_use_ref) { + set_use_ref(b_cpu, true); + } + workers_started++; + run_tests(b, b_cpu); + ggml_backend_free(b_cpu); + ggml_backend_free(b); + }; + + std::vector threads; + threads.reserve(parallel_workers); + for (int i = 0; i < parallel_workers; ++i) { + threads.emplace_back(eval_worker); + } + for (auto & t : threads) { + t.join(); + } + + if (workers_started == 0 && !test_cases.empty()) { + ggml_backend_free(backend_cpu); + return false; } } + output_printer->print_summary(test_summary_info(n_ok, tests_run, false)); output_printer->print_failed_tests(failed_tests); @@ -9606,7 +9801,7 @@ static void show_test_coverage() { static void usage(char ** argv) { printf("Usage: %s [mode] [-o ] [-b ] [-p ] [--output ] [--list-ops]", argv[0]); - printf(" [--show-coverage] [--test-file ]\n"); + printf(" [--show-coverage] [--test-file ] [-j ]\n"); printf(" valid modes:\n"); printf(" - test (default, compare with CPU backend for correctness)\n"); printf(" - grad (compare gradients from backpropagation with method of finite differences)\n"); @@ -9618,6 +9813,7 @@ static void usage(char ** argv) { printf(" --list-ops lists all available GGML operations\n"); printf(" --show-coverage shows test coverage\n"); printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n"); + printf(" -j runs tests using parallel worker threads (default: 1, test mode only)\n"); } int main(int argc, char ** argv) { @@ -9627,6 +9823,7 @@ int main(int argc, char ** argv) { const char * backend_filter = nullptr; const char * params_filter = nullptr; const char * test_file_path = nullptr; + int parallel_workers = 1; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "test") == 0) { @@ -9681,6 +9878,17 @@ int main(int argc, char ** argv) { usage(argv); return 1; } + } else if (strcmp(argv[i], "-j") == 0) { + if (i + 1 < argc) { + parallel_workers = atoi(argv[++i]); + if (parallel_workers < 1) { + usage(argv); + return 1; + } + } else { + usage(argv); + return 1; + } } else { usage(argv); return 1; @@ -9733,7 +9941,7 @@ int main(int argc, char ** argv) { false, "", ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024, true)); - bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path); + bool ok = test_backend(backend, dev, mode, op_names_filter, params_filter, output_printer.get(), test_file_path, parallel_workers); if (ok) { n_ok++; diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index 1d96de718e2..6f8e957489c 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -81,6 +81,8 @@ static void test_normalize_quotes_with_embedded_quotes(testing & t); // TAG_WITH_TAGGED argument parsing tests static void test_tagged_args_with_embedded_quotes(testing & t); +static void test_role_markers_all_templates(testing & t); + int main(int argc, char * argv[]) { testing t(std::cout); t.verbose = true; @@ -103,6 +105,7 @@ int main(int argc, char * argv[]) { t.test("standard_json_tools", test_standard_json_tools_formats); t.test("normalize_quotes_to_json", test_normalize_quotes_to_json); t.test("tagged_args_embedded_quotes", test_tagged_args_with_embedded_quotes); + t.test("role_markers_all_templates", test_role_markers_all_templates); return t.summary(); } @@ -714,7 +717,7 @@ static void test_compare_variants_both_modifiers(testing & t) { static void test_compare_variants_template_failure(testing & t) { // Test with template that causes failure during application (not construction) // We use a valid template syntax but one that will fail during application - common_chat_template tmpl("{{ messages[0]['nonexistent_field'] }}", "", ""); + common_chat_template tmpl("{{ messages.cahoot()[0]['nonexistent_field'] }}", "", ""); template_params params; params.messages = json::array({ @@ -1848,6 +1851,128 @@ static json build_edit_tool() { }); } +// ============================================================================ +// Role marker detection tests for all autoparser-handled templates +// +// Verifies that detect_user_start_marker / detect_assistant_start_marker +// return the correct boundary text between turns for every template that +// falls through to the differential autoparser (i.e. is not handled by a +// dedicated specialized template in common_chat_try_specialized_template). +// +// Markers were deduced manually from the jinja sources in models/templates/. +// ============================================================================ +struct role_marker_case { + std::string template_file; + std::string expected_user_start; + std::string expected_assistant_start; +}; + +static void test_role_markers_all_templates(testing & t) { + // Each entry is { template filename, user_start, assistant_start } as + // produced when rendering the standard chatml-like sequences. The values + // come from reading each jinja template and tracing what text precedes + // a user/assistant message body once the autoparser strips any reasoning + // markers it detected first. + const std::vector cases = { + // ChatML family: <|im_start|>{role} ... <|im_end|> + { "Bielik-11B-v3.0-Instruct.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "HuggingFaceTB-SmolLM3-3B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "MiMo-VL.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen3.5-4B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen3-Coder.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen-Qwen2.5-7B-Instruct.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen-Qwen3-0.6B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen-QwQ-32B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "StepFun3.5-Flash.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "stepfun-ai-Step-3.5-Flash.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + + // DeepSeek family + { "deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja", "<๏ฝœUser๏ฝœ>", "<๏ฝœAssistant๏ฝœ>" }, + { "deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja", "<๏ฝœUser๏ฝœ>", "<๏ฝœAssistant๏ฝœ>" }, + { "deepseek-ai-DeepSeek-V3.1.jinja", "<๏ฝœUser๏ฝœ>", "<๏ฝœAssistant๏ฝœ>" }, + { "llama-cpp-deepseek-r1.jinja", "<๏ฝœUser๏ฝœ>", "<๏ฝœAssistant๏ฝœ>" }, + + // Llama 3 header family + { "meetkai-functionary-medium-v3.1.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + { "meta-llama-Llama-3.1-8B-Instruct.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + { "meta-llama-Llama-3.2-3B-Instruct.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + { "meta-llama-Llama-3.3-70B-Instruct.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + // fireworks-ai forces a trailing assistant header even without add_generation_prompt, + // so the marker is absorbed into the common suffix and assistant_start is detected as empty. + { "fireworks-ai-llama-3-firefunction-v2.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + + // Phi/GLM/Apriel-style: <|user|> / <|assistant|> + { "microsoft-Phi-3.5-mini-instruct.jinja", "<|user|>", "<|assistant|>" }, + { "GLM-4.6.jinja", "<|user|>", "<|assistant|>" }, + { "unsloth-Apriel-1.5.jinja", "<|user|>", "<|assistant|>" }, + { "GLM-4.7-Flash.jinja", "<|user|>", "<|assistant|>" }, + + // Gemma 2: {user|model} + { "google-gemma-2-2b-it.jinja", "user", "model" }, + + // IBM Granite + { "ibm-granite-granite-3.3-2B-Instruct.jinja", "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" }, + { "ibm-granite-granite-4.0.jinja", "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" }, + + // Cohere R-series + { "CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja", + "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_RESPONSE|>" }, + { "CohereForAI-c4ai-command-r-plus-tool_use.jinja", + "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" }, + + // Mistral: assistant content follows [/INST] immediately, no header + { "mistralai-Mistral-Nemo-Instruct-2407.jinja", "[INST]", "" }, + { "Mistral-Small-3.2-24B-Instruct-2506.jinja", "[INST]", "" }, + + // Apertus uses <|user_start|> / <|assistant_start|> but the user diff + // carries the preceding <|assistant_end|> from the previous turn. + { "Apertus-8B-Instruct.jinja", "<|user_start|>", "<|assistant_start|>" }, + + // Apriel 1.6 wraps the assistant body with <|begin_assistant|>, but + // <|begin_assistant|> is also the detected reasoning start, so the + // assistant_start is trimmed back to the preceding newline. + { "Apriel-1.6-15b-Thinker-fixed.jinja", "<|begin_user|>", "<|begin_assistant|>" }, + + // ByteDance Seed-OSS: {role} + { "ByteDance-Seed-OSS.jinja", "user", "assistant" }, + + // GigaChat 3.1: {role}<|role_sep|> + { "GigaChat3.1-10B-A1.8B.jinja", "user<|role_sep|>", "assistant<|role_sep|>" }, + + // MiniMax M2: ]~b]{user|ai} + { "MiniMax-M2.jinja", "]~b]user", "]~b]ai" }, + + // Nemotron Nano v2: {User|Assistant}; assistant marker + // is followed by a prefilled block that gets included. + { "NVIDIA-Nemotron-Nano-v2.jinja", "User", "Assistant" }, + + // Reka Edge: "human: " / "assistant: " โ€” but the rendered preamble + // depends on enable_thinking, which currently confuses the user-start + // diff and trims the marker down. Lock in the observed value. + { "Reka-Edge.jinja", "human:", "assistant:" }, + + // RWKV-world chat preset: "User: " / "Assistant: " + { "llama-cpp-rwkv-world.jinja", "User:", "Assistant:" }, + + // Upstage Solar 100B: <|begin|>{role}... but reasoning marker absorbs + // the "<|begin|>assistant" prefix from assistant_start. + { "upstage-Solar-Open-100B.jinja", "<|begin|>user<|content|>", "<|begin|>assistant" }, + }; + + for (const auto & c : cases) { + t.test(c.template_file, [&](testing & t) { + common_chat_template tmpl = load_template(t, "models/templates/" + c.template_file); + struct autoparser ap; + ap.analyze_template(tmpl); + t.assert_equal("user_start", c.expected_user_start, ap.user_start); + t.assert_equal("assistant_start", c.expected_assistant_start, ap.assistant_start); + }); + } +} + // Test that reproduces the Seed-OSS template issue with embedded quotes static void test_tagged_args_with_embedded_quotes(testing & t) { json tools = build_edit_tool(); diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index bf45d737c83..c388dee1c45 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -618,6 +618,16 @@ int main_automated_tests(void) { }, { /* .name= */ "ibm-granite/granite-4.0 (tool call)", + /* .template_str= */ "{%- for message in messages %}\n {%- if message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- else %}\n {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}\n{# g4_default_system_message #}", + /* .expected_output= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|><|tool_call|>\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n<|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>", + /* .expected_output_jinja= */ "", + /* .bos_token= */ "", + /* .eos_token= */ "", + /* .supported_with_jinja= */ true, + /* .extra_conversation= */ {{"user", "What is the weather?"}, {"assistant_tool_call", "\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n"}, {"tool_response", "{\"temperature\": 72}"}}, + }, + { + /* .name= */ "ibm-granite/granite-4.1 (tool call)", /* .template_str= */ "{%- for message in messages %}\n {%- if message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- else %}\n {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}\n{# #}", /* .expected_output= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|><|tool_call|>\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n<|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>", /* .expected_output_jinja= */ "", diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index ea9d87ebed2..30ea2c07213 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1,4 +1,4 @@ -// Tests chat handling, including grammar generation and parsing for tool calling, for various templates. +// Tests chat handling, including grammar genration and parsing for tool calling, for various templates. // // Also acts as a CLI to generate a Markdown summary of the formats of Jinja templates, // e.g. given Minja (http://github.com/google/minja) checked out in parent dir: @@ -100,6 +100,25 @@ template static void assert_equals(const T & expected, const T & actua } } +static void assert_contains(const std::string & haystack, const std::string & needle) { + if (haystack.find(needle) == std::string::npos) { + LOG_ERR("Expected to contain: %s\n", needle.c_str()); + LOG_ERR("Actual: %s\n", haystack.c_str()); + common_log_flush(common_log_main()); + throw std::runtime_error("Test failed"); + } +} + +static void assert_ends_with(const std::string & str, const std::string & suffix) { + if (str.size() < suffix.size() || + str.compare(str.size() - suffix.size(), suffix.size(), suffix) != 0) { + LOG_ERR("Expected to end with: %s\n", suffix.c_str()); + LOG_ERR("Actual: %s\n", str.c_str()); + common_log_flush(common_log_main()); + throw std::runtime_error("Test failed"); + } +} + static std::string read_file(const std::string & path) { std::ifstream fs(path, std::ios_base::binary); if (!fs.is_open()) { @@ -945,6 +964,8 @@ const common_chat_msg message_assist_call_python_lines_unclosed = simple_assist_msg("", "", "python", "{\"code\":\"# This is a program:\\nprint('hey')"); const common_chat_msg message_assist_json_content = simple_assist_msg("{\n \"response\": \"Hello, world!\\nWhat's up?\"\n}"); +const common_chat_msg message_assist_prefill_content = simple_assist_msg("Hello, ", "I'm thinking"); +const common_chat_msg message_assist_prefill_reasoning = simple_assist_msg("", "I'm"); // Use for PEG parser implementations struct peg_test_case { @@ -1350,7 +1371,10 @@ class peg_test_builder { peg_test_case tc_; public: - peg_test_builder(peg_tester & tester, const std::string & input) : tester_(tester) { tc_.input = input; } + peg_test_builder(peg_tester & tester, const std::string & input) : tester_(tester) { + tc_.input = input; + tc_.params.add_generation_prompt = true; + } // Parameter setters peg_test_builder & reasoning_format(common_reasoning_format fmt) { @@ -1373,6 +1397,16 @@ class peg_test_builder { return *this; } + peg_test_builder & add_generation_prompt(bool val) { + tc_.params.add_generation_prompt = val; + return *this; + } + + peg_test_builder & continue_final_message(common_chat_continuation cont) { + tc_.params.continue_final_message = cont; + return *this; + } + peg_test_builder & json_schema(const std::string & schema) { tc_.params.json_schema = schema; return *this; @@ -1514,6 +1548,40 @@ static void test_msgs_oaicompat_json_conversion() { } } +static void test_split_by_role() { + LOG_DBG("%s\n", __func__); + + // Empty inputs + assert_equals(0, common_chat_split_by_role("", {}).size()); + assert_equals(0, common_chat_split_by_role("hello", {}).size()); + assert_equals(0, common_chat_split_by_role("", { { "user", "<|user|>" } }).size()); + + // Multi-role conversation, no leading/trailing content + { + const std::string prompt = "<|user|>Hi<|assistant|>Hello<|user|>Bye"; + const auto splits = common_chat_split_by_role(prompt, { + { "user", "<|user|>" }, + { "assistant", "<|assistant|>" }, + }); + assert_equals(3, splits.size()); + + assert_equals("user", splits[0].role); + assert_equals(0, splits[0].pos); + assert_equals(10, splits[0].len); + assert_equals("<|user|>Hi", prompt.substr(splits[0].pos, splits[0].len)); + + assert_equals("assistant", splits[1].role); + assert_equals(10, splits[1].pos); + assert_equals(18, splits[1].len); + assert_equals("<|assistant|>Hello", prompt.substr(splits[1].pos, splits[1].len)); + + assert_equals("user", splits[2].role); + assert_equals(28, splits[2].pos); + assert_equals(11, splits[2].len); + assert_equals("<|user|>Bye", prompt.substr(splits[2].pos, splits[2].len)); + } +} + static void test_tools_oaicompat_json_conversion() { LOG_DBG("%s\n", __func__); std::vector tools{ @@ -1664,6 +1732,83 @@ static void test_convert_responses_to_chatcmpl() { assert_equals(false, result.contains("max_output_tokens")); assert_equals(100, result.at("max_tokens").get()); } + + // Test mixed Responses tools: convert only function tools + { + json input = json::parse(R"({ + "input": "Hello", + "model": "test-model", + "tools": [ + { + "type": "web_search" + }, + { + "type": "function", + "name": "get_weather", + "description": "Get weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string" + } + }, + "required": ["location"] + } + }, + { + "type": "image_generation" + }, + { + "type": "mcp", + "server_label": "test-server" + }, + { + "type": "namespace", + "name": "browser" + } + ] + })"); + + json result = server_chat_convert_responses_to_chatcmpl(input); + + assert_equals(true, result.contains("tools")); + assert_equals(true, result.at("tools").is_array()); + assert_equals((size_t)1, result.at("tools").size()); + + const auto & tool = result.at("tools")[0]; + assert_equals(std::string("function"), tool.at("type").get()); + assert_equals(std::string("get_weather"), tool.at("function").at("name").get()); + assert_equals(true, tool.at("function").at("strict").get()); + } + + // Test non-function Responses tools are ignored + { + json input = json::parse(R"({ + "input": "Hello", + "model": "test-model", + "tools": [ + { + "type": "web_search" + }, + { + "type": "image_generation" + }, + { + "type": "mcp", + "server_label": "test-server" + }, + { + "type": "namespace", + "name": "browser" + } + ] + })"); + + json result = server_chat_convert_responses_to_chatcmpl(input); + + assert_equals(false, result.contains("tools")); + } } static void test_template_output_peg_parsers(bool detailed_debug) { @@ -1968,6 +2113,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect_content("Final answer without tools.") .run(); + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n\n\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + { common_chat_msg user_start; user_start.role = "user"; @@ -2127,6 +2293,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking[/THINK]Hello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -2273,6 +2460,26 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .run(); + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -2332,6 +2539,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).expect_reconstruction().run(); tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).expect_reconstruction().run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -2503,6 +2718,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist) .run(); + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + { // additional tests for https://github.com/ggml-org/llama.cpp/pull/21760 auto tmpls = read_templates("models/templates/google-gemma-4-31B-it.jinja"); @@ -2556,6 +2792,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); tst.test("Hello, world!").reasoning_format(COMMON_REASONING_FORMAT_AUTO).expect(simple_assist_msg("Hello, world!")).run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { // NousResearch-Hermes-2-Pro and Hermes-3 (tool calling models) @@ -2579,6 +2836,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { // Note: Hermes template doesn't support thinking/reasoning natively // Note: We only support one tool calling format per template, no alternate formats + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } { // Test simple content-only template @@ -2602,6 +2867,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { // .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) // .expect(message_assist_thoughts) // .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -2611,6 +2897,29 @@ static void test_template_output_peg_parsers(bool detailed_debug) { tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); + tst.test( + "\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "") + .tools({ special_function_tool }) + .expect(message_assist_call) + .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); + } + + { + // IBM Granite 4.1 (same format as 4.0) + auto tst = peg_tester("models/templates/ibm-granite-granite-4.1.jinja", detailed_debug); + + tst.test("Hello, world!\nWhat's up?").expect(message_assist).run(); + tst.test( "\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" @@ -2707,6 +3016,26 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .run(); + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -2942,6 +3271,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { { "set_unit", R"({"unit": "celsius"})", {} }, }) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } { auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-V3.1.jinja", detailed_debug); @@ -2953,6 +3290,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}")) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -3207,6 +3565,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { { "magic_int", R"({"ref": 42, "name": "foo bar"})", {} }, }) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // GLM-4.6 tests - format: function_name\n...\n...\n @@ -3303,6 +3682,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect_reconstruction() .run(); } + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Verify the throw path produces a readable error message, not std::out_of_range. @@ -3553,6 +3953,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { } }) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -3580,6 +4001,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(kimi_id_special_func_tool_call) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // LFM2-8B-A1B tests - uses <|tool_list_start|>/<|tool_list_end|> and <|tool_call_start|>[name(args)]<|tool_call_end|> @@ -3665,6 +4107,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { { "special_function", R"({"arg1": 1})", {} }, }) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens @@ -3726,6 +4189,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .tools({ empty_args_tool }) .expect(simple_assist_msg("", "", "empty_args", "{}")) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Reka-Edge tests - uses native JSON format with per-call wrapper @@ -3811,6 +4295,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { { "special_function", R"({"arg1": 1})", {} }, }) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n\n\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } @@ -3823,6 +4328,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // MiniMax-M2 tests - XML invoke format with parameter tags @@ -3847,16 +4360,48 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call) .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n\n\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // NVIDIA-Nemotron-Nano-v2 tests - ... format // Format: [{"name": "func", "arguments": {...}}] { auto tst = peg_tester("models/templates/NVIDIA-Nemotron-Nano-v2.jinja", detailed_debug); - tst.test("[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]") + tst.test("[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]") .tools({ special_function_tool }) .expect(message_assist_call) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // CohereForAI-c4ai-command-r7b (uses START_RESPONSE/END_RESPONSE, START_THINKING/END_THINKING, START_ACTION/END_ACTION) @@ -3886,6 +4431,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .tools({ special_function_tool }) .expect(message_assist_call) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // mistralai-Mistral-Nemo-Instruct-2407.jinja @@ -3897,6 +4450,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call_id) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } { auto tst = peg_tester("models/templates/meetkai-functionary-medium-v3.1.jinja", detailed_debug); @@ -3906,6 +4467,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Functionary v3.2 - recipient-based format: >>>recipient\n{content} { @@ -3916,6 +4485,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // FireFunction @@ -3927,6 +4504,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // DeepSeek R1 Distill Llama 8B - reasoning tests only (forced open thinking) @@ -3942,6 +4527,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) .expect(message_assist_thoughts) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // llama-cpp DeepSeek R1 template (always forced-open thinking) { @@ -3959,6 +4565,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .parallel_tool_calls(true) .expect(message_assist_call) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // DeepSeek R1 Distill Qwen 32B - reasoning tests only (forced open thinking) // Note: Template uses forced-open mode (prompt ends with ), so input shouldn't include opening tag @@ -3979,6 +4606,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) .expect(message_assist_call) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinkingHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // MiMo-VL / Hermes 3 / Qwen 2.5 (Common JSON format) @@ -3992,6 +4640,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Reka Edge @@ -4037,6 +4693,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .is_partial(true) .expect(message_assist_call_cutoff_args) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n\n\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Apriel 1.5 @@ -4047,6 +4724,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .tools({ special_function_tool }) .expect(message_assist_call) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Apriel 1.6 Thinker (reasoning-only support) @@ -4070,6 +4755,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .tools({ special_function_tool }) .expect(simple_assist_msg("", "Here are my reasoning steps:\nI'm\nthinking", "special_function", "{\"arg1\":1}")) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n[BEGIN FINAL RESPONSE]\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Mistral Small 3.2 - FUNC_BRACKET_TAG format: [TOOL_CALLS]func_name[CALL_ID]id[ARGS]{...} @@ -4095,7 +4801,13 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect_reconstruction() .run(); - + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // Devstral { @@ -4117,18 +4829,42 @@ static void test_template_output_peg_parsers(bool detailed_debug) { // Llama 3.1 auto tst = peg_tester("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja", detailed_debug); tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } { // Llama 3.2 auto tst = peg_tester("models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja", detailed_debug); tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } { // Llama 3.3 auto tst = peg_tester("models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja", detailed_debug); tst.test("Hello, world!\nWhat's up?").tools({ python_tool }).expect(message_assist).expect_reconstruction().run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // GPT-OSS format tests @@ -4289,6 +5025,27 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .expect(message_assist_thoughts) .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking<|end|><|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } { @@ -4479,6 +5236,26 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .run(); + // Continuation tests + tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); + + tst.test(" thinking\n\nHello, world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .enable_thinking(true) + .messages({ message_user, message_assist_prefill_reasoning }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_REASONING) + .expect_reasoning("I'm thinking") + .expect_content("Hello, world!\nWhat's up?") + .run(); } // GigaChat V3 @@ -4499,6 +5276,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call_content) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } // GigaChat V3.1 @@ -4519,55 +5304,154 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .expect(message_assist_call_content) .expect_reconstruction() .run(); + + // Continuation tests + tst.test("world!\nWhat's up?") + .messages({ message_user, message_assist_prefill_content }) + .add_generation_prompt(false) + .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_content("Hello, world!\nWhat's up?") + .run(); } } -static void test_reka_edge_common_path() { - auto tmpls = read_templates("models/templates/Reka-Edge.jinja"); +static void test_template_generation_prompt() { + common_chat_msg system_msg; + system_msg.role = "system"; + system_msg.content ="You are a helpful assistant."; - { - common_chat_templates_inputs inputs; - common_chat_msg system_msg; - system_msg.role = "system"; - system_msg.content = "Use tools when needed."; + common_chat_msg tool_call_msg = simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"); - common_chat_msg tool_call_msg = simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"); + common_chat_msg tool_msg; + tool_msg.role = "tool"; + tool_msg.tool_name = "special_function"; + tool_msg.tool_call_id = "call0"; + tool_msg.content = "Sunny"; - common_chat_msg tool_msg; - tool_msg.role = "tool"; - tool_msg.tool_name = "special_function"; - tool_msg.tool_call_id = "call0"; - tool_msg.content = "Sunny"; + struct test_case_options { + std::vector messages; + bool add_generation_prompt = true; + common_chat_continuation continue_final_message = COMMON_CHAT_CONTINUATION_NONE; + }; - inputs.messages = { system_msg, message_user, tool_call_msg, tool_msg, message_user }; - inputs.tools = { special_function_tool }; - inputs.enable_thinking = true; - inputs.add_generation_prompt = true; + auto basic = [&]() { + test_case_options opts; + opts.messages = { system_msg, message_user }; + return opts; + }; + + auto continuation_content = [&]() { + test_case_options opts; + opts.messages = { system_msg, message_user, message_assist_prefill_content }; + opts.add_generation_prompt = false; + opts.continue_final_message = COMMON_CHAT_CONTINUATION_CONTENT; + return opts; + }; + + auto continuation_reasoning = [&]() { + test_case_options opts; + opts.messages = { system_msg, message_user, message_assist_prefill_reasoning }; + opts.add_generation_prompt = false; + opts.continue_final_message = COMMON_CHAT_CONTINUATION_REASONING; + return opts; + }; + + auto check = [&](const common_chat_templates_ptr & tmpls, + const test_case_options & opts, + const std::string & expected_generation_prompt) { + common_chat_templates_inputs inputs; + inputs.messages = opts.messages; + inputs.add_generation_prompt = opts.add_generation_prompt; + inputs.continue_final_message = opts.continue_final_message; auto params = common_chat_templates_apply(tmpls.get(), inputs); - if (params.prompt.find("\nSunny\n") == std::string::npos) { - throw std::runtime_error("Reka Edge prompt did not render tool response history"); - } - if (params.prompt.rfind("assistant: \n") == std::string::npos) { - throw std::runtime_error("Reka Edge prompt did not render thinking generation prompt"); - } + assert_contains(params.prompt, system_msg.content); + assert_contains(params.prompt, message_user.content); + assert_equals(expected_generation_prompt, params.generation_prompt); + assert_ends_with(params.prompt, expected_generation_prompt); + }; + + { + auto tmpls = read_templates("models/templates/Qwen3.5-4B.jinja"); + check(tmpls, basic(), "<|im_start|>assistant\n\n"); + check(tmpls, continuation_content(), "<|im_start|>assistant\n\nI'm thinking\n\n\nHello, "); + check(tmpls, continuation_reasoning(), "<|im_start|>assistant\n\nI'm"); } { - common_chat_templates_inputs inputs; - inputs.messages = { - message_user, - simple_assist_msg("The first point is") - }; - inputs.add_generation_prompt = false; - inputs.enable_thinking = false; - inputs.chat_template_kwargs["continue_final_message"] = "true"; + auto tmpls = read_templates("models/templates/openai-gpt-oss-120b.jinja"); + check(tmpls, basic(), "<|start|>assistant"); + check(tmpls, continuation_content(), "<|start|>assistant<|channel|>analysis<|message|>I'm thinking<|end|><|start|>assistant<|channel|>final<|message|>Hello, "); + check(tmpls, continuation_reasoning(), "<|start|>assistant<|channel|>analysis<|message|>I'm"); + } - auto params = common_chat_templates_apply(tmpls.get(), inputs); - if (string_ends_with(params.prompt, "")) { - throw std::runtime_error("Reka Edge continue_final_message unexpectedly closed the assistant turn"); - } + { + auto tmpls = read_templates("models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja"); + check(tmpls, basic(), ""); + check(tmpls, continuation_content(), "[THINK]I'm thinking[/THINK]Hello, "); + check(tmpls, continuation_reasoning(), "[THINK]I'm"); + } + + { + auto tmpls = read_templates("models/templates/google-gemma-4-31B-it.jinja"); + check(tmpls, basic(), "<|turn>model\n"); + check(tmpls, continuation_content(), "<|turn>model\n<|channel>thought\nI'm thinkingHello, "); + check(tmpls, continuation_reasoning(), "<|turn>model\n<|channel>thought\nI'm"); + + // Special case when last message is a tool response + test_case_options after_tool_call = continuation_reasoning(); + after_tool_call.messages = { system_msg, message_user, tool_call_msg, tool_msg, message_assist_prefill_reasoning }; + check(tmpls, after_tool_call, "<|channel>thought\nI'm"); + } + + { + auto tmpls = read_templates("models/templates/meetkai-functionary-medium-v3.2.jinja"); + check(tmpls, basic(), "<|start_header_id|>assistant<|end_header_id|>\n\n>>>"); + check(tmpls, continuation_content(), "<|start_header_id|>assistant<|end_header_id|>\n\n>>>all\nHello, "); + check(tmpls, continuation_reasoning(), "<|start_header_id|>assistant<|end_header_id|>\n\n>>>all\n"); + } + + { + auto tmpls = read_templates("models/templates/Reka-Edge.jinja"); + check(tmpls, basic(), "assistant: \n"); + check(tmpls, continuation_content(), "assistant: \nI'm thinking\n\n\nHello, "); + check(tmpls, continuation_reasoning(), "assistant: \nI'm"); + } + + { + auto tmpls = read_templates("models/templates/moonshotai-Kimi-K2.jinja"); + check(tmpls, basic(), "<|im_assistant|>assistant<|im_middle|>"); + check(tmpls, continuation_content(), "<|im_assistant|>assistant<|im_middle|>I'm thinkingHello, "); + check(tmpls, continuation_reasoning(), "<|im_assistant|>assistant<|im_middle|>I'm"); + } + + { + auto tmpls = read_templates("models/templates/LFM2-8B-A1B.jinja"); + check(tmpls, basic(), "<|im_start|>assistant\n"); + check(tmpls, continuation_content(), "<|im_start|>assistant\nI'm thinkingHello, "); + check(tmpls, continuation_reasoning(), "<|im_start|>assistant\nI'm"); + } + + { + auto tmpls = read_templates("models/templates/LFM2.5-Instruct.jinja"); + check(tmpls, basic(), "<|im_start|>assistant\n"); + check(tmpls, continuation_content(), "<|im_start|>assistant\nI'm thinkingHello, "); + check(tmpls, continuation_reasoning(), "<|im_start|>assistant\nI'm"); + } + + { + auto tmpls = read_templates("models/templates/GigaChat3-10B-A1.8B.jinja"); + check(tmpls, basic(), "assistant<|role_sep|>\n"); + check(tmpls, continuation_content(), "assistant<|role_sep|>\nHello, "); + check(tmpls, continuation_reasoning(), "assistant<|role_sep|>\n"); + } + + { + auto tmpls = read_templates("models/templates/deepseek-ai-DeepSeek-V3.2.jinja"); + check(tmpls, basic(), "<๏ฝœAssistant๏ฝœ>"); + check(tmpls, continuation_content(), "<๏ฝœAssistant๏ฝœ>I'm thinkingHello, "); + check(tmpls, continuation_reasoning(), "<๏ฝœAssistant๏ฝœ>I'm"); } } @@ -4761,10 +5645,11 @@ int main(int argc, char ** argv) { { test_msg_diffs_compute(); test_msgs_oaicompat_json_conversion(); + test_split_by_role(); test_tools_oaicompat_json_conversion(); test_convert_responses_to_chatcmpl(); test_developer_role_to_system_workaround(); - test_reka_edge_common_path(); + test_template_generation_prompt(); test_template_output_peg_parsers(detailed_debug); std::cout << "\n[chat] All tests passed!" << '\n'; } diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index ed3070dc4de..1ae468fbd65 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -162,6 +162,42 @@ static void helper_write(FILE * file, const void * data, const size_t nbytes) { GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes); } +static std::vector read_file_to_buffer(FILE * file) { + GGML_ASSERT(file != nullptr); + GGML_ASSERT(fseek(file, 0, SEEK_END) == 0); + + const long size = ftell(file); + GGML_ASSERT(size >= 0); + + rewind(file); + + std::vector data(static_cast(size)); + GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size()); + + rewind(file); + return data; +} + +struct callback_reader_data { + const uint8_t * data; + size_t size; +}; + +static size_t read_buffer_callback(void * userdata, void * output, uint64_t offset, size_t len) { + GGML_ASSERT(len > 0); + + const callback_reader_data & reader = *static_cast(userdata); + + if (offset > reader.size || len > reader.size - offset) { + return 0; + } + + const size_t data_offset = static_cast(offset); + const size_t nread = std::min(len, reader.size - data_offset); + memcpy(static_cast(output), reader.data + data_offset, nread); + return nread; +} + static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) { FILE * file = tmpfile(); @@ -1095,10 +1131,29 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml return ok; } -static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) { +enum roundtrip_read_mode { + ROUNDTRIP_READ_MODE_FILE, + ROUNDTRIP_READ_MODE_BUFFER, + ROUNDTRIP_READ_MODE_CALLBACK, +}; + +static const char * roundtrip_read_mode_name(const roundtrip_read_mode mode) { + switch (mode) { + case ROUNDTRIP_READ_MODE_FILE: return "file"; + case ROUNDTRIP_READ_MODE_BUFFER: return "buffer"; + case ROUNDTRIP_READ_MODE_CALLBACK: return "callback"; + } + + GGML_ABORT("fatal error"); +} + +static std::pair test_roundtrip( + ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta, + const roundtrip_read_mode read_mode) { ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - printf("%s: device=%s, backend=%s, only_meta=%s\n", - __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no"); + printf("%s: device=%s, backend=%s, only_meta=%s, read_mode=%s\n", + __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), + only_meta ? "yes" : "no", roundtrip_read_mode_name(read_mode)); int npass = 0; int ntest = 0; @@ -1133,7 +1188,22 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned /*no_alloc =*/ false, /*ctx =*/ only_meta ? nullptr : &ctx_1, }; - struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params); + struct gguf_context * gguf_ctx_1 = nullptr; + const std::vector data = read_mode == ROUNDTRIP_READ_MODE_FILE + ? std::vector() + : read_file_to_buffer(file); + + if (read_mode == ROUNDTRIP_READ_MODE_BUFFER) { + gguf_ctx_1 = gguf_init_from_buffer(data.data(), data.size(), gguf_params); + } else if (read_mode == ROUNDTRIP_READ_MODE_CALLBACK) { + callback_reader_data reader = { + /*.data = */ data.data(), + /*.size = */ data.size(), + }; + gguf_ctx_1 = gguf_init_from_callback(read_buffer_callback, &reader, 4096, 4ull << 30 /* 4GB */, gguf_params); + } else { + gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params); + } printf("%s: same_version: ", __func__); if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) { @@ -1343,7 +1413,17 @@ int main(int argc, char ** argv) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); for (bool only_meta : {true, false}) { - std::pair result = test_roundtrip(dev, seed, only_meta); + std::pair result = test_roundtrip(dev, seed, only_meta, ROUNDTRIP_READ_MODE_FILE); + npass += result.first; + ntest += result.second; + } + { + std::pair result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_BUFFER); + npass += result.first; + ntest += result.second; + } + { + std::pair result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_CALLBACK); npass += result.first; ntest += result.second; } diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 16af11a2862..1def7faff60 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -12,6 +12,7 @@ #include "../src/llama-model-saver.h" #include +#include #include #include #include @@ -99,6 +100,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { n_ff = 96; n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded } else if (arch == LLM_ARCH_DEEPSEEK2 + || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR || arch == LLM_ARCH_MISTRAL4) { @@ -155,6 +157,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f); if (arch == LLM_ARCH_DEEPSEEK2 + || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR || arch == LLM_ARCH_MISTRAL4) { @@ -331,6 +334,7 @@ static bool moe_mandatory(const llm_arch arch) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK32: case LLM_ARCH_GLM4_MOE: case LLM_ARCH_GLM_DSA: case LLM_ARCH_EXAONE_MOE: @@ -497,6 +501,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg }; std::vector dev_configs; + size_t max_device_label_length = 4; { std::vector devices_meta; { @@ -504,6 +509,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg for (size_t i = 0; i < device_count; i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); dev_configs.emplace_back(std::vector{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER); + max_device_label_length = std::max(max_device_label_length, dev_configs.back().label.length()); // cpu-based devices cannot be used in tensor split mode if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) { @@ -515,10 +521,27 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR); } + size_t max_arch_name_length = 0; + for (const llm_arch & arch : llm_arch_all()) { + max_arch_name_length = std::max(max_arch_name_length, strlen(llm_arch_name(arch))); + } + + const std::string template_header = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|%15s|%9s|\n"; + const std::string template_row_cfg = std::string("|%" + std::to_string(max_arch_name_length) + "s|%") + std::to_string(max_device_label_length) + "s|%6s|"; + const std::string template_row_res = "%15s %10s|%20s|\n"; + bool all_ok = true; common_log_flush(common_log_main()); - printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip"); - printf("|----------------|------------------------------|------|---------------|---------|\n"); + printf(template_header.c_str(), "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip"); + printf("|"); + for (size_t i = 0; i < max_arch_name_length; i++) { + printf("-"); + } + printf("|"); + for (size_t i = 0; i < max_device_label_length; i++) { + printf("-"); + } + printf("|------|---------------|---------|\n"); for (const llm_arch & arch : llm_arch_all()) { if (arch == LLM_ARCH_UNKNOWN) { continue; @@ -543,6 +566,11 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg std::pair model_and_ctx_cpu; std::vector logits_cpu; for (device_config & dc : dev_configs) { + // print test config first; should anything fail during model loading or inference, at least we know which test case caused it + printf(template_row_cfg.c_str(), + llm_arch_name(arch), dc.label.c_str(), config_name.c_str()); + fflush(stdout); + std::pair model_and_ctx_dev; std::vector logits_dev; std::string status_nmse = "\033[1;33mSKIP\033[0m"; @@ -595,8 +623,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg } } - printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(), - config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str()); + // log the results for this test case + printf(template_row_res.c_str(), + status_nmse.c_str(), nmse_str, status_roundtrip.c_str()); } } } diff --git a/tests/test-reasoning-budget.cpp b/tests/test-reasoning-budget.cpp index f7a60178996..f54cff4f8a2 100644 --- a/tests/test-reasoning-budget.cpp +++ b/tests/test-reasoning-budget.cpp @@ -124,6 +124,136 @@ static void test_reasoning_budget( (void)sequence; } +static llama_token get_forced_token(struct llama_sampler * sampler, llama_token max_token) { + std::vector cur; + const size_t n_vocab = (size_t) max_token + 1; + for (size_t i = 0; i < n_vocab; i++) { + cur.emplace_back(llama_token_data{(llama_token) i, logf((float) (i + 1)), 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_sampler_apply(sampler, &cur_p); + + size_t finite_count = 0; + llama_token finite_token = LLAMA_TOKEN_NULL; + for (size_t i = 0; i < cur.size(); i++) { + if (std::isfinite(cur[i].logit)) { + finite_count++; + finite_token = cur[i].id; + } + } + + GGML_ASSERT(finite_count == 1 && "sampler is not forcing exactly one token"); + return finite_token; +} + +static void test_reasoning_budget_clone_mid_counting() { + const std::vector start = {100}; + const std::vector end = {101}; + const std::vector forced = {102, 101}; + + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 2, REASONING_BUDGET_IDLE); + + llama_sampler_accept(sampler, 100); // COUNTING, remaining=2 + llama_sampler_accept(sampler, 50); // COUNTING, remaining=1 + + auto * clone = llama_sampler_clone(sampler); + llama_sampler_accept(clone, 51); // should exhaust the cloned remaining budget + + GGML_ASSERT(get_forced_token(clone, 102) == 102 && "cloned counting state lost remaining budget"); + + llama_sampler_free(clone); + llama_sampler_free(sampler); +} + +static void test_reasoning_budget_clone_mid_forcing() { + const std::vector start = {100}; + const std::vector end = {101}; + const std::vector forced = {102, 101}; + + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 0, REASONING_BUDGET_FORCING); + + GGML_ASSERT(get_forced_token(sampler, 102) == 102); + llama_sampler_accept(sampler, 102); // advance to the second forced token + + auto * clone = llama_sampler_clone(sampler); + + GGML_ASSERT(get_forced_token(clone, 102) == 101 && "cloned forcing state lost force position"); + + llama_sampler_free(clone); + llama_sampler_free(sampler); +} + +static void test_reasoning_budget_force_manual() { + const std::vector start = {100}; + const std::vector end = {101}; + const std::vector forced = {102, 101}; + + // if COUNTING, force() succeeds and begins forcing the end sequence from the start + { + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 5, REASONING_BUDGET_IDLE); + + llama_sampler_accept(sampler, 100); // COUNTING, remaining=5 + llama_sampler_accept(sampler, 50); // COUNTING, remaining=4 + GGML_ASSERT(common_reasoning_budget_get_state(sampler) == REASONING_BUDGET_COUNTING); + + GGML_ASSERT(common_reasoning_budget_force(sampler) && "force() should succeed from COUNTING"); + GGML_ASSERT(common_reasoning_budget_get_state(sampler) == REASONING_BUDGET_FORCING); + + // forces the configured sequence from force_pos=0, then transitions to DONE + GGML_ASSERT(get_forced_token(sampler, 102) == 102); + llama_sampler_accept(sampler, 102); + GGML_ASSERT(get_forced_token(sampler, 102) == 101); + llama_sampler_accept(sampler, 101); + GGML_ASSERT(common_reasoning_budget_get_state(sampler) == REASONING_BUDGET_DONE); + + llama_sampler_free(sampler); + } + + // if IDLE, force() is a no-op + { + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 5, REASONING_BUDGET_IDLE); + + GGML_ASSERT(!common_reasoning_budget_force(sampler) && "force() must not transition from IDLE"); + GGML_ASSERT(common_reasoning_budget_get_state(sampler) == REASONING_BUDGET_IDLE); + + llama_sampler_free(sampler); + } + + // if DONE, force() is a no-op + { + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 5, REASONING_BUDGET_IDLE); + + llama_sampler_accept(sampler, 100); // COUNTING + llama_sampler_accept(sampler, 101); // natural end -> DONE + GGML_ASSERT(common_reasoning_budget_get_state(sampler) == REASONING_BUDGET_DONE); + + GGML_ASSERT(!common_reasoning_budget_force(sampler) && "force() must not transition from DONE"); + GGML_ASSERT(common_reasoning_budget_get_state(sampler) == REASONING_BUDGET_DONE); + + llama_sampler_free(sampler); + } + + // if FORCING, force() is a no-op and must not rewind the force position + { + auto * sampler = common_reasoning_budget_init(nullptr, start, end, forced, 0, REASONING_BUDGET_FORCING); + + GGML_ASSERT(get_forced_token(sampler, 102) == 102); + llama_sampler_accept(sampler, 102); // advance to the second forced token (force_pos=1) + + GGML_ASSERT(!common_reasoning_budget_force(sampler) && "force() must not transition from FORCING"); + GGML_ASSERT(common_reasoning_budget_get_state(sampler) == REASONING_BUDGET_FORCING); + GGML_ASSERT(get_forced_token(sampler, 102) == 101 && "force() must not rewind the force position"); + + llama_sampler_free(sampler); + } + + // a null sampler is safely ignored + GGML_ASSERT(!common_reasoning_budget_force(nullptr)); + + fprintf(stderr, " Test 'manual force transition' passed\n"); +} + // UTF-8 boundary detection unit test // Tests common_utf8_is_complete() from reasoning-budget.h static void test_utf8_boundary_detection() { @@ -250,7 +380,11 @@ int main(void) { 7); // forcing continues through i=7 } - printf("OK (6 tests passed)\n"); + test_reasoning_budget_clone_mid_counting(); + test_reasoning_budget_clone_mid_forcing(); + test_reasoning_budget_force_manual(); + + printf("OK (9 tests passed)\n"); printf("Testing UTF-8 boundary detection... "); test_utf8_boundary_detection(); diff --git a/tests/test-recurrent-state-rollback.cpp b/tests/test-recurrent-state-rollback.cpp new file mode 100644 index 00000000000..be19316db8a --- /dev/null +++ b/tests/test-recurrent-state-rollback.cpp @@ -0,0 +1,185 @@ +#include "arg.h" +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include + +static llama_context * make_ctx(const common_params & params, llama_model * model) { + auto cparams = common_context_params_to_llama(params); + cparams.n_seq_max = 1; + cparams.n_rs_seq = 8; + cparams.n_batch = std::max(cparams.n_batch, (uint32_t) (cparams.n_rs_seq + 1)); + cparams.n_ubatch = std::max(cparams.n_ubatch, (uint32_t) (cparams.n_rs_seq + 1)); + return llama_init_from_model(model, cparams); +} + +static bool decode_tokens(llama_context * ctx, const std::vector & tokens, uint32_t count) { + llama_batch batch = llama_batch_init(count, 0, 1); + for (uint32_t pos = 0; pos < count; ++pos) { + common_batch_add(batch, tokens[pos], pos, { 0 }, false); + } + const bool ok = llama_decode(ctx, batch) == 0; + llama_batch_free(batch); + return ok; +} + +static bool decode_one(llama_context * ctx, llama_token tok, llama_pos pos) { + llama_batch batch = llama_batch_init(1, 0, 1); + common_batch_add(batch, tok, pos, { 0 }, true); + const bool ok = llama_decode(ctx, batch) == 0; + llama_batch_free(batch); + return ok; +} + +int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + + common_params params; + params.sampling.seed = 1234; + params.n_predict = 1; + + common_init(); + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + return 1; + } + + ggml_backend_load_all(); + + common_init_result_ptr llama_init = common_init_from_params(params); + llama_model * model = llama_init->model(); + if (model == nullptr) { + fprintf(stderr, "%s : failed to init model\n", __func__); + return 1; + } + + if (!llama_model_is_recurrent(model) && !llama_model_is_hybrid(model)) { + fprintf(stderr, "%s : skipping for non-recurrent model\n", __func__); + return 0; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + const int n_vocab = llama_vocab_n_tokens(vocab); + + llama_context * ctx_src = make_ctx(params, model); + llama_context * ctx_dst = make_ctx(params, model); + if (ctx_src == nullptr || ctx_dst == nullptr) { + fprintf(stderr, "%s : failed to init contexts\n", __func__); + return 1; + } + + if (llama_n_rs_seq(ctx_src) == 0) { + fprintf(stderr, "%s : skipping because n_rs_seq is disabled\n", __func__); + llama_free(ctx_src); + llama_free(ctx_dst); + return 0; + } + + std::vector tokens = common_tokenize(ctx_src, "The quick brown fox jumps", true); + const uint32_t n_rs_seq = llama_n_rs_seq(ctx_src); + if (tokens.size() > n_rs_seq + 1) { + tokens.resize(n_rs_seq + 1); + } + if (tokens.size() < 2) { + fprintf(stderr, "%s : not enough prompt tokens\n", __func__); + return 1; + } + const uint32_t n_tokens = tokens.size(); + const llama_token last_tok = tokens.back(); + const llama_pos last_pos = (llama_pos) n_tokens - 2; + + // Decode the full prompt on the source, then roll back the last position. + // Rollback leaves the recurrent memory in a snapshot state (rs_idx != 0). + if (!decode_tokens(ctx_src, tokens, n_tokens)) { + fprintf(stderr, "%s : failed to decode prompt\n", __func__); + return 1; + } + if (!llama_memory_seq_rm(llama_get_memory(ctx_src), 0, last_pos, -1)) { + fprintf(stderr, "%s : rollback failed\n", __func__); + return 1; + } + + // Save the rolled-back state and restore it into a fresh context. + common_prompt_checkpoint ckpt; + ckpt.update_tgt(ctx_src, 0, 0); + ckpt.load_tgt(ctx_dst, 0, 0); + + // Replay the rolled-back token on both contexts and compare logits. + if (!decode_one(ctx_src, last_tok, last_pos) || + !decode_one(ctx_dst, last_tok, last_pos)) { + fprintf(stderr, "%s : replay failed\n", __func__); + return 1; + } + + const float * logits_src = llama_get_logits_ith(ctx_src, 0); + const float * logits_dst = llama_get_logits_ith(ctx_dst, 0); + if (logits_src == nullptr || logits_dst == nullptr) { + fprintf(stderr, "%s : missing logits\n", __func__); + return 1; + } + + constexpr float eps = 1e-5f; + for (int i = 0; i < n_vocab; ++i) { + if (std::fabs(logits_src[i] - logits_dst[i]) > eps) { + fprintf(stderr, "%s : logits mismatch at token %d (%g != %g)\n", + __func__, i, (double) logits_src[i], (double) logits_dst[i]); + return 1; + } + } + + // Repeat the load into a context that already has its own rollback state: + // groups 1..n_rs_seq hold a *different* prompt's history, and rs_idx[0] is + // non-zero at load time. The restore must wipe that state and still match. + llama_context * ctx_dirty = make_ctx(params, model); + if (ctx_dirty == nullptr) { + fprintf(stderr, "%s : failed to init dirty ctx\n", __func__); + return 1; + } + + std::vector noise = tokens; + for (auto & t : noise) { + t = (t + 1) % n_vocab; + if (t < 0) { + t = 0; + } + } + if (!decode_tokens(ctx_dirty, noise, n_tokens)) { + fprintf(stderr, "%s : dirty prompt decode failed\n", __func__); + return 1; + } + if (!llama_memory_seq_rm(llama_get_memory(ctx_dirty), 0, last_pos, -1)) { + fprintf(stderr, "%s : dirty rollback failed\n", __func__); + return 1; + } + + ckpt.load_tgt(ctx_dirty, 0, 0); + + if (!decode_one(ctx_dirty, last_tok, last_pos)) { + fprintf(stderr, "%s : dirty replay failed\n", __func__); + return 1; + } + + const float * logits_dirty = llama_get_logits_ith(ctx_dirty, 0); + if (logits_dirty == nullptr) { + fprintf(stderr, "%s : missing dirty logits\n", __func__); + return 1; + } + + for (int i = 0; i < n_vocab; ++i) { + if (std::fabs(logits_src[i] - logits_dirty[i]) > eps) { + fprintf(stderr, "%s : dirty-ctx logits mismatch at token %d (%g != %g)\n", + __func__, i, (double) logits_src[i], (double) logits_dirty[i]); + return 1; + } + } + + fprintf(stderr, "%s : recurrent rollback checkpoint restored successfully\n", __func__); + llama_free(ctx_src); + llama_free(ctx_dst); + llama_free(ctx_dirty); + return 0; +} diff --git a/tests/test-save-load-state.cpp b/tests/test-save-load-state.cpp new file mode 100644 index 00000000000..97ab7c6de3b --- /dev/null +++ b/tests/test-save-load-state.cpp @@ -0,0 +1,345 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama-cpp.h" + +#include +#include + +struct llama_batch_ptr { + llama_batch batch; + + llama_batch_ptr(int32_t n_tokens, int32_t embd, int32_t n_seq_max) + : batch{llama_batch_init(n_tokens, embd, n_seq_max)} {} + + ~llama_batch_ptr() { llama_batch_free(batch); } + + llama_batch_ptr(const llama_batch_ptr &) = delete; + llama_batch_ptr & operator=(const llama_batch_ptr &) = delete; + llama_batch_ptr(llama_batch_ptr &&) = default; + llama_batch_ptr & operator=(llama_batch_ptr &&) = default; + + llama_batch & get() { return batch; } + const llama_batch & get() const { return batch; } +}; + +static std::string generate_tokens(llama_context * ctx, llama_sampler * smpl, int & n_past, int32_t n_predict, llama_seq_id seq_id) { + std::string result; + llama_batch_ptr batch(1, 0, 1); + + for (int i = 0; i < n_predict; i++) { + auto next_token = llama_sampler_sample(smpl, ctx, -1); + auto next_token_str = common_token_to_piece(ctx, next_token); + + LOG("%s", next_token_str.c_str()); + result += next_token_str; + + common_batch_clear(batch.get()); + common_batch_add(batch.get(), next_token, n_past, {seq_id}, true); + + if (llama_decode(ctx, batch.get())) { + LOG_ERR("\n%s: failed to evaluate\n", __func__); + return {}; + } + n_past++; + } + + return result; +} + +// Test 1: baseline +// - tokenize the prompt +// - decode all but the last token +// - save state to disk +// - decode the last token +// - generate n_predict tokens +static std::string test_baseline(struct llama_model * model, const struct common_params & params) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + auto n_past = 0; + if (!common_prompt_batch_decode(ctx.get(), tokens, n_past, params.n_batch, params.out_file, true)) { + LOG_ERR("%s: failed to decode prompt\n", __func__); + return {}; + } + + LOG("\n=== Test 1: baseline ===\n"); + LOG("%s", params.prompt.c_str()); + + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return {}; + } + + LOG("\n"); + + return result; +} + + +// Test 2: state load +// - create a new context +// - load state from file +// - replay the last prompt token +// - generate n_predict tokens and compare against expected result +static bool test_state_load(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + LOG("\n=== Test 2: state load ===\n"); + LOG("%s", params.prompt.c_str()); + + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } + + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); + + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; + } + n_past++; + + // Generate tokens + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} + + +// Test 3: seq copy (host) +// - create a multi-seq context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the CPU path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_host(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + LOG("\n=== Test 3: seq copy (host) ===\n"); + LOG("%s", params.prompt.c_str()); + + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } + + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); + + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; + } + n_past++; + + // Migrate KV cache from seq 0 to seq 1 (CPU path) + { + std::vector seq_store(llama_state_seq_get_size(ctx.get(), 0)); + const size_t ncopy = llama_state_seq_get_data(ctx.get(), seq_store.data(), seq_store.size(), 0); + if (ncopy != seq_store.size()) { + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); + + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); + + const size_t nset = llama_state_seq_set_data(ctx.get(), seq_store.data(), seq_store.size(), 1); + if (nset != seq_store.size()) { + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); + } + + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if (result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} + + +// Test 4: seq copy (device) +// - create a multi-seq context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the on-device path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_device(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + LOG("\n=== Test 4: seq copy (device) ===\n"); + LOG("%s", params.prompt.c_str()); + + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } + + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); + + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; + } + n_past++; + + // Migrate KV cache from seq 0 to seq 1 (on-device path) + { + std::vector seq_store(llama_state_seq_get_size_ext(ctx.get(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); + const size_t ncopy = llama_state_seq_get_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + if (ncopy != seq_store.size()) { + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); + + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); + + const size_t nset = llama_state_seq_set_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + if (nset != seq_store.size()) { + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); + } + + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if (result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} + + +int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + + common_params params; + params.prompt = "The quick brown fox"; + params.out_file = "dump_state.bin"; + params.sampling.seed = 1234; + + common_init(); + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + return 1; + } + + if (params.n_parallel == 1) { + LOG_TRC("%s: n_parallel == 1, enabling unified kv cache\n", __func__); + params.kv_unified = true; + } + + if (params.n_predict < 0) { + params.n_predict = 16; + } + + ggml_backend_load_all(); + + auto llama_init = common_init_from_params(params, true); + auto * model = llama_init->model(); + + if (model == nullptr) { + LOG_ERR("%s: failed to init\n", __func__); + return 1; + } + + GGML_ASSERT(llama_init->context() == nullptr); + + // Test 1: baseline (saves state to disk) + auto result_baseline = test_baseline(model, params); + if (result_baseline.empty()) { + return 1; + } + + // Test 2: state load + if (!test_state_load(model, params, result_baseline)) { + return 1; + } + + // Test 3: seq copy (host) + if (!test_seq_cp_host(model, params, result_baseline)) { + return 1; + } + + // Test 4: seq copy (device) + if (!test_seq_cp_device(model, params, result_baseline)) { + return 1; + } + + LOG("\nAll tests passed.\n"); + + return 0; +} diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b433c91d85e..a60d3dab469 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -22,6 +22,7 @@ else() add_subdirectory(perplexity) add_subdirectory(quantize) if (LLAMA_BUILD_SERVER) + add_subdirectory(ui) add_subdirectory(cli) add_subdirectory(server) endif() diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index f9ffd2d4ce7..f6ed257f556 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -1,6 +1,23 @@ +# llama-batched-bench-impl: batched-bench logic, reusable by app + +set(TARGET llama-batched-bench-impl) + +add_library(${TARGET} batched-bench.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + +# llama-batched-bench executable + set(TARGET llama-batched-bench) -add_executable(${TARGET} batched-bench.cpp) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-batched-bench-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) diff --git a/tools/batched-bench/batched-bench.cpp b/tools/batched-bench/batched-bench.cpp index 3964ef25955..e2dcd0b2e71 100644 --- a/tools/batched-bench/batched-bench.cpp +++ b/tools/batched-bench/batched-bench.cpp @@ -15,7 +15,10 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_batched_bench(int argc, char ** argv); + +int llama_batched_bench(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); common_params params; diff --git a/tools/batched-bench/main.cpp b/tools/batched-bench/main.cpp new file mode 100644 index 00000000000..958cfc5b31c --- /dev/null +++ b/tools/batched-bench/main.cpp @@ -0,0 +1,5 @@ +int llama_batched_bench(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_batched_bench(argc, argv); +} diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index 7e01abb81b9..a3e635719b6 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -1,9 +1,24 @@ +# llama-cli-impl: CLI logic, reusable by app + +set(TARGET llama-cli-impl) + +add_library(${TARGET} cli.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server) +target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + +# llama-cli executable + set(TARGET llama-cli) -add_executable(${TARGET} cli.cpp) -target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) -include_directories(../server) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-cli-impl) +target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) diff --git a/tools/cli/README.md b/tools/cli/README.md index 9f0574d25d3..b11aa45ce95 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -12,7 +12,6 @@ | -------- | ----------- | | `-h, --help, --usage` | print usage and exit | | `--version` | show version and build info | -| `--license` | show source code license and dependencies | | `-cl, --cache-list` | show list of models in cache | | `--completion-bash` | print source-able bash completion script for llama.cpp | | `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | @@ -55,7 +54,6 @@ | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | -| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)
(env: LLAMA_ARG_RPC) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | @@ -89,13 +87,13 @@ | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) | | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | | `--log-disable` | Log disable | -| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) | -| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | +| `--log-file FNAME` | Log to file
(env: LLAMA_ARG_LOG_FILE) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_ARG_LOG_COLORS) | | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | -| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | -| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)

(env: LLAMA_LOG_VERBOSITY) | -| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | -| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | +| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_ARG_OFFLINE) | +| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: trace (more info)
- 5: debug
(default: 3)

(env: LLAMA_ARG_LOG_VERBOSITY) | +| `--log-prefix, --no-log-prefix` | Enable prefix in log messages
(env: LLAMA_ARG_LOG_PREFIX) | +| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages
(env: LLAMA_ARG_LOG_TIMESTAMPS) | | `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) | | `--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) | @@ -148,7 +146,6 @@ | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | -| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | @@ -167,14 +164,14 @@ | `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | -| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | +| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) | @@ -192,14 +189,15 @@ | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)
(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | -| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | +| `--spec-type none,draft-simple,draft-eagle3,draft-mtp,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) | | `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) | | `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) | diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 369c24216b7..af40adbb4ce 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -342,7 +342,10 @@ static std::vector> auto_completion_callback(std: static constexpr size_t FILE_GLOB_MAX_RESULTS = 100; -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_cli(int argc, char ** argv); + +int llama_cli(int argc, char ** argv) { common_params params; params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs diff --git a/tools/cli/main.cpp b/tools/cli/main.cpp new file mode 100644 index 00000000000..cb7d795b666 --- /dev/null +++ b/tools/cli/main.cpp @@ -0,0 +1,5 @@ +int llama_cli(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_cli(argc, argv); +} diff --git a/tools/completion/CMakeLists.txt b/tools/completion/CMakeLists.txt index 2c7df80652c..a310251eff6 100644 --- a/tools/completion/CMakeLists.txt +++ b/tools/completion/CMakeLists.txt @@ -1,6 +1,23 @@ +# llama-completion-impl: completion logic, reusable by app + +set(TARGET llama-completion-impl) + +add_library(${TARGET} completion.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + +# llama-completion executable + set(TARGET llama-completion) -add_executable(${TARGET} completion.cpp) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-completion-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) diff --git a/tools/completion/README.md b/tools/completion/README.md index 048cf7416fc..d90f8174866 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -95,7 +95,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | -------- | ----------- | | `-h, --help, --usage` | print usage and exit | | `--version` | show version and build info | -| `--license` | show source code license and dependencies | | `-cl, --cache-list` | show list of models in cache | | `--completion-bash` | print source-able bash completion script for llama.cpp | | `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | @@ -138,7 +137,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | -| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)
(env: LLAMA_ARG_RPC) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | @@ -172,13 +170,13 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) | | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | | `--log-disable` | Log disable | -| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) | -| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | +| `--log-file FNAME` | Log to file
(env: LLAMA_ARG_LOG_FILE) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_ARG_LOG_COLORS) | | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | -| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | -| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)

(env: LLAMA_LOG_VERBOSITY) | -| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | -| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | +| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_ARG_OFFLINE) | +| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: trace (more info)
- 5: debug
(default: 3)

(env: LLAMA_ARG_LOG_VERBOSITY) | +| `--log-prefix, --no-log-prefix` | Enable prefix in log messages
(env: LLAMA_ARG_LOG_PREFIX) | +| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages
(env: LLAMA_ARG_LOG_TIMESTAMPS) | | `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) | | `--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) | @@ -255,8 +253,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 1dc5df1afa2..dffcadd4131 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -84,7 +84,10 @@ static void sigint_handler(int signo) { } #endif -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_completion(int argc, char ** argv); + +int llama_completion(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); common_params params; diff --git a/tools/completion/main.cpp b/tools/completion/main.cpp new file mode 100644 index 00000000000..bea9a0ec9aa --- /dev/null +++ b/tools/completion/main.cpp @@ -0,0 +1,5 @@ +int llama_completion(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_completion(argc, argv); +} diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt index 25c40966333..8acdaef3712 100644 --- a/tools/fit-params/CMakeLists.txt +++ b/tools/fit-params/CMakeLists.txt @@ -1,6 +1,23 @@ +# llama-fit-params-impl: fit-params logic, reusable by app + +set(TARGET llama-fit-params-impl) + +add_library(${TARGET} fit-params.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + +# llama-fit-params executable + set(TARGET llama-fit-params) -add_executable(${TARGET} fit-params.cpp) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-fit-params-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp index bcdf4404016..5d897bc4669 100644 --- a/tools/fit-params/fit-params.cpp +++ b/tools/fit-params/fit-params.cpp @@ -12,7 +12,10 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_fit_params(int argc, char ** argv); + +int llama_fit_params(int argc, char ** argv) { common_params params; common_init(); @@ -30,7 +33,7 @@ int main(int argc, char ** argv) { if (!params.fit_params_print) { const common_params_fit_status status = common_fit_params(params.model.path.c_str(), &mparams, &cparams, params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, - params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); if (status != COMMON_PARAMS_FIT_STATUS_SUCCESS) { LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__); exit(1); diff --git a/tools/fit-params/main.cpp b/tools/fit-params/main.cpp new file mode 100644 index 00000000000..b7271d4756a --- /dev/null +++ b/tools/fit-params/main.cpp @@ -0,0 +1,5 @@ +int llama_fit_params(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_fit_params(argc, argv); +} diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index 93d6a3aa2e7..b1c35ee88a5 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -1,6 +1,23 @@ +# llama-bench-impl: benchmark logic, reusable by app + +set(TARGET llama-bench-impl) + +add_library(${TARGET} llama-bench.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + +# llama-bench executable + set(TARGET llama-bench) -add_executable(${TARGET} llama-bench.cpp) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-bench-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index 70355920b89..d53978548a1 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -26,17 +26,28 @@ options: -h, --help --numa numa mode (default: disabled) -r, --repetitions number of times to repeat each test (default: 5) - --prio <0|1|2|3> process/thread priority (default: 0) + --prio <-1|0|1|2|3> process/thread priority (default: 0) --delay <0...N> (seconds) delay between each test (default: 0) -o, --output output format printed to stdout (default: md) -oe, --output-err output format printed to stderr (default: none) --list-devices list available devices and exit -v, --verbose verbose output --progress print test progress indicators + --no-warmup skip warmup runs before benchmarking + -fitt, --fit-target fit model to device memory with this margin per device in MiB (default: off) + -fitc, --fit-ctx minimum ctx size for --fit-target (default: 4096) -rpc, --rpc register RPC devices (comma separated) test parameters: -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive + default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist. + example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M + (default: unused) + -hff, --hf-file Hugging Face model file. If specified, it will override the quant in --hf-repo + (default: unused) + -hft, --hf-token Hugging Face access token + (default: value from HF_TOKEN environment variable) -p, --n-prompt (default: 512) -n, --n-gen (default: 128) -pg (default: ) @@ -49,21 +60,21 @@ test parameters: -C, --cpu-mask (default: 0x0) --cpu-strict <0|1> (default: 0) --poll <0...100> (default: 50) - -ngl, --n-gpu-layers (default: 99) + -ngl, --n-gpu-layers (default: -1) -ncmoe, --n-cpu-moe (default: 0) - -sm, --split-mode (default: layer) + -sm, --split-mode (default: layer) -mg, --main-gpu (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0) - -fa, --flash-attn <0|1> (default: 0) + -fa, --flash-attn (default: auto) -dev, --device (default: auto) -mmp, --mmap <0|1> (default: 1) + -dio, --direct-io <0|1> (default: 0) -embd, --embeddings <0|1> (default: 0) -ts, --tensor-split (default: 0) - -ot --override-tensors =;... + -ot --override-tensor =;... (default: disabled) -nopo, --no-op-offload <0|1> (default: 0) - -fitt, --fit-target fit model to device memory with this margin per device in MiB (default: off) - -fitc, --fit-ctx minimum ctx size for --fit-target (default: 4096) + --no-host <0|1> (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Ranges can be given as @@ -97,12 +108,12 @@ $ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0. | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ยฑ 0.55 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ยฑ 0.54 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ยฑ 0.25 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ยฑ 0.31 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ยฑ 0.23 | -| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ยฑ 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 132.19 ยฑ 0.55 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 256 | 129.37 ยฑ 0.54 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 512 | 123.83 ยฑ 0.25 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 128 | 82.17 ยฑ 0.31 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 256 | 80.74 ยฑ 0.23 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | -1 | tg 512 | 78.08 ยฑ 0.07 | ### Prompt processing with different batch sizes @@ -112,10 +123,10 @@ $ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 | model | size | params | backend | ngl | n_batch | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ยฑ 3.66 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ยฑ 23.48 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ยฑ 15.59 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ยฑ 13.58 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 128 | pp 1024 | 1436.51 ยฑ 3.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 256 | pp 1024 | 1932.43 ยฑ 23.48 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 512 | pp 1024 | 2254.45 ยฑ 15.59 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | 1024 | pp 1024 | 2498.61 ยฑ 13.58 | ### Different numbers of threads @@ -171,10 +182,10 @@ $ ./llama-bench -d 0,512 | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ยฑ 23.45 | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ยฑ 0.59 | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ยฑ 18.88 | -| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ยฑ 0.60 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 | 7340.20 ยฑ 23.45 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 | 120.60 ยฑ 0.59 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | pp512 @ d512 | 6425.91 ยฑ 18.88 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | -1 | tg128 @ d512 | 116.71 ยฑ 0.60 | ## Output formats @@ -188,8 +199,8 @@ $ ./llama-bench -o md | model | size | params | backend | ngl | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ยฑ 93.24 | -| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ยฑ 0.59 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | pp 512 | 2368.80 ยฑ 93.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | -1 | tg 128 | 131.42 ยฑ 0.59 | ### CSV @@ -198,9 +209,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" -"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" +build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,n_cpu_moe,split_mode,main_gpu,no_kv_offload,flash_attn,devices,tensor_split,tensor_buft_overrides,use_mmap,use_direct_io,embeddings,no_op_offload,no_host,fit_target,fit_min_ctx,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","-1","0","layer","0","0","-1","auto","0.00","none","1","0","0","0","0","0","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" ``` ### JSON @@ -229,14 +240,22 @@ $ ./llama-bench -o json "poll": 50, "type_k": "f16", "type_v": "f16", - "n_gpu_layers": 99, + "n_gpu_layers": -1, + "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, - "flash_attn": false, + "flash_attn": -1, + "devices": "auto", "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, + "use_direct_io": false, "embeddings": false, + "no_op_offload": 0, + "no_host": false, + "fit_target": 0, + "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, @@ -266,14 +285,22 @@ $ ./llama-bench -o json "poll": 50, "type_k": "f16", "type_v": "f16", - "n_gpu_layers": 99, + "n_gpu_layers": -1, + "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, - "flash_attn": false, + "flash_attn": -1, + "devices": "auto", "tensor_split": "0.00", + "tensor_buft_overrides": "none", "use_mmap": true, + "use_direct_io": false, "embeddings": false, + "no_op_offload": 0, + "no_host": false, + "fit_target": 0, + "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, @@ -296,8 +323,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} -{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": -1, "n_cpu_moe": 0, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": -1, "devices": "auto", "tensor_split": "0.00", "tensor_buft_overrides": "none", "use_mmap": true, "use_direct_io": false, "embeddings": false, "no_op_offload": 0, "no_host": false, "fit_target": 0, "fit_min_ctx": 0, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} ``` @@ -310,7 +337,7 @@ $ ./llama-bench -o sql ``` ```sql -CREATE TABLE IF NOT EXISTS test ( +CREATE TABLE IF NOT EXISTS llama_bench ( build_commit TEXT, build_number INTEGER, cpu_info TEXT, @@ -329,13 +356,21 @@ CREATE TABLE IF NOT EXISTS test ( type_k TEXT, type_v TEXT, n_gpu_layers INTEGER, + n_cpu_moe INTEGER, split_mode TEXT, main_gpu INTEGER, no_kv_offload INTEGER, flash_attn INTEGER, + devices TEXT, tensor_split TEXT, + tensor_buft_overrides TEXT, use_mmap INTEGER, + use_direct_io INTEGER, embeddings INTEGER, + no_op_offload INTEGER, + no_host INTEGER, + fit_target INTEGER, + fit_min_ctx INTEGER, n_prompt INTEGER, n_gen INTEGER, n_depth INTEGER, @@ -346,6 +381,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); -INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); +INSERT INTO llama_bench (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, n_cpu_moe, split_mode, main_gpu, no_kv_offload, flash_attn, devices, tensor_split, tensor_buft_overrides, use_mmap, use_direct_io, embeddings, no_op_offload, no_host, fit_target, fit_min_ctx, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '-1', '0', 'layer', '0', '0', '-1', 'auto', '0.00', 'none', '1', '0', '0', '0', '0', '0', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); ``` diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 07198fb164c..a85f86c3ab2 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -19,6 +19,7 @@ #include #include +#include "arg.h" #include "build-info.h" #include "common.h" #include "download.h" @@ -275,9 +276,11 @@ static std::string pair_str(const std::pair & p) { return buf; } -static std::vector parse_int_range(const std::string & s) { +static std::vector parse_int_range(const std::string & s, bool allow_negative = false) { // first[-last[(+|*)step]] - std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"); + std::regex range_regex(allow_negative + ? R"(^(-?\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))" + : R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"); std::smatch match; std::string::const_iterator search_start(s.cbegin()); @@ -337,7 +340,7 @@ struct cmd_params { std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; - std::vector flash_attn; + std::vector flash_attn; std::vector> devices; std::vector> tensor_split; std::vector> tensor_buft_overrides; @@ -376,12 +379,12 @@ static const cmd_params cmd_params_defaults = { /* cpu_mask */ { "0x0" }, /* cpu_strict */ { false }, /* poll */ { 50 }, - /* n_gpu_layers */ { 99 }, + /* n_gpu_layers */ { -1 }, /* n_cpu_moe */ { 0 }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, /* main_gpu */ { 0 }, /* no_kv_offload */ { false }, - /* flash_attn */ { false }, + /* flash_attn */ { LLAMA_FLASH_ATTN_TYPE_AUTO }, /* devices */ { {} }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, @@ -451,7 +454,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -fa, --flash-attn (default: %s)\n", join(transform_to_str(cmd_params_defaults.flash_attn, llama_flash_attn_type_name), ",").c_str()); printf(" -dev, --device (default: auto)\n"); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str()); @@ -710,7 +713,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = parse_int_range(argv[i]); + auto p = parse_int_range(argv[i], /*allow_negative=*/true); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { if (++i >= argc) { @@ -793,8 +796,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); - params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & v : p) { + llama_flash_attn_type type; + if (common_arg_utils::is_truthy(v)) { + type = LLAMA_FLASH_ATTN_TYPE_ENABLED; + } else if (common_arg_utils::is_falsey(v)) { + type = LLAMA_FLASH_ATTN_TYPE_DISABLED; + } else if (common_arg_utils::is_autoy(v)) { + type = LLAMA_FLASH_ATTN_TYPE_AUTO; + } else { + invalid_param = true; + break; + } + types.push_back(type); + } + if (invalid_param) { + break; + } + params.flash_attn.insert(params.flash_attn.end(), types.begin(), types.end()); } else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) { invalid_param = true; @@ -1138,7 +1160,7 @@ struct cmd_params_instance { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool flash_attn; + llama_flash_attn_type flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; @@ -1222,7 +1244,7 @@ struct cmd_params_instance { cparams.type_k = type_k; cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; + cparams.flash_attn_type = flash_attn; cparams.embeddings = embeddings; cparams.op_offload = !no_op_offload; cparams.swa_full = false; @@ -1400,7 +1422,7 @@ struct test { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool flash_attn; + llama_flash_attn_type flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; @@ -1522,10 +1544,10 @@ struct test { field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" || - field == "fit_target" || field == "fit_min_ctx") { + field == "fit_target" || field == "fit_min_ctx" || field == "flash_attn") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") { return BOOL; } @@ -1594,7 +1616,7 @@ struct test { split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), - std::to_string(flash_attn), + std::to_string((int) flash_attn), devices_to_string(devices), tensor_split_str, tensor_buft_overrides_str, @@ -1779,7 +1801,7 @@ struct markdown_printer : public printer { return 6; } if (field == "flash_attn") { - return 2; + return 3; } if (field == "devices") { return -12; @@ -2136,7 +2158,10 @@ static std::unique_ptr create_printer(output_formats format) { GGML_ABORT("fatal error"); } -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_bench(int argc, char ** argv); + +int llama_bench(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); // try to set locale for unicode characters in markdown std::setlocale(LC_CTYPE, ".UTF-8"); diff --git a/tools/llama-bench/main.cpp b/tools/llama-bench/main.cpp new file mode 100644 index 00000000000..0c18bb0c9d4 --- /dev/null +++ b/tools/llama-bench/main.cpp @@ -0,0 +1,5 @@ +int llama_bench(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_bench(argc, argv); +} diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 62bd7988efb..542a18b5cbc 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -18,11 +18,12 @@ add_library(mtmd models/cogvlm.cpp models/conformer.cpp models/dotsocr.cpp + models/exaone4_5.cpp models/gemma4a.cpp models/gemma4v.cpp models/glm4v.cpp models/granite-speech.cpp - models/hunyuanocr.cpp + models/hunyuanvl.cpp models/internvl.cpp models/kimivl.cpp models/kimik25.cpp @@ -40,6 +41,7 @@ add_library(mtmd models/siglip.cpp models/whisper-enc.cpp models/deepseekocr.cpp + models/deepseekocr2.cpp models/mobilenetv5.cpp models/youtuvl.cpp models/yasa2.cpp diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 39f069501bc..1d9f6a136a9 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -11,6 +11,10 @@ #define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS) +struct build_vit_opts { + ggml_tensor * attn_mask = nullptr; +}; + struct clip_graph { const clip_model & model; const clip_hparams & hparams; @@ -25,6 +29,7 @@ struct clip_graph { const int n_patches; const int n_embd; const int n_head; + const int n_head_kv; const int d_head; const int n_layer; const int n_mmproj_embd; @@ -63,7 +68,8 @@ struct clip_graph { norm_type norm_t, ffn_op_type ffn_t, ggml_tensor * learned_pos_embd, - std::function add_pos); + std::function add_pos, + const build_vit_opts & opts = {}); // build the input after conv2d (inp_raw --> patches) // returns tensor with shape [n_embd, n_patches] diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index c359851999f..b7fdcc42f1e 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -170,7 +170,7 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" -// hunyuanocr / hunyuanvl (shared GGUF tensor names) +// hunyuanvl (shared GGUF tensor names) #define TN_MM_PRE_NORM "mm.pre_norm.%s" #define TN_TOK_IMG_BEGIN "mm.image_begin" #define TN_TOK_IMG_END "mm.image_end" @@ -188,6 +188,8 @@ #define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s" #define TN_SAM_NECK "v.sam.neck.%d.%s" #define TN_SAM_NET "v.sam.net_%d.%s" +// deepseek-ocr-2 +#define TN_RESMPL_QUERY "v.resample_query_%d.%s" // (conformer) lfm2 #define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" #define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" @@ -337,14 +339,15 @@ enum projector_type { PROJECTOR_TYPE_JANUS_PRO, PROJECTOR_TYPE_DOTS_OCR, PROJECTOR_TYPE_DEEPSEEKOCR, + PROJECTOR_TYPE_DEEPSEEKOCR2, PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_YOUTUVL, PROJECTOR_TYPE_YASA2, PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, - PROJECTOR_TYPE_HUNYUANOCR, PROJECTOR_TYPE_HUNYUANVL, + PROJECTOR_TYPE_EXAONE4_5, PROJECTOR_TYPE_MINICPMV4_6, PROJECTOR_TYPE_GRANITE_SPEECH, PROJECTOR_TYPE_MIMOVL, @@ -387,13 +390,14 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, { PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"}, { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"}, + { PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"}, { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, { PROJECTOR_TYPE_YASA2, "yasa2"}, { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, - { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, + { PROJECTOR_TYPE_EXAONE4_5, "exaone4_5"}, { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, @@ -426,6 +430,9 @@ struct clip_image_f32 { int ny; std::vector buf; + + // marks the global view in e.g., DeepSeek-OCR Models + bool add_viewsep = false; }; // diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index ce15dbcd11e..1f3657a8507 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -35,6 +35,16 @@ enum resize_algo { // RESIZE_ALGO_LANCZOS, // TODO }; +// Padding style for img_tool::resize +// PAD_NONE - no padding; direct resize to target dimensions +// PAD_CEIL - aspect-preserving pad (default) +// PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity) +enum pad_style { + PAD_NONE, + PAD_CEIL, + PAD_NEAREST, +}; + struct clip_hparams { int32_t image_size = 0; int32_t patch_size = 0; @@ -52,7 +62,7 @@ struct clip_hparams { int32_t image_min_pixels = -1; int32_t image_max_pixels = -1; resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC; - bool image_resize_pad = true; // if false, center-crop will be applied when resizing + pad_style image_resize_pad = PAD_CEIL; // padding style when resizing std::array image_pad_color = {0, 0, 0}; // (preprocessor) for llava-uhd style models @@ -61,8 +71,8 @@ struct clip_hparams { int32_t preproc_max_tiles = 0; resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC; resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR; - bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6) - bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6) + pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6) + pad_style image_pad_ov = PAD_NONE; // padding style for the overview image (e.g. llava-1.6) std::array image_pad_color_rf = {0, 0, 0}; // padding color for refined image std::array image_pad_color_ov = {0, 0, 0}; // padding color for overview image @@ -510,7 +520,7 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; - // hunyuanocr perceiver + // hunyuanvl perceiver ggml_tensor * mm_pre_norm_w = nullptr; ggml_tensor * mm_img_begin = nullptr; ggml_tensor * mm_img_end = nullptr; @@ -532,6 +542,11 @@ struct clip_model { int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder std::vector sam_layers; + + // deepseek-ocr-2 + ggml_tensor * resample_query_768 = nullptr; + ggml_tensor * resample_query_1024 = nullptr; + // lfm2 audio std::array pre_encode_conv_X_w = {nullptr}; std::array pre_encode_conv_X_b = {nullptr}; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 02f0982797e..3eeda41155d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -162,8 +162,14 @@ struct clip_ctx { bool debug_output_embeddings = false; + // for measuring memory usage + bool no_alloc = false; + std::map mem_usage; + std::map mem_compute; + clip_ctx(clip_context_params & ctx_params) { flash_attn_type = ctx_params.flash_attn_type; + no_alloc = ctx_params.no_alloc; backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (!backend_cpu) { throw std::runtime_error("failed to initialize CPU backend"); @@ -240,6 +246,7 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : n_patches(n_patches_x * n_patches_y), n_embd(hparams.n_embd), n_head(hparams.n_head), + n_head_kv(hparams.n_head_kv), d_head(n_embd / n_head), n_layer(hparams.n_layer), n_mmproj_embd(clip_n_mmproj_embd(ctx)), @@ -300,7 +307,8 @@ ggml_tensor * clip_graph::build_vit( norm_type norm_t, ffn_op_type ffn_t, ggml_tensor * learned_pos_embd, - std::function add_pos + std::function add_pos, + const build_vit_opts & opts ) { if (learned_pos_embd) { inp = ggml_add(ctx0, inp, learned_pos_embd); @@ -394,9 +402,9 @@ ggml_tensor * clip_graph::build_vit( } } - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head_kv, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head_kv, n_pos); if (norm_per_head) { if (layer.q_norm) { @@ -427,7 +435,7 @@ ggml_tensor * clip_graph::build_vit( } cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, opts.attn_mask, kq_scale, il); cb(cur, "attn_out", il); } @@ -663,6 +671,9 @@ ggml_tensor * clip_graph::build_attn( k = ggml_cast(ctx0, k, GGML_TYPE_F16); v = ggml_cast(ctx0, v, GGML_TYPE_F16); + if (kq_mask) { + kq_mask = ggml_cast(ctx0, kq_mask, GGML_TYPE_F16); + } cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); @@ -873,6 +884,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_EXAONE4_5: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_MIMOVL: { builder = std::make_unique(ctx, img); @@ -926,10 +941,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { - builder = std::make_unique(ctx, img); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: @@ -943,6 +957,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR2: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_LFM2A: { builder = std::make_unique(ctx, img); @@ -1111,6 +1129,9 @@ struct clip_model_loader { get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim); get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps); + // n_head_kv is optional (for GQA), default to n_head + hparams.n_head_kv = hparams.n_head; + if (is_vision) { get_u32(KEY_IMAGE_SIZE, hparams.image_size); get_u32(KEY_PATCH_SIZE, hparams.patch_size); @@ -1223,12 +1244,12 @@ struct clip_model_loader { hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM; hparams.image_pad_color = {122, 116, 104}; if (!hparams.image_res_candidates.empty()) { - hparams.image_resize_pad = true; + hparams.image_resize_pad = PAD_CEIL; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; } else { // llava-1.6 default params - hparams.image_pad_ov = false; - hparams.image_pad_rf = true; + hparams.image_pad_ov = PAD_NONE; + hparams.image_pad_rf = PAD_CEIL; hparams.image_pad_color_rf = {122, 116, 104}; hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC; hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR; @@ -1236,7 +1257,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_GLM_EDGE: { - hparams.image_resize_pad = true; + hparams.image_resize_pad = PAD_CEIL; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; } break; case PROJECTOR_TYPE_MINICPMV: @@ -1431,7 +1452,7 @@ struct clip_model_loader { { hparams.n_merge = 2; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; - hparams.image_resize_pad = false; + hparams.image_resize_pad = PAD_NONE; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); std::vector wa_layer_indexes_vec; @@ -1451,7 +1472,7 @@ struct clip_model_loader { // reka model performs better when using resize_bicubic, which stretches // the image to fit fixed square size - hparams.image_resize_pad = false; + hparams.image_resize_pad = PAD_NONE; } break; case PROJECTOR_TYPE_GLM4V: { @@ -1501,36 +1522,33 @@ struct clip_model_loader { hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup } break; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: { hparams.patch_size = 16; hparams.image_size = 1024; hparams.warmup_image_size = 1024; hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; - hparams.image_pad_color[0] = hparams.image_mean[0]; - hparams.image_pad_color[1] = hparams.image_mean[1]; - hparams.image_pad_color[2] = hparams.image_mean[2]; + hparams.image_pad_color = {127, 127, 127}; get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true); get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + // qwen2 encoder is GQA, requires KEY_N_HEAD_KV + get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv); + } } break; - case PROJECTOR_TYPE_HUNYUANOCR: - { - hparams.n_merge = 2; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); - get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); - get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); - hparams.set_warmup_n_tokens(28*28); - } break; case PROJECTOR_TYPE_HUNYUANVL: { hparams.n_merge = 2; hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; - hparams.image_resize_pad = false; + hparams.image_resize_pad = PAD_NONE; hparams.ffn_op = FFN_GELU; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); hparams.set_limit_image_tokens(256, 16384); + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels, false); + get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels, false); hparams.set_warmup_n_tokens(32*32); } break; case PROJECTOR_TYPE_LFM2A: @@ -1542,6 +1560,19 @@ struct clip_model_loader { hparams.audio_window_len = 400; hparams.audio_hop_len = 160; } break; + case PROJECTOR_TYPE_EXAONE4_5: + { + hparams.n_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, false); + get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); + get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); + hparams.set_warmup_n_tokens(46 * 46); + if (hparams.rope_theta <= 0.0f) { + hparams.rope_theta = 10000.0f; + } + get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv); + } break; case PROJECTOR_TYPE_GEMMA4A: { // Gemma4 feature_extraction_gemma4.py: @@ -1551,6 +1582,9 @@ struct clip_model_loader { hparams.audio_n_fft = 512; hparams.audio_window_len = 320; // 20ms frame (NOT 25ms/400) hparams.audio_hop_len = 160; + // due to a mistake in the original conversion code, rms_norm_eps is set to a wrong value + // since all gemma4a models use 1e-6, we just hardcode it here to avoid re-conversion + hparams.eps = 1e-6f; } break; case PROJECTOR_TYPE_GRANITE_SPEECH: { @@ -1684,6 +1718,8 @@ struct clip_model_loader { ggml_set_name(data_tensor, cur->name); loaded_tensor_names.insert(name); cur = data_tensor; + // add to weight memory counter + ctx_clip.mem_usage[ggml_backend_get_device(ctx_clip.backend)] += ggml_nbytes(cur); } return cur; }; @@ -1780,6 +1816,7 @@ struct clip_model_loader { || model.proj_type == PROJECTOR_TYPE_LDPV2 || model.proj_type == PROJECTOR_TYPE_QWEN2VL || model.proj_type == PROJECTOR_TYPE_QWEN25VL + || model.proj_type == PROJECTOR_TYPE_EXAONE4_5 || model.proj_type == PROJECTOR_TYPE_GLM_EDGE || model.proj_type == PROJECTOR_TYPE_GEMMA3 || model.proj_type == PROJECTOR_TYPE_IDEFICS3 @@ -1929,6 +1966,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_EXAONE4_5: { model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); @@ -2333,7 +2371,6 @@ struct clip_model_loader { model.mm_boi = get_tensor(TN_TOK_BOI); model.mm_eoi = get_tensor(TN_TOK_EOI); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear) @@ -2365,6 +2402,7 @@ struct clip_model_loader { model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: { model.pos_embed = get_tensor(string_format(TN_SAM_POS_EMBD, "weight")); model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight")); @@ -2395,10 +2433,12 @@ struct clip_model_loader { model.neck_3_w = get_tensor(string_format(TN_SAM_NECK, 3, "weight")); model.net_2 = get_tensor(string_format(TN_SAM_NET, 2, "weight")); model.net_3 = get_tensor(string_format(TN_SAM_NET, 3, "weight")); - model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR); model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias")); + model.resample_query_768 = get_tensor(string_format(TN_RESMPL_QUERY, 768, "weight"), false); + model.resample_query_1024 = get_tensor(string_format(TN_RESMPL_QUERY, 1024, "weight"), false); } break; case PROJECTOR_TYPE_GEMMA4A: { @@ -2598,7 +2638,7 @@ struct clip_model_loader { } // load data - { + if (!ctx_clip.no_alloc) { std::vector read_buf; // alloc memory and offload data @@ -2672,7 +2712,7 @@ struct clip_model_loader { if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) { // try to enable flash attention to see if it's supported ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED; - info = alloc_compute_meta(ctx_clip, batch); + info = reserve_compute_meta(ctx_clip, batch); if (!info.fattn && info.fattn_op) { auto op = info.fattn_op; LOG_WRN("%s: *****************************************************************\n", __func__); @@ -2691,10 +2731,10 @@ struct clip_model_loader { LOG_WRN("%s: please report this on github as an issue\n", __func__); LOG_WRN("%s: *****************************************************************\n", __func__); ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED; - alloc_compute_meta(ctx_clip, batch); + reserve_compute_meta(ctx_clip, batch); } } else { - info = alloc_compute_meta(ctx_clip, batch); + info = reserve_compute_meta(ctx_clip, batch); if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__); } @@ -2733,12 +2773,14 @@ struct clip_model_loader { } } - static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { + // only initialize backend buffers, but do not allocate them yet + static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); + ctx_clip.mem_compute.clear(); for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { ggml_backend_t backend = ctx_clip.backend_ptrs[i]; ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i]; @@ -2748,6 +2790,7 @@ struct clip_model_loader { ggml_backend_buft_name(buft), size / 1024.0 / 1024.0); } + ctx_clip.mem_compute[ggml_backend_get_device(backend)] += size; } const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get()); @@ -3055,10 +3098,10 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_EXAONE4_5: case PROJECTOR_TYPE_MIMOVL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; @@ -3077,6 +3120,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_EXAONE4_5: case PROJECTOR_TYPE_MIMOVL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: @@ -3156,6 +3200,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_EXAONE4_5: case PROJECTOR_TYPE_MIMOVL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_YOUTUVL: @@ -3244,12 +3289,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_QWEN3A: { - // 3x stride-2 conv2d: each step is floor((n-1)/2)+1 - int n = img->nx; - n = (n - 1) / 2 + 1; - n = (n - 1) / 2 + 1; - n = (n - 1) / 2 + 1; - n_patches = n; + // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk + const int chunk_size = 100; + const int tokens_per_chunk = 13; + n_patches = (img->nx / chunk_size) * tokens_per_chunk; } break; case PROJECTOR_TYPE_GLMA: { @@ -3268,7 +3311,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_DEEPSEEKOCR: { // SAM encoder applies two stride-2 convolutions (net_2 and net_3) - // which reduces spatial dimensions by 4x in each direction (16x total) + // that reduce spatial dimensions by 4x in each direction (16x total) // E.g., 64x64 -> 16x16 patches n_patches /= 16; @@ -3277,7 +3320,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int h = static_cast(std::sqrt(static_cast(n_patches))); n_patches = h * (h + 1) + 1; } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; @@ -3285,6 +3327,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int oh = (img->ny / patch_size) / merge; n_patches = (ow + 1) * oh + 2; } break; + case PROJECTOR_TYPE_DEEPSEEKOCR2: + { + // 1024 global view -> 256 query tokens + 1 view separator = 257; + // 768 local tile -> 144 query tokens, no separator. + n_patches /= 16; + if (img->add_viewsep) { + n_patches += 1; // view separator, appended only after the global view + } + } break; case PROJECTOR_TYPE_LFM2A: { n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; @@ -3646,11 +3697,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("positions", positions); } break; case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_EXAONE4_5: case PROJECTOR_TYPE_YOUTUVL: { // pw * ph = number of tokens output by ViT after apply patch merger // ipw * ipw = number of vision token been processed inside ViT - const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty(); + const bool use_window_attn = + (ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL || ctx->model.proj_type == PROJECTOR_TYPE_EXAONE4_5) + ? hparams.n_wa_pattern > 0 + : !hparams.wa_layer_indexes.empty(); const int merge_ratio = 2; const int pw = image_size_width / patch_size / merge_ratio; const int ph = image_size_height / patch_size / merge_ratio; @@ -3874,6 +3929,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("pos_y", pos_y); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: { GGML_ASSERT(pos_w == pos_h); @@ -3896,6 +3952,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("rel_pos_indices_local", rel_pos_indices_local); set_input_i32("rel_pos_indices_global", rel_pos_indices_global); + + if (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) { + + // qwen2 encoder attention mask + + // num_image_tokens = num_patches / 16 + // 256 for 1024 global view + // 144 for 768 tile views + const int num_image_tokens = num_patches / 16; + const int seq_len = num_image_tokens * 2; + std::vector qwen2_mask(static_cast(seq_len) * seq_len, 0.0f); + + // attention mask layout + // +--------------+---------------+ + // | all 0 | all -inf | + // +--------------+---------------+ + // | all 0 | lower tri 0 | + // +--------------+---------------+ + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) { + const bool zero = i < num_image_tokens ? + j < num_image_tokens : + j < num_image_tokens || j <= i; + qwen2_mask[static_cast(i) * seq_len + j] = zero ? 0.0f : -1e9f; + } + } + set_input_f32("qwen2_attn_mask", qwen2_mask); + } } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3NV: @@ -3913,7 +3997,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_YASA2: { // do nothing @@ -3923,7 +4006,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // Compute the HunyuanVL 2D position embedding on CPU (with the // custom sf=(target+0.1)/n_grid bilinear sampling that the // reference implementation uses) and upload it to the graph - // input declared in clip_graph_hunyuanocr::build(). + // input declared in clip_graph_hunyuanvl::build(). GGML_ASSERT(model.position_embeddings != nullptr); ggml_tensor * src_t = model.position_embeddings; const int64_t n_embd = src_t->ne[0]; @@ -4205,6 +4288,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_model_mlp_3_w->ne[1]; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_EXAONE4_5: case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_YOUTUVL: return ctx->model.mm_1_b->ne[0]; @@ -4244,12 +4328,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_KIMIK25: case PROJECTOR_TYPE_YASA2: return ctx->model.mm_2_w->ne[1]; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; @@ -4264,22 +4348,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } } -int clip_is_minicpmv(const struct clip_ctx * ctx) { - // TODO: remove this function - if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { - return ctx->model.hparams.minicpmv_version; - } - if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV4_6) { - return 46; - } - return 0; -} - -bool clip_is_glm(const struct clip_ctx * ctx) { - // TODO: remove this function - return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; -} - bool clip_is_llava(const struct clip_ctx * ctx) { return ctx->model.hparams.has_llava_projector; } @@ -4292,21 +4360,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } -bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { - switch (ctx->proj_type()) { - case PROJECTOR_TYPE_ULTRAVOX: - case PROJECTOR_TYPE_QWEN2A: - case PROJECTOR_TYPE_QWEN3A: - case PROJECTOR_TYPE_GLMA: - case PROJECTOR_TYPE_VOXTRAL: - case PROJECTOR_TYPE_MERALION: - case PROJECTOR_TYPE_MUSIC_FLAMINGO: - return true; - default: - return false; - } -} - bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; clip_img.buf.resize(h * w * 3); @@ -4343,6 +4396,14 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { return &ctx->model.hparams; } +std::map clip_get_mem_usage(const struct clip_ctx * ctx) { + std::map result = ctx->mem_usage; + for (auto & [dev, size] : ctx->mem_compute) { + result[dev] += size; + } + return result; +} + // // API for debugging // diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 5f9dc93c6b0..9b807ffa77b 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -6,6 +6,8 @@ #include #include +#include + // !!! Internal header, to be used by mtmd only !!! #define MTMD_INTERNAL_HEADER @@ -40,6 +42,7 @@ struct clip_context_params { bool warmup; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + bool no_alloc; }; struct clip_init_result { @@ -102,8 +105,6 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); -int clip_is_minicpmv(const struct clip_ctx * ctx); -bool clip_is_glm(const struct clip_ctx * ctx); bool clip_is_llava(const struct clip_ctx * ctx); // note for contributor: this clip_is_(model) pattern is deprecated // do NOT add new functions like this @@ -115,7 +116,8 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); -bool clip_has_whisper_encoder(const struct clip_ctx * ctx); + +std::map clip_get_mem_usage(const struct clip_ctx * ctx); struct clip_cap { bool has_vision; diff --git a/tools/mtmd/debug/mtmd-debug.cpp b/tools/mtmd/debug/mtmd-debug.cpp index f19ca4cfe29..b88a16f0f8b 100644 --- a/tools/mtmd/debug/mtmd-debug.cpp +++ b/tools/mtmd/debug/mtmd-debug.cpp @@ -30,7 +30,9 @@ static void show_additional_info(int /*argc*/, char ** argv) { " -p \"encode\" (debugging encode pass, default case):\n" " --image can be:\n" " \"white\", \"black\", \"gray\": filled 1.0f, 0.0f and 0.5f respectively\n" + " \"red\", \"green\", \"blue\": filled with respective colors\n" " \"cb\": checkerboard pattern, alternate 1.0f and 0.0f\n" + " \"rainbow\": raspberry-pi-like rainbow pattern\n" " --audio can be:\n" " \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n" " \"1010\": checkerboard pattern, alternate 1.0f and 0.0f\n" @@ -144,6 +146,65 @@ int main(int argc, char ** argv) { image[y][x * 3 + 2] = v; } } + } else if (input == "red") { + for (int i = 0; i < inp_size; ++i) { + auto row = std::vector(inp_size * 3, 0.0f); + for (int j = 0; j < inp_size; ++j) { + row[j * 3 + 0] = 1.0f; // R channel + } + image.push_back(row); + } + } else if (input == "green") { + for (int i = 0; i < inp_size; ++i) { + auto row = std::vector(inp_size * 3, 0.0f); + for (int j = 0; j < inp_size; ++j) { + row[j * 3 + 1] = 1.0f; // G channel + } + image.push_back(row); + } + } else if (input == "blue") { + for (int i = 0; i < inp_size; ++i) { + auto row = std::vector(inp_size * 3, 0.0f); + for (int j = 0; j < inp_size; ++j) { + row[j * 3 + 2] = 1.0f; // B channel + } + image.push_back(row); + } + } else if (input == "rainbow") { + for (int i = 0; i < inp_size; ++i) { + image.push_back(std::vector(inp_size * 3, 0.0f)); + } + float cx = inp_size / 2.0f; + float cy = inp_size / 2.0f; + float max_dist = std::sqrt(cx * cx + cy * cy); + for (int y = 0; y < inp_size; ++y) { + for (int x = 0; x < inp_size; ++x) { + float dx = x - cx; + float dy = y - cy; + float hue = std::atan2(dy, dx) / (2.0f * 3.14159265f); + if (hue < 0) hue += 1.0f; + float sat = std::sqrt(dx * dx + dy * dy) / max_dist; + if (sat > 1.0f) sat = 1.0f; + float h6 = hue * 6.0f; + int i6 = (int)h6; + float f = h6 - i6; + float p = 1.0f - sat; + float q = 1.0f - sat * f; + float t = 1.0f - sat * (1.0f - f); + float r, g, b; + switch (i6 % 6) { + case 0: r=1; g=t; b=p; break; + case 1: r=q; g=1; b=p; break; + case 2: r=p; g=1; b=t; break; + case 3: r=p; g=q; b=1; break; + case 4: r=t; g=p; b=1; break; + default: r=1; g=p; b=q; break; + } + image[y][x * 3 + 0] = r; + image[y][x * 3 + 1] = g; + image[y][x * 3 + 2] = b; + } + } } else if (input == "one") { samples = std::vector(inp_size, 1.0f); } else if (input == "zero") { diff --git a/tools/mtmd/debug/mtmd-debug.md b/tools/mtmd/debug/mtmd-debug.md index 76ffe5c8451..71bd52dd4b3 100644 --- a/tools/mtmd/debug/mtmd-debug.md +++ b/tools/mtmd/debug/mtmd-debug.md @@ -20,6 +20,43 @@ def test_vision(): test_vision() ``` +Example of debugging a rainbow image: + +```py +import torch +import math + +def make_rainbow(img_size): + cx, cy = img_size / 2.0, img_size / 2.0 + max_dist = math.sqrt(cx * cx + cy * cy) + img = torch.zeros(1, 3, img_size, img_size) + for y in range(img_size): + for x in range(img_size): + dx, dy = x - cx, y - cy + hue = math.atan2(dy, dx) / (2 * math.pi) + if hue < 0: + hue += 1 + sat = math.sqrt(dx * dx + dy * dy) / max_dist + sat = min(sat, 1.0) + h6 = hue * 6 + i6 = int(h6) + f = h6 - i6 + p = 1 - sat + q = 1 - sat * f + t = 1 - sat * (1 - f) + rgb = [(1,t,p),(q,1,p),(p,1,t),(p,q,1),(t,p,1),(1,p,q)][i6 % 6] + img[0, 0, y, x] = rgb[0] + img[0, 1, y, x] = rgb[1] + img[0, 2, y, x] = rgb[2] + return img + +img_size = 896 +pixel_values = make_rainbow(img_size) +with torch.no_grad(): + outputs = model.model.get_image_features(pixel_values=pixel_values) +print("last_hidden_state:", outputs.last_hidden_state) +``` + ## Debugging preprocess pass (TODO) diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp index b1f6ead5b5e..c3c22d0a4ba 100644 --- a/tools/mtmd/models/deepseekocr.cpp +++ b/tools/mtmd/models/deepseekocr.cpp @@ -88,176 +88,179 @@ static ggml_tensor * get_rel_pos(ggml_context * ctx0, return cur; // [C, k_size, q_size] } -ggml_cgraph * clip_graph_deepseekocr::build() { - // patch embedding - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * sam_out; +ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { // Building SAM - { - const int n_embd = hparams.sam_n_embd; - const int n_layer = hparams.sam_n_layer; - const int n_heads = hparams.sam_n_head; - const int d_heads = n_embd / n_heads; - const int window = hparams.attn_window_size; - - ggml_tensor * inpL; - - inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); - inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); - inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); - - ggml_tensor * rel_pos_indices_local; - ggml_tensor * rel_pos_indices_global; - - rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); - rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); - ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); - ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); - ggml_set_input(rel_pos_indices_local); - ggml_set_input(rel_pos_indices_global); - - ggml_tensor * cur; - const auto tgt_size = inpL->ne[1]; - const auto str_size = model.pos_embed->ne[1]; - - if (str_size != tgt_size) { - ggml_tensor * old_pos_embed = nullptr; - old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); - ggml_tensor * new_pos_embed = - ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); - new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); - cur = ggml_add(ctx0, inpL, new_pos_embed); - } else { - cur = ggml_add(ctx0, inpL, model.pos_embed); - } + const int n_embd = hparams.sam_n_embd; + const int n_layer = hparams.sam_n_layer; + const int n_heads = hparams.sam_n_head; + const int d_heads = n_embd / n_heads; + const int window = hparams.attn_window_size; - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.sam_layers[il]; - ggml_tensor * shortcut = cur; + ggml_tensor * inpL; - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); + inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); + inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); - const int64_t w0 = cur->ne[1]; - const int64_t h0 = cur->ne[2]; + ggml_tensor * rel_pos_indices_local; + ggml_tensor * rel_pos_indices_global; - ggml_tensor * indices; + rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); + rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); + ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); + ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); + ggml_set_input(rel_pos_indices_local); + ggml_set_input(rel_pos_indices_global); - if (hparams.is_global_attn(il)) { - indices = rel_pos_indices_global; - } else { - // local attention layer - apply window partition - cur = window_partition(ctx0, cur, window); - indices = rel_pos_indices_local; - } + ggml_tensor * cur; + const auto tgt_size = inpL->ne[1]; + const auto str_size = model.pos_embed->ne[1]; + + if (str_size != tgt_size) { + ggml_tensor * old_pos_embed = nullptr; + old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); + ggml_tensor * new_pos_embed = + ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); + new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); + cur = ggml_add(ctx0, inpL, new_pos_embed); + } else { + cur = ggml_add(ctx0, inpL, model.pos_embed); + } - const int64_t W = cur->ne[1]; - const int64_t H = cur->ne[2]; - // self-attention - { - const int B = cur->ne[3]; - - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); - cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape - cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); - - ggml_tensor * Q; - ggml_tensor * K; - ggml_tensor * V; - - Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); - Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B); - - K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); - K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); - - V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); - V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); - - ggml_tensor * mask; - ggml_tensor * rw; - ggml_tensor * rh; - ggml_tensor * qr; - - rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] - rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] - qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); - qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); - - rw = ggml_mul_mat(ctx0, rw, - ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] - rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] - rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); - rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); - rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] - rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); - mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] - mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); - mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.sam_layers[il]; + ggml_tensor * shortcut = cur; - const float scale = 1.0f / sqrtf(static_cast(d_heads)); + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, - il); // [B, H*W, n_embd] - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); - } + const int64_t w0 = cur->ne[1]; + const int64_t h0 = cur->ne[2]; - if (hparams.is_global_attn(il) == false) { - // local attention layer - reverse window partition - cur = window_unpartition(ctx0, cur, w0, h0, window); - } + ggml_tensor * indices; - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, shortcut); + if (hparams.is_global_attn(il)) { + indices = rel_pos_indices_global; + } else { + // local attention layer - apply window partition + cur = window_partition(ctx0, cur, window); + indices = rel_pos_indices_local; + } - ggml_tensor * inpFF = cur; + const int64_t W = cur->ne[1]; + const int64_t H = cur->ne[2]; + // self-attention + { + const int B = cur->ne[3]; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); + + ggml_tensor * Q; + ggml_tensor * K; + ggml_tensor * V; + + Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); + Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B); + + K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); + K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); + + V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); + V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); + + ggml_tensor * mask; + ggml_tensor * rw; + ggml_tensor * rh; + ggml_tensor * qr; + + rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] + rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] + qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); + qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); + + rw = ggml_mul_mat(ctx0, rw, + ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] + rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] + rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); + rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); + rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] + rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); + mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] + mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); + // casting mask to F16 only required when flash-attn is enabled + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); + } - // layernorm2 - cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + const float scale = 1.0f / sqrtf(static_cast(d_heads)); - // ffn - cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); + cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, + il); // [B, H*W, n_embd] + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); + } - // residual 2 - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "sam_layer_out", il); + if (hparams.is_global_attn(il) == false) { + // local attention layer - reverse window partition + cur = window_unpartition(ctx0, cur, w0, h0, window); } - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, shortcut); - cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + ggml_tensor * inpFF = cur; - cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + // layernorm2 + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); - cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); - cb(cur, "sam_output", -1); + // ffn + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); - ggml_build_forward_expand(gf, cur); - sam_out = cur; + // residual 2 + cur = ggml_add(ctx0, cur, inpFF); + cb(cur, "sam_layer_out", il); } + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); + cb(cur, "sam_output", -1); + + ggml_build_forward_expand(gf, cur); + return cur; +} + +ggml_cgraph * clip_graph_deepseekocr::build() { + // patch embedding + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * sam_out = build_sam(inp_raw); + + const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; + ggml_tensor * clip_out; // Building DS-OCR CLIP { ggml_tensor * inp; - inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out)); - inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); + inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - ggml_tensor * new_pos_embd = - ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); + ggml_tensor * new_pos_embd = model.position_embeddings; int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] const auto tgt_size = static_cast(std::sqrt(inp->ne[1])); @@ -291,16 +294,12 @@ ggml_cgraph * clip_graph_deepseekocr::build() { clip_out = cur; } - const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; - sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); ggml_tensor * cur; cur = ggml_concat(ctx0, clip_out, sam_out, 0); - cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); - cur = ggml_cont(ctx0, cur); cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); cur = ggml_add(ctx0, cur, model.mm_fc_b); @@ -309,13 +308,11 @@ ggml_cgraph * clip_graph_deepseekocr::build() { const auto n_dim = cur->ne[0]; ggml_tensor * imgnl; - ggml_tensor * vs; imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); - vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); - cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) + cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1) cb(cur, "dsocr_output", -1); diff --git a/tools/mtmd/models/deepseekocr2.cpp b/tools/mtmd/models/deepseekocr2.cpp new file mode 100644 index 00000000000..056bb81807f --- /dev/null +++ b/tools/mtmd/models/deepseekocr2.cpp @@ -0,0 +1,81 @@ +#include "models.h" + +ggml_cgraph * clip_graph_deepseekocr2::build() { + GGML_ASSERT(hparams.n_head_kv > 0); + GGML_ASSERT(n_head % hparams.n_head_kv == 0); + + // patch embedding + ggml_tensor * inp_raw = build_inp_raw(); + + ggml_tensor * sam_out = build_sam(inp_raw); + + ggml_tensor * qwen2_out; + // Building Qwen2 encoder + { + ggml_tensor * inp; + + inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + auto num_image_tokens = inp->ne[1]; // H*W + GGML_ASSERT(num_image_tokens == 144 || num_image_tokens == 256); + + // query based on numbers of image tokens (in SAM output) + // 16x16 -> query_1024 (1024x1024 images) + // 12x12 -> query_768 (768x768 images) + + ggml_tensor * query_embed = model.resample_query_1024; + int num_queries = 256; + + if (num_image_tokens == 144) { + query_embed = model.resample_query_768; + num_queries = 144; + } + + // (B, num_image_tokens + num_queries, C) + inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1); + + auto seq_len = inp->ne[1]; + + // qwen2 encoder attention mask + ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len); + ggml_set_name(attn_mask, "qwen2_attn_mask"); + ggml_set_input(attn_mask); + + ggml_tensor * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, 0, seq_len, 1), GGML_TYPE_I32); + + auto add_rope = [&](ggml_tensor * x, const clip_layer &) { + return ggml_rope_ext(ctx0, x, inp_pos, nullptr, d_head, + GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0); + }; + + build_vit_opts vit_opts; + vit_opts.attn_mask = attn_mask; + + // build_vit applies model.post_ln_w internally; do not re-apply + ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU, + /* learned_pos_embd */ nullptr, add_rope, vit_opts); + + cur = ggml_cont(ctx0, + ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1], + cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output + + ggml_build_forward_expand(gf, cur); + qwen2_out = cur; + } + + ggml_tensor * cur; + + cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + // view_seperator only after the global view + if (img.add_viewsep) { + cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257) + } + + cb(cur, "dsocr2_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/exaone4_5.cpp b/tools/mtmd/models/exaone4_5.cpp new file mode 100644 index 00000000000..7bfbaca996b --- /dev/null +++ b/tools/mtmd/models/exaone4_5.cpp @@ -0,0 +1,170 @@ +// similar to qwen2vl, except for GQA attention +#include "models.h" + +ggml_cgraph * clip_graph_exaone4_5::build() { + GGML_ASSERT(model.patch_bias == nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const bool use_window_attn = hparams.n_wa_pattern > 0; + const int n_wa_pattern = hparams.n_wa_pattern; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; + + const norm_type norm_t = NORM_TYPE_RMS; + + const int64_t n_kv_head = hparams.n_head_kv > 0 ? hparams.n_head_kv : n_head; + GGML_ASSERT(n_head % n_kv_head == 0); + + int rope_sections[4] = { d_head / 4, d_head / 4, d_head / 4, d_head / 4 }; + const float rope_freq_base = hparams.rope_theta > 0.0f ? hparams.rope_theta : 10000.0f; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + { + ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + } + + ggml_tensor * inpL = inp; + + if (use_window_attn) { + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + ggml_tensor * cur = inpL; + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + { + GGML_ASSERT(layer.qkv_w != nullptr); + cur = build_mm(layer.qkv_w, cur); + if (layer.qkv_b) { + cur = ggml_add(ctx0, cur, layer.qkv_b); + } + + const int64_t n_embd_kv = d_head * n_kv_head; + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_patches, + ggml_row_size(cur->type, d_head), + cur->nb[1], + 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_kv_head, n_patches, + ggml_row_size(cur->type, d_head), + cur->nb[1], + ggml_row_size(cur->type, n_embd)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_kv_head, n_patches, + ggml_row_size(cur->type, d_head), + cur->nb[1], + ggml_row_size(cur->type, n_embd + n_embd_kv)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head / 2, rope_sections, GGML_ROPE_TYPE_VISION, 32768, rope_freq_base, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head / 2, rope_sections, GGML_ROPE_TYPE_VISION, 32768, rope_freq_base, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + cb(Vcur, "Vcur", il); + + ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; + cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cb(cur, "attn_out", il); + } + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cb(cur, "ffn_inp", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + ggml_tensor * embeddings = inpL; + embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + + if (use_window_attn) { + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); + } + + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp index 4068a08aaf9..3570d6da135 100644 --- a/tools/mtmd/models/gemma4v.cpp +++ b/tools/mtmd/models/gemma4v.cpp @@ -124,12 +124,12 @@ ggml_cgraph * clip_graph_gemma4v::build() { } // Gemma4MultimodalEmbedder - cur = build_mm(model.mm_input_proj_w, cur); - cb(cur, "projected", -1); - - // embedding_post_projection_norm - cur = ggml_rms_norm(ctx0, cur, hparams.eps); - cb(cur, "projected_normed", -1); + { + // embedding_pre_projection_norm + cur = ggml_rms_norm(ctx0, cur, hparams.eps); + cur = build_mm(model.mm_input_proj_w, cur); + cb(cur, "projected", -1); + } ggml_build_forward_expand(gf, cur); return gf; diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanvl.cpp similarity index 70% rename from tools/mtmd/models/hunyuanocr.cpp rename to tools/mtmd/models/hunyuanvl.cpp index 45ed684f70d..2c670979d76 100644 --- a/tools/mtmd/models/hunyuanocr.cpp +++ b/tools/mtmd/models/hunyuanvl.cpp @@ -1,25 +1,15 @@ #include "models.h" -ggml_cgraph * clip_graph_hunyuanocr::build() { +ggml_cgraph * clip_graph_hunyuanvl::build() { const int merge = hparams.n_merge; const int pw = n_patches_x; const int ph = n_patches_y; - // Position embedding interpolation. - // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard - // ggml_interpolate cannot express. To avoid adding a new ggml op, the - // resize is computed on CPU in clip_image_batch_encode and uploaded here - // as a graph input (named "hunyuanvl_pos_embd"). - // HunyuanOCR uses the same square layout and the standard ratio-based - // interpolation provided by resize_position_embeddings(). - ggml_tensor * pos_embd = nullptr; - if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) { - pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); - ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); - ggml_set_input(pos_embd); - } else { - pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); - } + // position embedding: declared as a graph input, filled on CPU + // by clip_image_batch_encode (see PROJECTOR_TYPE_HUNYUANVL branch there). + ggml_tensor * pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); + ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); + ggml_set_input(pos_embd); ggml_tensor * inp = build_inp(); ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 955daa6d6d3..da37bc65050 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -118,6 +118,12 @@ struct clip_graph_whisper_enc : clip_graph { struct clip_graph_deepseekocr : clip_graph { clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model +}; + +struct clip_graph_deepseekocr2 : clip_graph_deepseekocr { + clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {} + ggml_cgraph * build() override; // reuses build_sam() from base }; struct clip_graph_conformer : clip_graph { @@ -141,8 +147,8 @@ struct clip_graph_glm4v : clip_graph { ggml_cgraph * build() override; }; -struct clip_graph_hunyuanocr : clip_graph { - clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} +struct clip_graph_hunyuanvl : clip_graph { + clip_graph_hunyuanvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; @@ -190,3 +196,8 @@ struct clip_graph_kimik25 : clip_graph { ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode); }; + +struct clip_graph_exaone4_5 : clip_graph { + clip_graph_exaone4_5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/models/qwen3a.cpp b/tools/mtmd/models/qwen3a.cpp index 1384e5155ee..4de96955d96 100644 --- a/tools/mtmd/models/qwen3a.cpp +++ b/tools/mtmd/models/qwen3a.cpp @@ -1,68 +1,88 @@ #include "models.h" ggml_cgraph * clip_graph_qwen3a::build() { + // Ref implementation: https://github.com/QwenLM/Qwen3-ASR/blob/main/qwen_asr/core/transformers_backend/modeling_qwen3_asr.py + + // inp_raw: [n_frames, n_mel, 1] (nx=n_frames, ny=n_mel) ggml_tensor * inp = build_inp_raw(1); - // conv2d block - // TODO: do we need to split by chunks of n_window each like on transformers impl? - { - inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1); - inp = ggml_add(ctx0, inp, model.conv2d_1_b); - inp = ggml_gelu_erf(ctx0, inp); + const int64_t n_frames = inp->ne[0]; // total frames, padded to multiple of chunk_size + const int64_t n_mel = inp->ne[1]; // 128 + const int64_t chunk_size = 100; // n_window * 2 (n_window=50 from model config) + const int64_t n_chunks = n_frames / chunk_size; - inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1); - inp = ggml_add(ctx0, inp, model.conv2d_2_b); - inp = ggml_gelu_erf(ctx0, inp); + GGML_ASSERT(n_frames % chunk_size == 0); // preprocessor should already pad the input + GGML_ASSERT(inp->type == GGML_TYPE_F32); - inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1); - inp = ggml_add(ctx0, inp, model.conv2d_3_b); - inp = ggml_gelu_erf(ctx0, inp); + // View mel spectrogram as batched 100-frame chunks: [chunk_size, n_mel, 1, n_chunks] + inp = ggml_view_4d(ctx0, inp, + chunk_size, n_mel, 1, n_chunks, + n_frames * (int64_t)sizeof(float), // nb[1]: stride over mel bins + chunk_size * (int64_t)sizeof(float), // nb[2]: stride for C=1 (unused) + chunk_size * (int64_t)sizeof(float), // nb[3]: stride over chunks + 0); + inp = ggml_cont(ctx0, inp); + cb(inp, "inp_chunks", -1); - // inp [n_pos, n_mels/8, channels, 1] (W, H, C, N) - cb(inp, "after_conv_blocks", -1); + // 3 x conv2d + gelu + { + // conv output [OW, OH, C_out, n_chunks] + auto conv_block = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) { + x = ggml_conv_2d(ctx0, w, x, 2, 2, 1, 1, 1, 1); + if (b) { + x = ggml_add(ctx0, x, ggml_reshape_4d(ctx0, b, 1, 1, x->ne[2], 1)); + } + return ggml_gelu_erf(ctx0, x); + }; - const int64_t n_pos_after_conv = inp->ne[0]; - const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16 + inp = conv_block(inp, model.conv2d_1_w, model.conv2d_1_b); + inp = conv_block(inp, model.conv2d_2_w, model.conv2d_2_b); + inp = conv_block(inp, model.conv2d_3_w, model.conv2d_3_b); + // inp: [OW=13, OH=16, OC=480, n_chunks] + cb(inp, "after_conv_blocks", -1); + } - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1)); - inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680] - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos] + // permute [OW=25, OH=16, OC=480, n_chunks] -> [OH=16, OC=480, OW=25, n_chunks] + // reshape to [OH*OC=7680, OW*n_chunks] + // feature index h+16*c = c*16+f (matches python code) + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 0, 1, 3)); + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2] * inp->ne[3]); - // project to n_embd - inp = ggml_mul_mat(ctx0, model.conv_out_w, inp); - if (model.conv_out_b) { - inp = ggml_add(ctx0, inp, model.conv_out_b); - } - cb(inp, "after_conv_out", -1); + // Project to d_model: [d_model, 25*n_chunks] + inp = ggml_mul_mat(ctx0, model.conv_out_w, inp); + if (model.conv_out_b) { + inp = ggml_add(ctx0, inp, model.conv_out_b); } + cb(inp, "after_conv_out", -1); - auto n_pos = inp->ne[1]; + const int64_t n_pos = inp->ne[1]; // 25 * n_chunks - ggml_tensor * pos_embd_selected = ggml_view_2d( - ctx0, model.position_embeddings, - model.position_embeddings->ne[0], n_pos, - model.position_embeddings->nb[1], 0 - ); - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - pos_embd_selected, - nullptr); + // Per-chunk positional embeddings: repeat pos[0:13] for each chunk + // (position indices reset 0..12 per chunk, not sequential across chunks) + { + const int64_t tokens_per_chunk = n_pos / n_chunks; // 13 + ggml_tensor * pos_tmp = ggml_view_2d(ctx0, model.position_embeddings, + model.position_embeddings->ne[0], tokens_per_chunk, + model.position_embeddings->nb[1], 0); + ggml_tensor * tgt = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, + model.position_embeddings->ne[0], n_pos); + inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, pos_tmp, tgt)); + } + ggml_tensor * cur = build_vit(inp, n_pos, + NORM_TYPE_NORMAL, hparams.ffn_op, + nullptr, // pos embd already added above + nullptr); cb(cur, "after_transformer", -1); - // projector + // MLP projector cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, - FFN_GELU_ERF, - -1); - + FFN_GELU_ERF, -1); cb(cur, "projected", -1); ggml_build_forward_expand(gf, cur); - return gf; } diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 46fa9b58cf9..1446fbefcae 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -652,6 +652,110 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s return true; } +// +// mtmd_audio_preprocessor_qwen3a +// +// Matches the Python WhisperFeatureExtractor called with truncation=False: +// - reflection padding of n_fft/2 samples at each end (center=True) +// - Whisper-style log10 + (max-8)/4 normalization applied to full audio +// - output split into โ‰ค30s (3000 mel frames) windows, each padded to a +// multiple of 200 frames (n_window * 2) for the cgraph batch view +// + +void mtmd_audio_preprocessor_qwen3a::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + if (n_samples == 0) { + return false; + } + + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + // Reflection-pad n_fft/2 samples at each end, matching WhisperFeatureExtractor center=True + const int pad = hparams.audio_n_fft / 2; // = 200 + + std::vector padded(n_samples + 2 * pad, 0.0f); + // Reflect start: padded[0..pad-1] = samples[pad..1] (reversed) + for (int i = 0; i < pad; i++) { + int src = pad - i; // samples[pad], samples[pad-1], ..., samples[1] + padded[i] = (src < (int)n_samples) ? samples[src] : 0.0f; + } + std::copy(samples, samples + n_samples, padded.begin() + pad); + // Reflect end: padded[n+pad..n+2*pad-1] = samples[n-2..n-pad-1] (reversed) + for (int i = 0; i < pad; i++) { + int src = (int)n_samples - 2 - i; // samples[n-2], samples[n-3], ... + padded[n_samples + pad + i] = (src >= 0) ? samples[src] : 0.0f; + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.no_padding = true; // reflection padding already applied above + params.use_natural_log = false; // log10 + + mtmd_audio_mel mel_full; + bool ok = log_mel_spectrogram(padded.data(), (int)padded.size(), 4, params, cache, mel_full); + if (!ok) { + return false; + } + + // Whisper-style normalization: clamp to (max - 8), scale to [-1, 1] + { + double mmax = -1e20; + for (float v : mel_full.data) { + if (v > mmax) mmax = v; + } + mmax -= 8.0; + for (float & v : mel_full.data) { + v = (std::max((double)v, mmax) + 4.0) / 4.0; + } + } + + // The effective frame count: center-padded STFT gives ~n_samples/hop_length frames. + // We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames. + const int n_eff = std::min(mel_full.n_len, + (int)(n_samples / hparams.audio_hop_len) + 1); + + // Split into inference windows matching n_window_infer=800 from model config. + // Each window is padded to the next multiple of chunk_size for the cgraph. + // The mtmd caller loops over output entries, so long audio is handled automatically. + const int chunk_size = 100; // conv sub-chunk size (n_window * 2, n_window=50) + const int window_size = 800; // mel frames per forward pass (n_window_infer=800) + + for (int off = 0; off < n_eff; off += window_size) { + const int win_eff = std::min(window_size, n_eff - off); + const int n_chunks = (win_eff + chunk_size - 1) / chunk_size; + const int n_padded = n_chunks * chunk_size; + + mtmd_audio_mel out; + out.n_mel = mel_full.n_mel; + out.n_len = n_padded; + out.n_len_org = win_eff; + out.data.assign(out.n_mel * out.n_len, 0.0f); + for (int m = 0; m < out.n_mel; m++) { + const int copy_len = std::min(win_eff, mel_full.n_len - off); + if (copy_len > 0) { + std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off, + mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len, + out.data.begin() + (size_t)m * out.n_len); + } + } + output.push_back(std::move(out)); + } + return true; +} + // // mtmd_audio_preprocessor_conformer // diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index e6b363b1f90..1f8b0c3f379 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -111,6 +111,14 @@ bool mtmd_audio_compute_log_mel_spectrogram(const float * samples, bool use_natural_log, bool norm_per_feature, mtmd_audio_mel & out); +struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_qwen3a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; +}; // // streaming ISTFT - converts spectrogram frames back to audio one frame at a time diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index 1b058e02601..caf72d53621 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -38,7 +38,7 @@ struct img_tool { clip_image_u8 & dst, const clip_image_size & target_resolution, resize_algo algo, - bool add_padding = true, // TODO: define the behavior for add_padding = false + pad_style padding = PAD_CEIL, std::array pad_color = {0, 0, 0}) { dst.nx = target_resolution.width; dst.ny = target_resolution.height; @@ -50,7 +50,7 @@ struct img_tool { return; } - if (!add_padding) { + if (padding == PAD_NONE) { // direct resize switch (algo) { case RESIZE_ALGO_BILINEAR: @@ -71,8 +71,15 @@ struct img_tool { float scale_w = static_cast(target_resolution.width) / src.nx; float scale_h = static_cast(target_resolution.height) / src.ny; float scale = std::min(scale_w, scale_h); - int new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); - int new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + + int new_width, new_height; + if (padding == PAD_NEAREST) { + new_width = std::min(static_cast(std::round(src.nx * scale)), target_resolution.width); + new_height = std::min(static_cast(std::round(src.ny * scale)), target_resolution.height); + } else { + new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); + new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + } switch (algo) { case RESIZE_ALGO_BILINEAR: @@ -91,9 +98,14 @@ struct img_tool { // fill dst with pad_color fill(dst, pad_color); - int offset_x = (target_resolution.width - new_width) / 2; - int offset_y = (target_resolution.height - new_height) / 2; - + int offset_x, offset_y; + if (padding == PAD_NEAREST) { + offset_x = static_cast(std::round((target_resolution.width - new_width) / 2.0f)); + offset_y = static_cast(std::round((target_resolution.height - new_height) / 2.0f)); + } else { + offset_x = (target_resolution.width - new_width) / 2; + offset_y = (target_resolution.height - new_height) / 2; + } composite(dst, resized_image, offset_x, offset_y); } } @@ -356,10 +368,10 @@ struct img_tool { GGML_ASSERT(inSize > 0 && outSize > 0); double support, scale, filterscale; double center, ww, ss; - int xx, x, ksize, xmin, xmax, xcnt; + int xx, x, ksize, xmin, xmax; // Calculate scaling factor: ratio of input range to output size - filterscale = scale = (double)inSize / outSize; + filterscale = scale = static_cast(inSize) / outSize; // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness // For downsampling (scale > 1), widen filter to prevent aliasing if (filterscale < 1.0) { @@ -373,6 +385,7 @@ struct img_tool { std::vector pre_weights(outSize * ksize); // Temporary weights bounds.resize(outSize * 2); + // For each output pixel, compute its filter coefficients for (xx = 0; xx < outSize; xx++) { // Calculate the center position in input space (pixel-center convention: +0.5) @@ -391,10 +404,10 @@ struct img_tool { xmax = inSize; } - xcnt = xmax - xmin; + xmax -= xmin; // Compute filter weights for each contributing input pixel - for (x = 0; x < xcnt; x++) { + for (x = 0; x < xmax; x++) { // Distance from input pixel center to output pixel center in input space double w = bicubic_filter((x + xmin - center + 0.5) * ss); pre_weights[xx * ksize + x] = w; @@ -402,7 +415,7 @@ struct img_tool { } // Normalize weights to sum to 1.0 (preserves brightness) - for (x = 0; x < xcnt; x++) { + for (x = 0; x < xmax; x++) { if (ww != 0.0) { pre_weights[xx * ksize + x] /= ww; } @@ -415,18 +428,27 @@ struct img_tool { // Store input pixel range for this output pixel bounds[xx * 2 + 0] = xmin; - bounds[xx * 2 + 1] = xcnt; + bounds[xx * 2 + 1] = xmax; } // Convert floating-point coefficients to fixed-point integers // Formula: int32 = round(float * 2^PRECISION_BITS) weights.resize(outSize * ksize); + + const double fxp_scale = std::ldexp(1.0, PRECISION_BITS); // 1.0 * 2^PRECISION_BITS + for (int i = 0; i < outSize * ksize; i++) { + double tmp_val = pre_weights[i] * fxp_scale; if (pre_weights[i] < 0) { - weights[i] = static_cast(-0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + tmp_val -= 0.5; } else { - weights[i] = static_cast(0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + tmp_val += 0.5; } + tmp_val = std::round(tmp_val); + tmp_val = std::clamp(tmp_val, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max())); + weights[i] = static_cast(tmp_val); } return ksize; @@ -1083,35 +1105,31 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli // bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - const std::vector native_resolutions = { - /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */ - }; - // original image size - const clip_image_size original_size{img.nx, img.ny}; - const int orig_w = original_size.width; - const int orig_h = original_size.height; - const int orig_area = orig_h * orig_w; - - size_t mode_i = 0; - int min_diff = orig_area; - - for (size_t i = 0; i < native_resolutions.size(); i++) { - int r = native_resolutions[i]; - if (std::abs(orig_area - r * r) < min_diff) { - mode_i = i; - min_diff = std::abs(orig_area - r * r); + static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; + // TODO: support 512 (tiny) and 640 (small) once we have eval data for them + + const int64_t orig_area = static_cast(img.nx) * img.ny; + + size_t mode_i = 0; + int64_t min_diff = std::numeric_limits::max(); + for (size_t i = 0; i < std::size(native_resolutions); i++) { + const int64_t r = native_resolutions[i]; + const int64_t diff = std::abs(orig_area - r * r); + if (diff < min_diff) { + mode_i = i; + min_diff = diff; } } - - /* Native Resolution (Base/Large) */ const int image_size = native_resolutions[mode_i]; - // scaled and padded image - clip_image_u8_ptr scaled_img(clip_image_u8_init()); - img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo); + // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for + // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor. + clip_image_u8 padded; + img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW, + PAD_NEAREST, hparams.image_pad_color); clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std); + img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std); output.entries.push_back(std::move(res)); output.grid_x = 1; @@ -1119,6 +1137,105 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, return true; } +// +// mtmd_image_preprocessor_deepseekocr2 +// + +// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles +// sorted by tile count +std::vector mtmd_image_preprocessor_deepseekocr2::get_target_ratios() { + std::vector ratios; + for (int n = min_tiles; n <= max_tiles; n++) { + for (int w = 1; w <= n; w++) { + for (int h = 1; h <= n; h++) { + if (w * h < min_tiles || w * h > max_tiles) { + continue; + } + bool found = false; + for (const auto & r : ratios) { + if (r.width == w && r.height == h) { + found = true; + break; + } + } + if (!found) { + ratios.push_back({ w, h }); + } + } + } + } + std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { + return a.width * a.height < b.width * b.height; + }); + return ratios; +} + +// pick the grid whose aspect ratio is closest to the image +// on a tie, prefer the larger grid when the image fits +clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, + int height) { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = { 1, 1 }; + const float area = static_cast(width * height); + + for (const auto & ratio : target_ratios) { + const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + if (ratio_diff < best_ratio_diff) { + best_ratio_diff = ratio_diff; + best_ratio = ratio; + } else if (ratio_diff == best_ratio_diff) { + const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); + if (area > 0.5f * target_area) { + best_ratio = ratio; + } + } + } + return best_ratio; +} + +bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + // emit 768x768 local tiles when the image is larger than a tile in either + // dimension, then always a 1024x1024 global view. order: [tiles..., global]. + + if (img.nx > tile_size || img.ny > tile_size) { + const float aspect_ratio = static_cast(img.nx) / img.ny; + const auto target_ratios = get_target_ratios(); + const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny); + + // stretch onto the grid (no aspect preserve), then crop tiles row-major. + clip_image_u8 refined; + img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height }, + RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE); + + for (int row = 0; row < grid.height; row++) { + for (int col = 0; col < grid.width; col++) { + clip_image_u8 tile; + img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size); + clip_image_f32_ptr res(clip_image_f32_init()); + img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(res)); + } + } + } + + // global view: aspect-preserving fit-and-pad to base_size. + clip_image_u8 padded; + img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, + PAD_NEAREST, hparams.image_pad_color); + clip_image_f32_ptr global(clip_image_f32_init()); + img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std); + global->add_viewsep = true; + output.entries.push_back(std::move(global)); + + output.grid_x = 1; + output.grid_y = 1; + return true; +} + // // mtmd_image_preprocessor_step3vl // @@ -1246,7 +1363,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 std::max(1, static_cast(std::floor(resized.ny * scale))), }; clip_image_u8 scaled; - img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false); + img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE); resized = std::move(scaled); } @@ -1347,7 +1464,7 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip clip_image_u8 img_for_crop = prepared; if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) { clip_image_u8 refined; - img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false); + img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE); img_for_crop = std::move(refined); } diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h index 08129a08ed5..91a5bc253ef 100644 --- a/tools/mtmd/mtmd-image.h +++ b/tools/mtmd/mtmd-image.h @@ -144,6 +144,26 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; +// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local +// tiles when the image is larger than a tile in either dimension. +struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor { + static constexpr int base_size = 1024; // global view + static constexpr int tile_size = 768; // local tile + static constexpr int min_tiles = 2; + static constexpr int max_tiles = 6; + + mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + +private: + static std::vector get_target_ratios(); + static clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, + int height); +}; + // custom image preprocessing for Step3VL // ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 1ab8a4c0464..94c3be82f0f 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include // represents raw image data, layout is RGBRGBRGB... @@ -139,13 +140,13 @@ mtmd_context_params mtmd_context_params_default() { struct mtmd_context { struct clip_ctx * ctx_v; // vision struct clip_ctx * ctx_a; // audio - const struct llama_model * text_model; std::vector image_embd_v; // image embedding vector bool print_timings; int n_threads; std::string media_marker; - const int n_embd_text; + const int n_embd_text = -1; // -1 means llm context not provided, skip checking this + const llama_vocab * vocab = nullptr; // can be nullptr if text_model is not provided mtmd_pos_type pos_type; // these are not token, but strings used to mark the beginning and end of image/audio embeddings @@ -178,12 +179,13 @@ struct mtmd_context { mtmd_context(const char * mmproj_fname, const llama_model * text_model, - const mtmd_context_params & ctx_params) : - text_model (text_model), + const mtmd_context_params & ctx_params, + bool no_alloc = false) : print_timings(ctx_params.print_timings), n_threads (ctx_params.n_threads), media_marker (ctx_params.media_marker), - n_embd_text (llama_model_n_embd_inp(text_model)) + n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1), + vocab (text_model ? llama_model_get_vocab(text_model) : nullptr) { if (ctx_params.image_marker != nullptr) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); @@ -193,21 +195,23 @@ struct mtmd_context { throw std::runtime_error("media_marker must not be empty"); } - auto decoder_rope_type = llama_model_rope_type(text_model); - switch (decoder_rope_type) { - case LLAMA_ROPE_TYPE_NONE: - case LLAMA_ROPE_TYPE_NORM: - case LLAMA_ROPE_TYPE_NEOX: - { - pos_type = MTMD_POS_TYPE_NORMAL; - } break; - case LLAMA_ROPE_TYPE_MROPE: - case LLAMA_ROPE_TYPE_IMROPE: - { - pos_type = MTMD_POS_TYPE_MROPE; - } break; - default: - throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type)); + if (text_model) { + auto decoder_rope_type = llama_model_rope_type(text_model); + switch (decoder_rope_type) { + case LLAMA_ROPE_TYPE_NONE: + case LLAMA_ROPE_TYPE_NORM: + case LLAMA_ROPE_TYPE_NEOX: + { + pos_type = MTMD_POS_TYPE_NORMAL; + } break; + case LLAMA_ROPE_TYPE_MROPE: + case LLAMA_ROPE_TYPE_IMROPE: + { + pos_type = MTMD_POS_TYPE_MROPE; + } break; + default: + throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type)); + } } clip_context_params ctx_clip_params { @@ -218,6 +222,7 @@ struct mtmd_context { /* warmup */ ctx_params.warmup, /* cb_eval */ ctx_params.cb_eval, /* cb_eval_user_data */ ctx_params.cb_eval_user_data, + /* no_alloc */ no_alloc, }; auto res = clip_init(mmproj_fname, ctx_clip_params); @@ -241,7 +246,7 @@ struct mtmd_context { // since we already validate n_embd of vision and audio mmproj, // we can safely assume that they are the same int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a); - if (n_embd_text != n_embd_clip) { + if (n_embd_text > 0 && n_embd_text != n_embd_clip) { throw std::runtime_error(string_format( "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n" "hint: you may be using wrong mmproj\n", @@ -279,7 +284,7 @@ struct mtmd_context { } break; case PROJECTOR_TYPE_MINICPMV: { - int minicpmv_version = clip_is_minicpmv(ctx_v); + int minicpmv_version = clip_get_hparams(ctx_v)->minicpmv_version; if (minicpmv_version == 2) { // minicpmv 2.5 format: // (overview) (slice) (slice) \n ... @@ -488,7 +493,11 @@ struct mtmd_context { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); } break; - case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: + { + img_end = "\n"; // prevent empty batch on llama-server + image_preproc = std::make_unique(ctx_v); + } break; case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth ๏ฝœ (U+FF5C) and โ– (U+2581) to match the tokenizer vocabulary @@ -496,6 +505,13 @@ struct mtmd_context { img_end = "<๏ฝœhy_placeโ–holderโ–noโ–101๏ฝœ>"; image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_EXAONE4_5: + { + // ... (image embeddings) ... + img_beg = ""; + img_end = ""; + image_preproc = std::make_unique(ctx_v); + } break; default: throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); } @@ -515,7 +531,6 @@ struct mtmd_context { // set preprocessor switch (proj) { case PROJECTOR_TYPE_QWEN2A: - case PROJECTOR_TYPE_QWEN3A: case PROJECTOR_TYPE_QWEN25O: { // <|audio_bos|> ... (embeddings) ... <|audio_eos|> @@ -523,6 +538,12 @@ struct mtmd_context { aud_end = "<|audio_eos|>"; audio_preproc = std::make_unique(ctx_a); } break; + case PROJECTOR_TYPE_QWEN3A: + { + aud_beg = "<|audio_start|>"; + aud_end = "<|audio_end|>"; + audio_preproc = std::make_unique(ctx_a); + } break; case PROJECTOR_TYPE_VOXTRAL: { // [BEGIN_AUDIO] ... (embeddings) ... @@ -589,7 +610,11 @@ struct mtmd_context { private: llama_token lookup_token(const std::string & token_text) { - const llama_vocab * vocab = llama_model_get_vocab(text_model); + if (vocab == nullptr) { + // TODO @ngxson : this case is currently hit by mtmd_get_memory_usage + // but we should reconsider this if this case is needed in other places in the future + return LLAMA_TOKEN_NULL; + } const int n_vocab = llama_vocab_n_tokens(vocab); for (int i = 0; i < n_vocab; i++) { if (token_to_piece(vocab, i, true) == token_text) { @@ -600,6 +625,9 @@ struct mtmd_context { } std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) { + if (vocab == nullptr) { + throw std::runtime_error("llama_vocab is not provided"); + } std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); @@ -648,7 +676,7 @@ struct mtmd_tokenizer { add_special = text->add_special; parse_special = text->parse_special; input_text = text->text; - vocab = llama_model_get_vocab(ctx->text_model); + vocab = ctx->vocab; } int32_t tokenize(mtmd_input_chunks * output) { @@ -674,27 +702,29 @@ struct mtmd_tokenizer { } } - if (add_special && llama_vocab_get_add_bos(vocab)) { - // if first chunk is text, we add BOS token to first text chunk - // otherwise, create a new text chunk with BOS token - if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - // add BOS token to the beginning of first text chunk - cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab)); - } else { - // create a new text chunk with BOS token at the beginning - mtmd_input_chunk bos_chunk{ - MTMD_INPUT_CHUNK_TYPE_TEXT, - {llama_vocab_bos(vocab)}, - nullptr, // image tokens - nullptr, // audio tokens - }; - cur.entries.insert(cur.entries.begin(), std::move(bos_chunk)); + if (vocab != nullptr) { + if (add_special && llama_vocab_get_add_bos(vocab)) { + // if first chunk is text, we add BOS token to first text chunk + // otherwise, create a new text chunk with BOS token + if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + // add BOS token to the beginning of first text chunk + cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab)); + } else { + // create a new text chunk with BOS token at the beginning + mtmd_input_chunk bos_chunk{ + MTMD_INPUT_CHUNK_TYPE_TEXT, + {llama_vocab_bos(vocab)}, + nullptr, // image tokens + nullptr, // audio tokens + }; + cur.entries.insert(cur.entries.begin(), std::move(bos_chunk)); + } } - } - if (add_special && llama_vocab_get_add_eos(vocab)) { - // if last chunk is text, we add EOS token to it - add_text({llama_vocab_eos(vocab)}); + if (add_special && llama_vocab_get_add_eos(vocab)) { + // if last chunk is text, we add EOS token to it + add_text({llama_vocab_eos(vocab)}); + } } if (i_bm != bitmaps.size()) { @@ -709,6 +739,9 @@ struct mtmd_tokenizer { } void add_text(const std::string & txt, bool parse_special) { + if (vocab == nullptr) { + throw std::runtime_error("llama_vocab is not provided"); + } LOG_DBG("%s: %s\n", __func__, txt.c_str()); auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special); add_text(tokens); @@ -997,10 +1030,16 @@ struct mtmd_tokenizer { const std::string & text, bool add_special, bool parse_special) { + if (vocab == nullptr) { + throw std::runtime_error("llama_vocab is not provided"); + } // upper limit for the number of tokens int n_tokens = text.length() + 2 * add_special; std::vector result(n_tokens); n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + if (n_tokens == std::numeric_limits::min()) { + throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit"); + } if (n_tokens < 0) { result.resize(-n_tokens); int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); @@ -1062,18 +1101,23 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) bool ok = false; if (clip_is_llava(ctx_clip) - || clip_is_minicpmv(ctx_clip) - || clip_is_glm(ctx_clip) - || proj_type == PROJECTOR_TYPE_INTERNVL) { + || proj_type == PROJECTOR_TYPE_MINICPMV + || proj_type == PROJECTOR_TYPE_GLM_EDGE + || proj_type == PROJECTOR_TYPE_INTERNVL + || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; + // entries may have different token counts + // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view + size_t offset = 0; for (size_t i = 0; i < entries.size(); i++) { int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); ok = clip_image_encode( ctx_clip, ctx->n_threads, entries[i].get(), - ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image); + ctx->image_embd_v.data() + offset); + offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } } else { ok = clip_image_batch_encode( @@ -1537,3 +1581,36 @@ void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector & } } } + +static void stub_log_callback(enum ggml_log_level, const char *, void *) { + // do nothing +} + +std::map mtmd_get_memory_usage(const char * mmproj_fname, + struct mtmd_context_params ctx_params) { + mtmd::context_ptr ctx; + auto saved_log_callback = g_logger_state.log_callback; + auto saved_log_user_data = g_logger_state.log_callback_user_data; + try { + mtmd_log_set(stub_log_callback, nullptr); // suppress logging + ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params)); + mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback + std::map total_mem; + auto merge = [&](const struct clip_ctx * c) { + for (auto & [dev, size] : clip_get_mem_usage(c)) { + total_mem[dev] += size; + } + }; + if (ctx->ctx_v) { + merge(ctx->ctx_v); + } + if (ctx->ctx_a) { + merge(ctx->ctx_a); + } + return total_mem; + } catch (const std::exception & e) { + mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return {}; + } +} diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 54b9515a3ea..5d518df799e 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -9,6 +9,7 @@ #include #ifdef __cplusplus +#include #include #include #include @@ -261,6 +262,14 @@ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); } // extern "C" #endif +// Get memory usage of the current model in bytes, per backend device +// Note: this is an unstable API, used internally by fit_params; it WILL be removed or changed without deprecation +#ifdef __cplusplus +MTMD_API std::map mtmd_get_memory_usage( + const char * mmproj_fname, + struct mtmd_context_params ctx_params); +#endif + // // C++ wrappers // diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt index 0a1f4e86477..f26d8e912a3 100644 --- a/tools/mtmd/requirements.txt +++ b/tools/mtmd/requirements.txt @@ -1,5 +1,12 @@ -r ../../requirements/requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu pillow~=11.3.0 -torch~=2.6.0 -torchvision~=0.21.0 + +## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility +torch==2.11.0; platform_machine != "s390x" # check_requirements: ignore "==" +torchvision==0.26.0; platform_machine != "s390x" # check_requirements: ignore "==" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "==" +torchvision>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "==" diff --git a/tools/mtmd/tests/test-1-extracted.md b/tools/mtmd/tests/test-1-extracted.md deleted file mode 100644 index a92dcd95916..00000000000 --- a/tools/mtmd/tests/test-1-extracted.md +++ /dev/null @@ -1,85 +0,0 @@ -<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|> -# MEN WALK ON MOON -ASTRONAUTS LAND ON PLAIN; -COLLECT ROCKS, PLANT FLAG - -<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|> -Voice From Moon: -Eagle Has Landed' - -<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|> -EAGLE (the lunar surface, Houston, Truesquily) -Base here, The Eagle has landed. - -<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|> -BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot. - -<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|> -TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here. - -<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|> -TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.) - -<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|> -TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue. - -<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|> -TRAVELLING MADE: Eagle, and service mobility. - -<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|> -How do you read me? - -<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|> -TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over. - -<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|> -COLUMBIA: Yes, I heard the whole thing. - -<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|> -BOOTHROOM: Well, it's a good show. - -<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|> -COLUMBIA: Fantastic. - -<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|> -TRAVELLING MADE: I'll read that. - -<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|> -APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec- - -<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|> -tion of lunar descent. - -<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|> - - -<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|> -A Powdery Surface -Is Closely Explored - -<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|> -BY JOHN NOBLE WILFORD - -<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|> -HOUSTON, Monday, July 21โ€”New hires landed and walked on the moon. - -<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|> -Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time. - -<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|> -Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here. - -<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|> -"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moonโ€”Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager. - -<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|> -About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater. - -<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|> -"That's one small step for man, one giant leap for mankind." - -<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|> -His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth. - -<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|> -Testable Slope Test Soil diff --git a/tools/mtmd/tests/test-1-extracted.txt b/tools/mtmd/tests/test-1-extracted.txt deleted file mode 100644 index 4fe273e31b6..00000000000 --- a/tools/mtmd/tests/test-1-extracted.txt +++ /dev/null @@ -1,42 +0,0 @@ -MEN WALK ON MOON -ASTRONAUTS LAND ON PLAIN; -COLLECT ROCKS, PLANT FLAG - -Voice From Moon: -'Eagle Has Landed' - -A Powder Surface -Is Closely Explored - -By JOHN NOBLE WILFORD -NOVEMBER, Monday, July 21โ€”New York Herald and -wished on the moon. - -Two American astronauts of Apollo 11, steered their -frigate Eagle toward the moon's surface and smoothly to -the lunar landing yesterday at 4:17:40 P.M., Eastern day- -light time. - -Neil A. Armstrong, the 38-year-old civilian commander, -landed on the soft sand of the moon's surface here. - -"Beautiful, Triumph!" he said. "The Eagle has landed." - -The first man to reach the moonโ€”Neil Armstrong and -his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon, -brought their ship to rest on a level, rock-strewn plain near -the moon's surface. The two men and two of the three -astronauts on board, Armstrong, Conrad and Edwin E. -Aldrin, 38, of Houston, stepped slowly down the ladder -and descended as he pointed his first full-flaming footpad -at the lunar crater. - -"That's one small step for man, one giant leap for -mankind." - -His first step on the moon came at 10:56:20 P.M., as -a television camera rolled the earth's thousandth line every -second to an aerial and studied audiences of hundreds of -millions of people on earth. - -Textile Slope Test Soil diff --git a/tools/mtmd/tests/test-1-ground-truth.txt b/tools/mtmd/tests/test-1-ground-truth.txt new file mode 100644 index 00000000000..fd85b6485f7 --- /dev/null +++ b/tools/mtmd/tests/test-1-ground-truth.txt @@ -0,0 +1,24 @@ + + A Powdery Surface + Is Closely Explored + +By JOHN NOBLE WILFORD +Special to The New York Times + +HOUSTON, Monday, July 21โ€”Men have landed and walked on the moon. + +Two Americans, astronauts of Apollo 11, steered their fragile four-legged lunar module safely and smoothly to the historic landing yesterday at 4:17:40 P.M., Eastern daylight time. + +Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the mission control room here: + +"Houston, Tranquility Base here. The Eagle has landed." + +The first men to reach the moonโ€”Mr. Armstrong and his co-pilot, Col. Edwin E. Aldrin Jr. of the Air Forceโ€”brought their ship to rest on a level, rock-strewn plain near the southwestern shore of the arid Sea of Tranquility. + +About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and declared as he planted the first human footprint on the lunar crust: + +"That's one small step for man, one giant leap for mankind." + +His first step on the moon came at 10:56:20 P.M., as a television camera outside the craft transmitted his every move to an awed and excited audience of hundreds of millions of people on earth. + +Tentative Steps Test Soil diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index 674a3500151..5f5fef765a6 100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -1,186 +1,293 @@ #!/usr/bin/env python3 """ -Test script to compare llama.cpp mtmd-cli output with HuggingFace reference implementation -for DeepSeek-OCR model using embedding similarity. +Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test +image to the actual text in part of that image. + +Runs each test image through mtmd-cli, calculates CER and chrF for +its output, and holds them against the HF model's scores. """ import argparse +import logging import subprocess import sys +import unicodedata +from dataclasses import dataclass from pathlib import Path -from sentence_transformers import SentenceTransformer -from sentence_transformers import util +logger = logging.getLogger("deepseek-ocr-test") + +RUN_TIMEOUT = 300 + + +@dataclass +class ModelSpec: + key: str + label: str + model_arg: str + mmproj_arg: str + model_default: str + mmproj_default: str + + +@dataclass +class TestCase: + model_key: str + label: str + image: str + ground_truth: str + hf_cer: float + hf_chrf: float + cer_tol: float + chrf_tol: float + + @property + def cer_max(self) -> float: + return self.hf_cer + self.cer_tol + + @property + def chrf_min(self) -> float: + return self.hf_chrf - self.chrf_tol + + +MODELS = { + "v1": ModelSpec( + key="v1", label="DeepSeek-OCR", + model_arg="--llama-model", mmproj_arg="--mmproj", + model_default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf", + mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf", + ), + "v2": ModelSpec( + key="v2", label="DeepSeek-OCR-2", + model_arg="--llama-model-2", mmproj_arg="--mmproj-2", + model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf", + mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf", + ), +} + +CASES = [ + TestCase( + model_key="v1", label="single-view scan", + image="tools/mtmd/test-1.jpeg", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + hf_cer=0.3030, hf_chrf=67.52, cer_tol=0.02, chrf_tol=2.0, + ), + TestCase( + model_key="v2", label="single-view scan", + image="tools/mtmd/test-1.jpeg", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # 640x488 is below the 768 tiling threshold -- single 1024 global view. + # hf_cer/hf_chrf are the deepseek-ai repo's own scores (ImageOps.pad); + # the transformers HF processor is *not* the reference -- its pad_to_square + # is one pixel off and lands at ~0.69 instead. + hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0, + ), +] + + +def arg_dest(flag: str) -> str: + return flag.lstrip("-").replace("-", "_") -def run_mtmd_deepseek_ocr( - model_path: str, - mmproj_path: str, - image_path: str, - bin_path: str, - prompt: str = "Free OCR." -) -> str: +def verdict(ok: bool) -> str: + return "PASS" if ok else "FAIL" + + +def normalize_text(text: str) -> str: + """NFC-normalize and collapse whitespace, so line-wrap and spacing + don't count as CER errors.""" + return " ".join(unicodedata.normalize("NFC", text).split()) + + +def locally_align(expected: str, ocr_out: str) -> str: + """Return the span of `ocr_out` that best matches `expected`. + + The ground truth covers part of the article body. + But the test image includes half of the newspaper's front page. + Fuzzy partial-ratio matching picks out + the body so the unrelated text doesn't disturb CER / chrF. """ - Run inference using llama.cpp mtmd-cli. + from rapidfuzz import fuzz + alignment = fuzz.partial_ratio_alignment(expected, ocr_out) + if alignment is None or alignment.dest_end <= alignment.dest_start: + return ocr_out + return ocr_out[alignment.dest_start:alignment.dest_end] + + +def compute_cer(expected: str, ocr_out: str) -> float: + """Character Error Rate. Lower is better. + CER: fraction of characters you'd insert/delete/substitute to fix the output; 0 = perfect.""" + import jiwer + return jiwer.cer(expected, ocr_out) + + +def compute_chrf(expected: str, ocr_out: str) -> float: + """chrF score on 0-100. Higher is better. + chrF: F-score over shared character n-grams; more forgiving of small word/spacing drift than CER. """ + from sacrebleu.metrics import CHRF + return CHRF().sentence_score(ocr_out, [expected]).score + + +def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: + """Run mtmd-cli on the image and return its output.""" cmd = [ - bin_path, - "-m", model_path, - "--mmproj", mmproj_path, - "--image", image_path, - # "-p", "<|grounding|>Convert the document to markdown.", - "-p", prompt, + str(bin_path), + "-m", str(model_path), + "--mmproj", str(mmproj_path), + "--image", str(image_path), + "-p", "Free OCR. ", "--chat-template", "deepseek-ocr", "--temp", "0", - "-n", "1024", - # "--verbose" + "--flash-attn", "off", # match the HF "eager" attention reference + "--no-warmup", + "-n", "512", # cap loops on hard images (KV would otherwise fill) + # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY. + # Default DRY breakers include "\n", so they are cleared below. + "--dry-multiplier", "0.8", + "--dry-base", "1.75", + "--dry-allowed-length", "2", + "--dry-penalty-last-n", "-1", + "--dry-sequence-breaker", "none", ] + logger.debug(f" command: {' '.join(cmd)}") - print(f"Running llama.cpp command: {' '.join(cmd)}") - - result = subprocess.run( - cmd, - capture_output=True, - text=False, - timeout=300 - ) + try: + result = subprocess.run(cmd, capture_output=True, text=False, timeout=RUN_TIMEOUT) + except subprocess.TimeoutExpired as e: + if e.stderr: + logger.error("llama.cpp stderr:\n%s", e.stderr.decode("utf-8", errors="replace")) + raise RuntimeError(f"llama-mtmd-cli timed out after {RUN_TIMEOUT}s") if result.returncode != 0: - stderr = result.stderr.decode('utf-8', errors='replace') - print(f"llama.cpp stderr: {stderr}") + logger.error("llama.cpp stderr:\n%s", result.stderr.decode("utf-8", errors="replace")) raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}") - output = result.stdout.decode('utf-8', errors='replace').strip() - print(f"llama.cpp output length: {len(output)} chars") + output = result.stdout.decode("utf-8", errors="replace").strip() + if not output: + raise RuntimeError("llama-mtmd-cli produced no output on stdout") + logger.info(f" output: {len(output)} chars") return output -def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float: - """ - Compute cosine similarity between two texts using embedding model. - """ - print(f"Loading embedding model: {model_name}") +def read_expected_text(file_path: Path) -> str: + with open(file_path, "r", encoding="utf-8") as f: + return f.read().strip() - # Use sentence-transformers for easier embedding extraction - embed_model = SentenceTransformer(model_name) - print("Computing embeddings...") - embeddings = embed_model.encode([text1, text2], convert_to_numpy=True) +def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool: + expected = normalize_text(expected) + ocr_out = normalize_text(ocr_out) + aligned = locally_align(expected, ocr_out) - similarity = util.similarity.cos_sim([embeddings[0]], [embeddings[1]])[0][0] - return float(similarity) + logger.debug(f"\n--- expected (normalized) ---\n{expected}") + logger.debug(f"\n--- OCR output (normalized) ---\n{ocr_out}") + logger.debug(f"\n--- aligned span ---\n{aligned}") + cer = compute_cer(expected, aligned) + chrf = compute_chrf(expected, aligned) -def read_expected_output(file_path: str) -> str: - """ - Read expected OCR output from file. - """ - cur_path = Path(__file__).parent - expected_path = str(cur_path / file_path) - with open(expected_path, "r", encoding="utf-8") as f: - return f.read().strip() + cer_pass = cer <= case.cer_max + chrf_pass = chrf >= case.chrf_min + passed = cer_pass and chrf_pass + logger.info("") + logger.info("=" * 60) + logger.info("Free OCR evaluation:") + logger.info("=" * 60) + logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})") + logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})") + logger.info(f" Expected chars {len(expected):>7}") + logger.info(f" Aligned chars {len(aligned):>7} (of {len(ocr_out)} OCR chars)") + logger.info("") + logger.info(f" Result: {verdict(passed)}") + logger.info("=" * 60) + return passed -def main(): - ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs") - ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf", - help="Path to llama.cpp GGUF model") - ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf", - help="Path to mmproj GGUF file") - ap.add_argument("--image", default="test-1.jpeg", - help="Path to test image") + +def argument_parser() -> argparse.ArgumentParser: + ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript") ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli", - help="Path to llama-mtmd-cli binary") - ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B", - help="Embedding model for similarity computation") - ap.add_argument("--threshold", type=float, default=0.7, - help="Minimum similarity threshold for pass") - args = ap.parse_args() - - # Validate paths - # script directory + image - mtmd_dir = Path(__file__).parent.parent - args.image = str(mtmd_dir / args.image) - # project directory + llama model - args.llama_model = str(mtmd_dir.parent.parent / args.llama_model) - # project directory + mmproj - args.mmproj = str(mtmd_dir.parent.parent / args.mmproj) - args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin) - if not Path(args.image).exists(): - print(f"Error: Image not found: {args.image}") - sys.exit(1) - if not Path(args.llama_model).exists(): - print(f"Error: Model not found: {args.llama_model}") - sys.exit(1) - if not Path(args.mmproj).exists(): - print(f"Error: mmproj not found: {args.mmproj}") - sys.exit(1) - - print("=" * 60) - print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison") - print("=" * 60) - - # Default paths based on your command - - # Run llama.cpp inference - print("\n[2/3] Running llama.cpp implementation...") - llama_free_ocr = run_mtmd_deepseek_ocr( - args.llama_model, - args.mmproj, - args.image, - args.llama_bin - ) - - llama_md_ocr = run_mtmd_deepseek_ocr( - args.llama_model, - args.mmproj, - args.image, - args.llama_bin, - prompt="<|grounding|>Convert the document to markdown." - ) - - expected_free_ocr = read_expected_output("test-1-extracted.txt") - expected_md_ocr = read_expected_output("test-1-extracted.md") - - # Compute similarity - print("\n[3/3] Computing embedding similarity...") - free_ocr_similarity = compute_embedding_similarity( - expected_free_ocr, - llama_free_ocr, - args.embedding_model - ) - - md_ocr_similarity = compute_embedding_similarity( - expected_md_ocr, - llama_md_ocr, - args.embedding_model - ) - - # Results - print("\n" + "=" * 60) - print("RESULTS") - print("=" * 60) - print(f"\nReference Model output:\n{'-' * 40}") - print(expected_free_ocr) - print(f"\nDeepSeek-OCR output:\n{'-' * 40}") - print(llama_free_ocr) - print(f"\n{'=' * 60}") - print(f"Cosine Similarity: {free_ocr_similarity:.4f}") - print(f"Threshold: {args.threshold}") - print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}") - print("=" * 60) - - # Markdown OCR results - print(f"\nReference Model Markdown output:\n{'-' * 40}") - print(expected_md_ocr) - print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}") - print(llama_md_ocr) - print(f"\n{'=' * 60}") - print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}") - print(f"Threshold: {args.threshold}") - print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}") - print("=" * 60) + help="Path to llama-mtmd-cli binary (relative to repo root or absolute)") + for spec in MODELS.values(): + ap.add_argument(spec.model_arg, default=spec.model_default, + help=f"Path to the {spec.label} GGUF model (relative to repo root or absolute)") + ap.add_argument(spec.mmproj_arg, default=spec.mmproj_default, + help=f"Path to the {spec.label} mmproj GGUF file (relative to repo root or absolute)") + ap.add_argument("--verbose", action="store_true", + help="Also log the expected, OCR, and aligned text") + return ap + + +def configure_logging(verbose: bool) -> None: + logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO, + format="%(message)s") + + +def resolve_path(path: str, base: Path) -> Path: + p = Path(path) + return p if p.is_absolute() else base / p + + +def main() -> int: + args = argument_parser().parse_args() + configure_logging(args.verbose) + + repo_root = Path(__file__).resolve().parents[3] # tests -> mtmd -> tools -> repo root + binary = resolve_path(args.llama_bin, repo_root) + + if not binary.exists(): + logger.error(f"Error: binary not found: {binary}") + return 1 + + logger.info("=" * 60) + logger.info("DeepSeek-OCR: llama.cpp vs HF parity check") + logger.info("=" * 60) + + results = {} + for case in CASES: + model_spec = MODELS[case.model_key] + title = f"{model_spec.label} -- {case.label}" + + logger.info("") + logger.info(f"=== {title} ===") + + model = resolve_path(getattr(args, arg_dest(model_spec.model_arg)), repo_root) + mmproj = resolve_path(getattr(args, arg_dest(model_spec.mmproj_arg)), repo_root) + image = resolve_path(case.image, repo_root) + ground_truth = resolve_path(case.ground_truth, repo_root) + + missing = [(lbl, p) for lbl, p in [("model", model), ("mmproj", mmproj), + ("image", image), ("ground-truth", ground_truth)] + if not p.exists()] + if missing: + for lbl, p in missing: + logger.error(f" Error: {lbl} not found: {p}") + results[title] = False + continue + + expected = read_expected_text(ground_truth) + logger.info(f" Image: {case.image}") + logger.info(f" Expected text: {len(expected)} chars") + logger.info(" Running llama.cpp 'Free OCR'") + try: + ocr_out = run_mtmd_cli(model, mmproj, image, binary) + except RuntimeError as e: + logger.error(f" Error: {e}") + results[title] = False + continue + + results[title] = evaluate(case, expected, ocr_out) + + logger.info("") + logger.info("=== Summary ===") + for title, ok in results.items(): + logger.info(f" {title:<48} {verdict(ok)}") + all_passed = all(results.values()) + logger.info(f"Overall: {verdict(all_passed)}") + + return 0 if all_passed else 1 if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/tools/mtmd/tests/tests-requirements.txt b/tools/mtmd/tests/tests-requirements.txt index 3134d098d62..f6645a70422 100644 --- a/tools/mtmd/tests/tests-requirements.txt +++ b/tools/mtmd/tests/tests-requirements.txt @@ -1,5 +1,3 @@ -sentence-transformers -transformers -tokenizers -torch -torchvision +jiwer +sacrebleu +rapidfuzz diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt index 0c194ee7f08..0eee9acd406 100644 --- a/tools/perplexity/CMakeLists.txt +++ b/tools/perplexity/CMakeLists.txt @@ -1,6 +1,23 @@ +# llama-perplexity-impl: perplexity logic, reusable by app + +set(TARGET llama-perplexity-impl) + +add_library(${TARGET} perplexity.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + +# llama-perplexity executable + set(TARGET llama-perplexity) -add_executable(${TARGET} perplexity.cpp) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-perplexity-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) diff --git a/tools/perplexity/main.cpp b/tools/perplexity/main.cpp new file mode 100644 index 00000000000..13a9940e9ee --- /dev/null +++ b/tools/perplexity/main.cpp @@ -0,0 +1,5 @@ +int llama_perplexity(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_perplexity(argc, argv); +} diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 75defd7c87b..92f88306c74 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -157,7 +157,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, break; } lock.unlock(); - const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]); + const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + size_t(i)*nv, tokens[i+1]); local_nll += v; local_nll2 += v*v; } @@ -169,7 +169,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, for (auto & w : workers) { w.join(); } - out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t)); + out.write((const char *)log_probs.data(), size_t(n_token)*nv*sizeof(uint16_t)); } struct kl_divergence_result { @@ -279,7 +279,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens break; } lock.unlock(); - std::pair v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); + std::pair v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + size_t(i)*nv, tokens[i+1], local_kld); kld_values[i] = (float)v.first; p_diff_values[i] = v.second; } @@ -524,7 +524,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & logits_stream.write((const char *)&n_chunk, sizeof(n_chunk)); logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0])); const int nv = 2*((n_vocab + 1)/2) + 4; - log_probs.resize(n_ctx * nv); + log_probs.resize(size_t(n_ctx) * nv); } // We get the logits for all the tokens in the context window (params.n_ctx) @@ -923,7 +923,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { } if (i0 == i1) { - LOG_ERR("%s : task %zu does not fit in the context window (requires %lu tokens)\n", __func__, i0, hs_data[i0].required_tokens); + LOG_ERR("%s : task %zu does not fit in the context window (requires %zu tokens)\n", __func__, i0, hs_data[i0].required_tokens); return; } @@ -1216,7 +1216,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) } if (i0 == i1) { - LOG_ERR("%s : task %zu does not fit in the context window (requires %lu tokens)\n", __func__, i0, data[i0].required_tokens); + LOG_ERR("%s : task %zu does not fit in the context window (requires %zu tokens)\n", __func__, i0, data[i0].required_tokens); return; } @@ -1595,7 +1595,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par } if (i0 == i1) { - LOG_ERR("%s : task %zu does not fit in the context window (requires %lu tokens)\n", __func__, i0, tasks[i0].required_tokens); + LOG_ERR("%s : task %zu does not fit in the context window (requires %zu tokens)\n", __func__, i0, tasks[i0].required_tokens); return; } @@ -2005,7 +2005,10 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { LOG("Same top p: %6.3lf ยฑ %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); } -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_perplexity(int argc, char ** argv); + +int llama_perplexity(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); common_params params; diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 965adc0059b..eead4c85951 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -1,7 +1,23 @@ +# llama-quantize-impl: quantize logic, reusable by app + +set(TARGET llama-quantize-impl) + +add_library(${TARGET} quantize.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + +# llama-quantize executable + set(TARGET llama-quantize) -add_executable(${TARGET} quantize.cpp) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) + +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-quantize-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) diff --git a/tools/quantize/main.cpp b/tools/quantize/main.cpp new file mode 100644 index 00000000000..fc247190c83 --- /dev/null +++ b/tools/quantize/main.cpp @@ -0,0 +1,5 @@ +int llama_quantize(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_quantize(argc, argv); +} diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 3d33d47d98b..7292bda6f4e 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -490,7 +490,10 @@ static bool parse_layer_prune(const char * data, std::vector & prune_layers return true; } -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_quantize(int argc, char ** argv); + +int llama_quantize(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); if (argc < 3) { usage(argv[0]); diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 587ce3225ff..7b11a82e959 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -77,149 +77,33 @@ if(LLAMA_SERVER_SMT_VISION) endif() endif() +# llama-server-impl: server logic, reusable by app -# llama-server executable - -set(TARGET llama-server) +set(TARGET llama-server-impl) -set(TARGET_SRCS +add_library(${TARGET} server.cpp server-http.cpp server-http.h server-models.cpp server-models.h ) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) -# Option to specify custom HF bucket for webui (defaults to llama-ui) -# Usage: cmake -B build -DLLAMA_WEBUI_HF_BUCKET=llama-ui -set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets") - -if (LLAMA_BUILD_WEBUI) - set(PUBLIC_ASSETS - index.html - bundle.js - bundle.css - loading.html - ) +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC server-context llama-ui cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) - # Determine source of webui assets (priority: local > HF Bucket) - set(WEBUI_SOURCE "") - set(WEBUI_SOURCE_DIR "") - - # Priority 1: Check for local webui build output - set(LOCAL_WEBUI_DIR "${CMAKE_CURRENT_SOURCE_DIR}/public") - - # Verify all required assets exist before declaring local source valid - set(ALL_ASSETS_PRESENT TRUE) - foreach(asset ${PUBLIC_ASSETS}) - if(NOT EXISTS "${LOCAL_WEBUI_DIR}/${asset}") - set(ALL_ASSETS_PRESENT FALSE) - break() - endif() - endforeach() - - if(ALL_ASSETS_PRESENT) - set(WEBUI_SOURCE "local") - set(WEBUI_SOURCE_DIR "${LOCAL_WEBUI_DIR}") - message(STATUS "WebUI: using local build from ${WEBUI_SOURCE_DIR}") - endif() +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() - # Priority 2: Build-time asset provisioning (npm build โ†’ HF Bucket fallback) - if(NOT WEBUI_SOURCE_DIR) - # Environment variable takes precedence (e.g., from CI workflows) - if(DEFINED ENV{HF_WEBUI_VERSION}) - set(HF_WEBUI_VERSION "$ENV{HF_WEBUI_VERSION}") - # Validate against allowed characters to prevent CMake list separator - # or path-traversal issues in stamp filenames and download URLs - if(NOT HF_WEBUI_VERSION MATCHES "^[A-Za-z0-9._-]+$") - message(FATAL_ERROR "WebUI: invalid HF_WEBUI_VERSION='${HF_WEBUI_VERSION}' - must match ^[A-Za-z0-9._-]+$") - endif() - message(STATUS "WebUI: using HF_WEBUI_VERSION from environment=${HF_WEBUI_VERSION}") - elseif(DEFINED LLAMA_BUILD_NUMBER) - set(HF_WEBUI_VERSION "b${LLAMA_BUILD_NUMBER}") - message(STATUS "WebUI: using LLAMA_BUILD_NUMBER=${HF_WEBUI_VERSION}") - else() - set(HF_WEBUI_VERSION "") - message(STATUS "WebUI: version not specified (will use HF 'latest')") - endif() - - # Stamp file embeds the version tag so a changed build number triggers - # a fresh provision run on the next `cmake --build` without reconfiguring. - if("${HF_WEBUI_VERSION}" STREQUAL "") - set(WEBUI_VERSION_TAG "provisioned") - else() - set(WEBUI_VERSION_TAG "${HF_WEBUI_VERSION}") - endif() - set(WEBUI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.webui-${WEBUI_VERSION_TAG}.stamp") - - # Join assets with + separator (safe across all platforms, unlike ; and |) - string(REPLACE ";" "+" PUBLIC_ASSETS_JOINED "${PUBLIC_ASSETS}") - - add_custom_command( - OUTPUT ${WEBUI_STAMP} - COMMAND ${CMAKE_COMMAND} - "-DSOURCE_DIR=${PROJECT_SOURCE_DIR}" - "-DPUBLIC_DIR=${CMAKE_CURRENT_SOURCE_DIR}/public" - "-DHF_BUCKET=${LLAMA_WEBUI_HF_BUCKET}" - "-DHF_VERSION=${HF_WEBUI_VERSION}" - "-DHF_ENABLED=${LLAMA_USE_PREBUILT_WEBUI}" - "-DASSETS=${PUBLIC_ASSETS_JOINED}" - "-DSTAMP_FILE=${WEBUI_STAMP}" - "-DNPM_DIR=${CMAKE_CURRENT_SOURCE_DIR}/webui" - -P ${PROJECT_SOURCE_DIR}/scripts/webui-download.cmake - COMMENT "Building/provisioning WebUI assets (npm build -> HF Bucket fallback)" - ) - - set(WEBUI_SOURCE "provisioned") - set(WEBUI_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/public") - endif() +# llama-server executable - # Process assets from the determined source - if(WEBUI_SOURCE_DIR) - foreach(asset ${PUBLIC_ASSETS}) - set(input "${WEBUI_SOURCE_DIR}/${asset}") - set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") - list(APPEND TARGET_SRCS ${output}) - - if(WEBUI_SOURCE STREQUAL "local") - # Local build: files exist at configure time - if(NOT EXISTS "${input}") - message(FATAL_ERROR "WebUI asset not found: ${input}") - endif() - set(dependency "${input}") - else() - # HF Bucket: files are downloaded at build time - set(dependency "${WEBUI_STAMP}") - endif() - - add_custom_command( - DEPENDS ${dependency} - OUTPUT "${output}" - COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" - ) - set_source_files_properties(${output} PROPERTIES GENERATED TRUE) - endforeach() - - add_definitions(-DLLAMA_BUILD_WEBUI) - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=1) - message(STATUS "WebUI: embedded with source: ${WEBUI_SOURCE}") - else() - # WebUI source not found - issue warning but don't fail the build - # The server will still build but without webui embedded - message(WARNING "WebUI: no source available. Neither local build (tools/server/public/) nor HF Bucket download succeeded.") - message(WARNING "WebUI: building server without embedded WebUI. Set LLAMA_BUILD_WEBUI=OFF to suppress this warning.") - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) - endif() -else() - # WebUI is disabled at build time - add_definitions(-DLLAMA_WEBUI_DEFAULT_ENABLED=0) -endif() +set(TARGET llama-server) -add_executable(${TARGET} ${TARGET_SRCS}) +add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) -target_include_directories(${TARGET} PRIVATE ../mtmd) -target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) - +target_link_libraries(${TARGET} PRIVATE llama-server-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index a9c1e7385fc..0ff334724a3 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -224,7 +224,7 @@ The SvelteKit-based Web UI is introduced in this PR: https://github.com/ggml-org ### Architecture -The WebUI follows a layered architecture: +The UI follows a layered architecture: ``` Routes โ†’ Components โ†’ Hooks โ†’ Stores โ†’ Services โ†’ Storage/API @@ -234,7 +234,7 @@ Routes โ†’ Components โ†’ Hooks โ†’ Stores โ†’ Services โ†’ Storage/API - **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`) - **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`) -For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/): +For detailed architecture diagrams, see [`tools/ui/docs/`](../ui/docs/): - `high-level-architecture.mmd` - full architecture with all modules - `high-level-architecture-simplified.mmd` - simplified overview @@ -246,7 +246,7 @@ For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/ ```sh # make sure you have Node.js installed -cd tools/server/webui +cd tools/ui npm i # run dev server (with hot reload) diff --git a/tools/server/README.md b/tools/server/README.md index 5ba1a58f8c1..f1eeec36aa0 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -33,7 +33,6 @@ For the full list of features, please refer to [server's changelog](https://gith | -------- | ----------- | | `-h, --help, --usage` | print usage and exit | | `--version` | show version and build info | -| `--license` | show source code license and dependencies | | `-cl, --cache-list` | show list of models in cache | | `--completion-bash` | print source-able bash completion script for llama.cpp | | `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | @@ -72,7 +71,6 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | -| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)
(env: LLAMA_ARG_RPC) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | @@ -106,13 +104,13 @@ For the full list of features, please refer to [server's changelog](https://gith | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) | | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | | `--log-disable` | Log disable | -| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) | -| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | +| `--log-file FNAME` | Log to file
(env: LLAMA_ARG_LOG_FILE) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_ARG_LOG_COLORS) | | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | -| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | -| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)

(env: LLAMA_LOG_VERBOSITY) | -| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | -| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | +| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_ARG_OFFLINE) | +| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: trace (more info)
- 5: debug
(default: 3)

(env: LLAMA_ARG_LOG_VERBOSITY) | +| `--log-prefix, --no-log-prefix` | Enable prefix in log messages
(env: LLAMA_ARG_LOG_PREFIX) | +| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages
(env: LLAMA_ARG_LOG_TIMESTAMPS) | | `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) | | `--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) | @@ -164,7 +162,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | -| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | +| `-cms, --checkpoint-min-step N` | minimum spacing between context checkpoints in tokens (default: 256, 0 = no minimum)
(env: LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CACHE_IDLE_SLOTS) | @@ -177,6 +175,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-np, --parallel N` | number of server slots (default: -1, -1 = auto)
(env: LLAMA_ARG_N_PARALLEL) | | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | +| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint
(env: LLAMA_ARG_TALKER_MODEL) | +| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer
(env: LLAMA_ARG_CODE2WAV_MODEL) | | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | @@ -184,24 +184,29 @@ For the full list of features, please refer to [server's changelog](https://gith | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | +| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | -| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | -| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | -| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | +| `--webui-config JSON` | [DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | +| `--ui-config JSON` | JSON that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG) | +| `--webui-config-file PATH` | [DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | +| `--ui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG_FILE) | +| `--webui-mcp-proxy, --no-webui-mcp-proxy` | [DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | +| `--ui-mcp-proxy, --no-ui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_UI_MCP_PROXY) | | `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)
specify "all" to enable all tools
available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime
(env: LLAMA_ARG_TOOLS) | -| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | +| `--webui, --no-webui` | [DEPRECATED: use --ui/--no-ui] whether to enable the Web UI
(env: LLAMA_ARG_WEBUI) | +| `--ui, --no-ui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_UI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)
(env: LLAMA_API_KEY) | -| `--api-key-file FNAME` | path to file containing API keys (default: none) | +| `--api-key-file FNAME` | path to file containing API keys (default: none)
(env: LLAMA_ARG_API_KEY_FILE) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key
(env: LLAMA_ARG_SSL_KEY_FILE) | | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate
(env: LLAMA_ARG_SSL_CERT_FILE) | -| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | -| `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | +| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) | +| `-to, --timeout N` | server read/write timeout in seconds (default: 3600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)
(env: LLAMA_ARG_CACHE_PROMPT) | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | @@ -219,8 +224,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | @@ -241,14 +246,15 @@ For the full list of features, please refer to [server's changelog](https://gith | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)
(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | -| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | +| `--spec-type none,draft-simple,draft-eagle3,draft-mtp,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) | | `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) | | `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) | @@ -1238,6 +1244,8 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type": `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text. +`reasoning_control`: Arms realtime reasoning control for this completion so it can be ended early via `/v1/chat/completions/control`. Defaults to `false`. + `generation_prompt`: The generation prompt that was prefilled in by the template. Prepended to model output before parsing. `parse_tool_calls`: Whether to parse the generated tool call. @@ -1322,12 +1330,44 @@ This provides information on the performance of the server. It also allows calcu The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n` +The response also includes a standard `usage` object: + +```js +{ + // ... + "usage": { + "completion_tokens": 48, + "prompt_tokens": 44, + "total_tokens": 92, + "prompt_tokens_details": { + "cached_tokens": 0 + } + } +} +``` + *Reasoning support* The server supports parsing and returning reasoning via the `reasoning_content` field, similar to Deepseek API. Reasoning input (preserve reasoning in history) is also supported by some specific templates. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994). +### POST `/v1/chat/completions/control`: Control a running chat completion in real time + +Acts on an in-flight completion identified by its `id` (the `id` field streamed back by `/v1/chat/completions`). The request is processed in parallel with the SSE stream, so the client sends it while still reading tokens. + +*Options:* + +`id`: (Required) The chat completion id to act on. A completion that has already finished matches nothing and the call is a no-op. + +`action`: (Required) The control action to perform. Currently the only supported value is `reasoning_end`, which forces the end of the current reasoning block so the model moves on to the final answer. Requires `reasoning_control: true` on the original completion request. + +`model`: (Required in router mode) The model name, used to route the request to the right instance. Ignored in single model mode. + +**Response format** + +Returns a JSON object with a boolean `success` field, and an optional `message` field describing the reason when `success` is `false`. + ### POST `/v1/responses`: OpenAI-compatible Responses API *Options:* @@ -1641,23 +1681,30 @@ Listing all models in cache. The model metadata will also include a field to ind { "data": [{ "id": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", - "in_cache": true, "path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf", "status": { "value": "loaded", "args": ["llama-server", "-ctx", "4096"] }, + "architecture": { + "input_modalities": [ + "text", + "image" + ], + "output_modalities": [ + "text" + ] + }, ... }] } ``` Note: -1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`. -2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow: +1. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow: - If a model is running but updated or removed from the source, it will be unloaded - If a model is not running, it will be added or updated according to the source -3. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance. +2. When the model is loaded, the info from `/v1/models` is forwarded to router's `/v1/models`. This includes metadata about the model and the runtime instance. The `status` object can be: @@ -1815,10 +1862,12 @@ Apart from error types supported by OAI, we also have custom types that are spec ### Custom default Web UI preferences -You can specify default preferences for the web UI using `--webui-config ` or `--webui-config-file `. For example, you can disable pasting long text as attachments and enable rendering Markdown in user messages with this command: +You can specify default preferences for the web UI using `--ui-config ` or `--ui-config-file `. For example, you can disable pasting long text as attachments and enable rendering Markdown in user messages with this command: ```bash -./llama-server -m model.gguf --webui-config '{"pasteLongTextToFileLen": 0, "renderUserContentAsMarkdown": true}' +./llama-server -m model.gguf --ui-config '{"pasteLongTextToFileLen": 0, "renderUserContentAsMarkdown": true}' ``` -You may find available preferences in [settings-config.ts](webui/src/lib/constants/settings-config.ts). +> **Note:** The old flags `--webui-config` and `--webui-config-file` are deprecated but still work as aliases. + +You may find available preferences in [settings-config.ts](../ui/src/lib/constants/settings-config.ts). diff --git a/tools/server/bench/speed-bench/README.md b/tools/server/bench/speed-bench/README.md new file mode 100644 index 00000000000..8d3fcd804c4 --- /dev/null +++ b/tools/server/bench/speed-bench/README.md @@ -0,0 +1,117 @@ +# SPEED-Bench server benchmark + +A lightweight [SPEED-Bench](https://huggingface.co/datasets/nvidia/SPEED-Bench) client for benchmarking an already-running `llama-server` through its OpenAI-compatible API. It is primarily meant to evaluate speculative decoding (draft model, n-gram, MTP, EAGLE3, ...) by reporting per-category throughput, latency, and draft acceptance. + +The dataset handling follows the [aiperf SPEED-Bench tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorials/speed-bench.md), which also documents the dataset layout in more detail. + +## Install + +```bash +pip install -r tools/server/bench/speed-bench/requirements.txt +``` + +## Start a server + +The client does not launch the server, so start `llama-server` yourself first. If you care about throughput numbers, set the client `--concurrency` to the server's slot count (`--np`): + +```bash +llama-server \ + -m target.gguf \ + -c 8192 \ + --port 8080 \ + -ngl 99 -fa on \ + --np 1 \ + --jinja +``` + +For speculative decoding, start the server with the appropriate flags for your setup (e.g. a draft model with `-md`, or `--spec-type ngram-mod`). See the [speculative decoding doc](../../../../docs/speculative.md) for details. + +## Run + +```bash +python tools/server/bench/speed-bench/speed_bench.py \ + --url localhost:8080 \ + --bench qualitative \ + --category coding \ + --osl 1024 \ + --concurrency 1 +``` + +## Options + +| Option | Default | Description | +| --- | --- | --- | +| `--url` | `localhost:8080` | Server URL. The scheme and `/v1` are optional and a trailing slash is fine, so `localhost:8080` and `http://localhost:8080/v1/` both work. | +| `--model` | none | Optional `model` field sent in each request. | +| `--bench` | `qualitative` | SPEED-Bench config, e.g. `qualitative`, `throughput_1k`. See [available dataset variants](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorials/speed-bench.md#available-dataset-variants). | +| `--category` | `all` | Category filter within the bench; comma-separated list or `all`. For `qualitative` the categories are `coding`, `humanities`, `math`, `multilingual`, `qa`, `rag`, `reasoning`, `roleplay`, `stem`, `summarization`, `writing`. For the `throughput_{ISL}` splits they are `high_entropy`, `low_entropy`, `mixed`. | +| `--osl` | `1024` | Output sequence length, mapped to `max_tokens`. | +| `--extra-inputs` | `{"temperature":0}` | Extra request fields as a JSON object. | +| `--concurrency` | `1` | Concurrent client requests; usually match `--np`. | +| `--limit` | none | Max samples per category (handy for smoke tests). | +| `--timeout` | `600` | Per-request timeout in seconds. | +| `--output` | none | Save raw per-request results and the summary to JSON. | + +A few common ones: + +- `--category all` runs every category in the bench. +- `--category coding,math` runs just those two. +- `--bench throughput_8k` runs a fixed-input-length throughput split. +- `--limit 8` keeps at most 8 samples per category, which is enough for a quick check. + +The `throughput_{ISL}` splits use fixed input lengths (1k - 32k), so they are handy for long-context testing and for comparing different `llama-server` batching settings (e.g. sweeping `-ub` / `--ubatch-size`) on prompts of a known size. Make sure the server `-c` is large enough for the chosen split. When raising `-ub`, also raise `-b` to at least the same value, since the physical ubatch cannot exceed the logical batch. + +When `--output` is given, the JSON file holds the run `config`, the `selected_samples` / `completed_samples` / `failed_samples` counts, the per-category `summary` rows, and the per-sample `results`. + +## Metrics + +The summary prints one row per category plus an `overall` row: + +- `samples` - how many samples finished successfully. +- `avg_prompt_t/s` - prefill throughput from llama.cpp (`timings.prompt_per_second`), averaged over the category's samples. +- `avg_pred_t/s` - decode throughput from llama.cpp (`timings.predicted_per_second`), averaged over the category's samples. +- `avg_latency` - average end-to-end request latency seen by the client. +- `accept_rate` - `accepted / draft_n` over the category, or `n/a` if nothing was drafted (`draft_n == 0`). + +## Baseline vs speculative decoding + +Save a run from each server with `--output`, then diff the two JSON files with `speed_bench_compare.py`. + +First, start a plain `llama-server` (no speculative decoding) and save a baseline: + +```bash +python tools/server/bench/speed-bench/speed_bench.py \ + --url localhost:8080 \ + --bench qualitative \ + --category all \ + --osl 1024 \ + --concurrency 1 \ + --output baseline.json +``` + +Then restart `llama-server` with speculative decoding enabled and save another run: + +```bash +python tools/server/bench/speed-bench/speed_bench.py \ + --url localhost:8080 \ + --bench qualitative \ + --category all \ + --osl 1024 \ + --concurrency 1 \ + --output spec.json +``` + +Finally compare the two: + +```bash +python tools/server/bench/speed-bench/speed_bench_compare.py \ + --baseline baseline.json \ + --speculative spec.json +``` + +The comparison table adds: + +- `decode_speedup = spec_avg_pred_t/s / base_avg_pred_t/s` +- `latency_speedup = base_avg_latency / spec_avg_latency` + +Keep `--bench`, `--category`, `--osl`, and `--limit` the same across both runs, otherwise they won't be using the same prompts. diff --git a/tools/server/bench/speed-bench/requirements.txt b/tools/server/bench/speed-bench/requirements.txt new file mode 100644 index 00000000000..a524c2f5193 --- /dev/null +++ b/tools/server/bench/speed-bench/requirements.txt @@ -0,0 +1,3 @@ +datasets +requests +tqdm diff --git a/tools/server/bench/speed-bench/speed_bench.py b/tools/server/bench/speed-bench/speed_bench.py new file mode 100644 index 00000000000..adb378a6bf0 --- /dev/null +++ b/tools/server/bench/speed-bench/speed_bench.py @@ -0,0 +1,432 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import concurrent.futures +import json +import statistics +import sys +import time +from dataclasses import asdict, dataclass +from typing import Any +from urllib.parse import urlparse + +import requests +from datasets import get_dataset_config_names, load_dataset +from tqdm import tqdm + + +DATASET_REPO = "nvidia/SPEED-Bench" + +@dataclass +class Sample: + id: str + category: str + turns: list[str] + + +@dataclass +class RequestResult: + id: str + category: str + ok: bool + turns: int + latency_s: float + prompt_tokens: int + completion_tokens: int + total_tokens: int + finish_reason: str | None + draft_n: int + draft_n_accepted: int + prompt_ms: float | None + predicted_ms: float | None + prompt_per_second: float | None + predicted_per_second: float | None + error: str | None + + +def normalize_base_url(url: str) -> str: + url = url.strip().rstrip("/") + if not url: + raise ValueError("--url cannot be empty") + if "://" not in url: + url = "http://" + url + parsed = urlparse(url) + if not parsed.scheme or not parsed.netloc: + raise ValueError(f"invalid --url: {url}") + if not parsed.path.rstrip("/").endswith("/v1"): + url = url + "/v1" + return url.rstrip("/") + + +def parse_extra_inputs(value: str) -> dict[str, Any]: + extra = json.loads(value) + if not isinstance(extra, dict): + raise ValueError("--extra-inputs must be a JSON object") + return extra + + +def extract_turns(row: dict[str, Any]) -> list[str]: + turns = row.get("turns") + if isinstance(turns, list) and turns: + clean_turns = [str(turn).strip() for turn in turns if turn and str(turn).strip()] + if clean_turns: + return clean_turns + raise ValueError("missing or empty turns") + + +def load_samples(args: argparse.Namespace) -> list[Sample]: + bench_names = get_dataset_config_names(DATASET_REPO) + if args.bench not in bench_names: + raise ValueError( + f"unknown --bench {args.bench!r}; available benches: {', '.join(bench_names)}" + ) + + dataset = load_dataset(DATASET_REPO, name=args.bench, split="test") + categories = list(dict.fromkeys(str(category) for category in dataset["category"])) + requested_categories = None + if args.category != "all": + requested_list = [category.strip() for category in args.category.split(",") if category.strip()] + if not requested_list: + raise ValueError( + f"--category must be 'all' or a comma-separated list; available categories: {', '.join(categories)}" + ) + requested_categories = set(requested_list) + unknown_categories = [category for category in requested_list if category not in categories] + if unknown_categories: + unknown = ", ".join(unknown_categories) + raise ValueError( + f"unknown --category {unknown!r} for bench {args.bench!r}; " + f"available categories: all, {', '.join(categories)}" + ) + + samples: list[Sample] = [] + samples_per_category: dict[str, int] = {} + skipped = 0 + for index, row_raw in enumerate(dataset): + row = dict(row_raw) + category_raw = row.get("category") + if not isinstance(category_raw, str) or not category_raw.strip(): + skipped += 1 + continue + category = category_raw.strip() + if requested_categories is not None and category not in requested_categories: + continue + if args.limit is not None and samples_per_category.get(category, 0) >= args.limit: + continue + + try: + turns = extract_turns(row) + except ValueError: + skipped += 1 + continue + question_id = row.get("question_id") + if not isinstance(question_id, str) or not question_id.strip(): + skipped += 1 + continue + sample_id = question_id.strip() + samples.append(Sample(id=sample_id, category=category, turns=turns)) + samples_per_category[category] = samples_per_category.get(category, 0) + 1 + + if not samples: + raise RuntimeError(f"no samples selected from bench={args.bench} category={args.category}") + + if skipped: + print(f"speed_bench: skipped {skipped} rows without usable turns") + return samples + + +def parse_completion_response(data: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any], str | None, str]: + usage = data.get("usage") or {} + timings = data.get("timings") or {} + finish_reason = None + content = "" + choices = data.get("choices") + if isinstance(choices, list) and choices and isinstance(choices[0], dict): + choice = choices[0] + finish_reason = choice.get("finish_reason") + message = choice.get("message") + if isinstance(message, dict) and isinstance(message.get("content"), str): + content = message["content"] + elif isinstance(choice.get("text"), str): + content = choice["text"] + return usage, timings, finish_reason, content + + +def run_request( + endpoint: str, + model: str | None, + messages: list[dict[str, str]], + osl: int, + extra_inputs: dict[str, Any], + timeout: float, +) -> tuple[dict[str, Any], float]: + payload: dict[str, Any] = { + "messages": messages, + "max_tokens": osl, + "stream": False, + } + if model: + payload["model"] = model + payload.update(extra_inputs) + payload["max_tokens"] = osl + + start = time.perf_counter() + response = requests.post(endpoint, json=payload, timeout=timeout) + latency_s = time.perf_counter() - start + if response.status_code != 200: + body = response.text[:500].replace("\n", "\\n") + raise RuntimeError(f"HTTP {response.status_code}: {body}") + return response.json(), latency_s + + +def run_one( + sample: Sample, + endpoint: str, + model: str | None, + osl: int, + extra_inputs: dict[str, Any], + timeout: float, +) -> RequestResult: + selected_turns = sample.turns + messages: list[dict[str, str]] = [] + total_latency_s = 0.0 + prompt_tokens = 0 + completion_tokens = 0 + total_tokens = 0 + draft_n = 0 + draft_n_accepted = 0 + prompt_ms = 0.0 + predicted_ms = 0.0 + prompt_per_second = None + predicted_per_second = None + finish_reason: str | None = None + try: + for turn in selected_turns: + messages.append({"role": "user", "content": turn}) + data, latency_s = run_request(endpoint, model, messages, osl, extra_inputs, timeout) + total_latency_s += latency_s + usage, timings, finish_reason, assistant_text = parse_completion_response(data) + + turn_prompt_tokens = int(usage.get("prompt_tokens") or timings.get("prompt_n") or 0) + turn_completion_tokens_count = int(usage.get("completion_tokens") or timings.get("predicted_n") or 0) + turn_total_tokens_count = int(usage.get("total_tokens") or (turn_prompt_tokens + turn_completion_tokens_count)) + prompt_tokens += turn_prompt_tokens + completion_tokens += turn_completion_tokens_count + total_tokens += turn_total_tokens_count + draft_n += int(timings.get("draft_n") or 0) + draft_n_accepted += int(timings.get("draft_n_accepted") or 0) + prompt_ms += float(timings.get("prompt_ms") or 0) + predicted_ms += float(timings.get("predicted_ms") or 0) + if len(selected_turns) == 1 and isinstance(timings.get("prompt_per_second"), (int, float)): + prompt_per_second = float(timings["prompt_per_second"]) + if len(selected_turns) == 1 and isinstance(timings.get("predicted_per_second"), (int, float)): + predicted_per_second = float(timings["predicted_per_second"]) + + messages.append({"role": "assistant", "content": assistant_text}) + + if total_tokens == 0: + total_tokens = prompt_tokens + completion_tokens + if len(selected_turns) > 1: + prompt_per_second = (prompt_tokens / (prompt_ms / 1000)) if prompt_ms > 0 else None + predicted_per_second = (completion_tokens / (predicted_ms / 1000)) if predicted_ms > 0 else None + + return RequestResult( + id=sample.id, + category=sample.category, + ok=True, + turns=len(selected_turns), + latency_s=total_latency_s, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + finish_reason=finish_reason, + draft_n=draft_n, + draft_n_accepted=draft_n_accepted, + prompt_ms=prompt_ms if prompt_ms > 0 else None, + predicted_ms=predicted_ms if predicted_ms > 0 else None, + prompt_per_second=prompt_per_second, + predicted_per_second=predicted_per_second, + error=None, + ) + except Exception as exc: + return RequestResult( + id=sample.id, + category=sample.category, + ok=False, + turns=len(selected_turns), + latency_s=total_latency_s, + prompt_tokens=0, + completion_tokens=0, + total_tokens=0, + finish_reason=None, + draft_n=0, + draft_n_accepted=0, + prompt_ms=None, + predicted_ms=None, + prompt_per_second=None, + predicted_per_second=None, + error=str(exc), + ) + + +def summarize_group(category: str, results: list[RequestResult]) -> dict[str, Any]: + ok_results = [result for result in results if result.ok] + latencies = [result.latency_s for result in ok_results] + server_prompt_speeds = [ + result.prompt_per_second + for result in ok_results + if result.prompt_per_second is not None + ] + server_completion_speeds = [ + result.predicted_per_second + for result in ok_results + if result.predicted_per_second is not None + ] + turns = sum(result.turns for result in ok_results) + draft_n = sum(result.draft_n for result in ok_results) + accepted = sum(result.draft_n_accepted for result in ok_results) + + return { + "category": category, + "requests": len(ok_results), + "turns": turns, + "failed": len(results) - len(ok_results), + "avg_prompt_t_s": statistics.mean(server_prompt_speeds) if server_prompt_speeds else None, + "avg_pred_t_s": statistics.mean(server_completion_speeds) if server_completion_speeds else None, + "avg_latency": statistics.mean(latencies) if latencies else None, + "draft_n": draft_n, + "accepted": accepted, + "accept_rate": (accepted / draft_n) if draft_n > 0 else None, + } + + +def fmt_value(value: Any, kind: str = "") -> str: + if value is None: + return "n/a" + if kind == "int": + return str(int(value)) + if kind == "rate": + return f"{float(value):.4f}" + if kind == "seconds": + return f"{float(value):.3f}s" + if kind == "speed": + return f"{float(value):.2f}" + if kind == "speedup": + return f"{float(value):.2f}x" + return str(value) + + +def print_table(rows: list[dict[str, Any]]) -> None: + columns = [ + ("category", "category", ""), + ("samples", "requests", "int"), + ("avg_prompt_t/s", "avg_prompt_t_s", "speed"), + ("avg_pred_t/s", "avg_pred_t_s", "speed"), + ("avg_latency", "avg_latency", "seconds"), + ("accept_rate", "accept_rate", "rate"), + ] + print_rows(rows, columns) + + +def print_rows(rows: list[dict[str, Any]], columns: list[tuple[str, str, str]]) -> None: + rendered_rows = [] + for row in rows: + rendered_rows.append([fmt_value(row.get(key), kind) for _, key, kind in columns]) + + widths = [len(header) for header, _, _ in columns] + for rendered in rendered_rows: + for i, cell in enumerate(rendered): + widths[i] = max(widths[i], len(cell)) + + header = " ".join(header.ljust(widths[i]) for i, (header, _, _) in enumerate(columns)) + print(header) + print(" ".join("-" * width for width in widths)) + for rendered in rendered_rows: + print(" ".join(cell.ljust(widths[i]) for i, cell in enumerate(rendered))) + + +def save_output(path: str, args: argparse.Namespace, samples: list[Sample], results: list[RequestResult], summary: list[dict[str, Any]]) -> None: + payload = { + "config": { + "url": args.url, + "model": args.model, + "bench": args.bench, + "category": args.category, + "osl": args.osl, + "concurrency": args.concurrency, + "extra_inputs": args.extra_inputs, + }, + "selected_samples": len(samples), + "completed_samples": sum(1 for result in results if result.ok), + "failed_samples": sum(1 for result in results if not result.ok), + "summary": summary, + "results": [asdict(result) for result in results], + } + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, sort_keys=True) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Run SPEED-Bench against an OpenAI-compatible llama-server.") + parser.add_argument("--url", default="localhost:8080", help="Server URL, for example localhost:8080 or http://localhost:8080/v1") + parser.add_argument("--model", default=None, help="Optional model name to send in OpenAI requests") + parser.add_argument("--bench", default="qualitative", help="SPEED-Bench config to run, for example qualitative or throughput_1k") + parser.add_argument("--category", default="all", help="Category to run within the selected bench; use all for no category filter") + parser.add_argument("--osl", type=int, default=4096, help="Output sequence length, mapped to max_tokens") + parser.add_argument("--extra-inputs", default='{"temperature":0}', help="Extra request fields as a JSON object") + parser.add_argument("--concurrency", type=int, default=1, help="Concurrent client requests; usually match llama-server --np") + parser.add_argument("--limit", type=int, default=None, help="Optional sample limit per category for smoke tests") + parser.add_argument("--timeout", type=float, default=600, help="Per-request timeout in seconds") + parser.add_argument("--output", default=None, help="Optional path to save raw results JSON") + args = parser.parse_args(argv) + try: + base_url = normalize_base_url(args.url) + endpoint = base_url + "/chat/completions" + extra_inputs = parse_extra_inputs(args.extra_inputs) + args.extra_inputs = extra_inputs + samples = load_samples(args) + except Exception as exc: + print(f"speed_bench: setup failed: {exc}", file=sys.stderr) + return 2 + + print(f"speed_bench: loaded {len(samples)} samples from bench={args.bench} category={args.category}") + + results: list[RequestResult] = [] + started = time.perf_counter() + with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor: + futures = [ + executor.submit(run_one, sample, endpoint, args.model, args.osl, extra_inputs, args.timeout) + for sample in samples + ] + for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="speed_bench", unit="sample"): + result = future.result() + results.append(result) + + elapsed = time.perf_counter() - started + categories = list(dict.fromkeys(sample.category for sample in samples)) + summary = [ + summarize_group(category, [result for result in results if result.category == category]) + for category in categories + ] + summary.append(summarize_group("overall", results)) + print() + print(f"Summary (elapsed={elapsed:.2f}s)") + print_table(summary) + + if args.output: + save_output(args.output, args, samples, results, summary) + print(f"\nspeed_bench: wrote {args.output}") + + failed = sum(1 for result in results if not result.ok) + if failed: + print(f"\nspeed_bench: {failed} samples failed", file=sys.stderr) + first_error = next((result.error for result in results if result.error), None) + if first_error: + print(f"first error: {first_error}", file=sys.stderr) + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/server/bench/speed-bench/speed_bench_compare.py b/tools/server/bench/speed-bench/speed_bench_compare.py new file mode 100644 index 00000000000..070ab57db5d --- /dev/null +++ b/tools/server/bench/speed-bench/speed_bench_compare.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import sys +from typing import Any + +from speed_bench import fmt_value, print_rows + + +def load_summary(path: str) -> list[dict[str, Any]]: + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + summary = data.get("summary") + if not isinstance(summary, list): + raise ValueError(f"{path} does not contain a summary list") + return summary + + +def compare_rows(baseline: list[dict[str, Any]], speculative: list[dict[str, Any]]) -> list[dict[str, Any]]: + baseline_by_category = {row["category"]: row for row in baseline} + comparisons = [] + for row in speculative: + base = baseline_by_category.get(row["category"]) + if not base: + continue + base_speed = base.get("avg_pred_t_s") + spec_speed = row.get("avg_pred_t_s") + base_latency = base.get("avg_latency") + spec_latency = row.get("avg_latency") + comparisons.append( + { + "category": row["category"], + "base_avg_pred_t_s": base_speed, + "spec_avg_pred_t_s": spec_speed, + "decode_speedup": (spec_speed / base_speed) if base_speed and spec_speed else None, + "base_avg_latency": base_latency, + "spec_avg_latency": spec_latency, + "latency_speedup": (base_latency / spec_latency) if base_latency and spec_latency else None, + "accept_rate": row.get("accept_rate"), + } + ) + return comparisons + + +def print_comparison(rows: list[dict[str, Any]]) -> None: + if not rows: + print("No overlapping categories found for comparison.") + return + columns = [ + ("category", "category", ""), + ("base_avg_pred_t/s", "base_avg_pred_t_s", "speed"), + ("spec_avg_pred_t/s", "spec_avg_pred_t_s", "speed"), + ("decode_speedup", "decode_speedup", "speedup"), + ("base_avg_latency", "base_avg_latency", "seconds"), + ("spec_avg_latency", "spec_avg_latency", "seconds"), + ("latency_speedup", "latency_speedup", "speedup"), + ("accept_rate", "accept_rate", "rate"), + ] + print_rows(rows, columns) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Compare two SPEED-Bench runs (baseline vs speculative).") + parser.add_argument("--baseline", required=True, help="Baseline results JSON produced by speed_bench.py --output") + parser.add_argument("--speculative", required=True, help="Speculative decoding results JSON produced by speed_bench.py --output") + args = parser.parse_args(argv) + + try: + baseline = load_summary(args.baseline) + speculative = load_summary(args.speculative) + except Exception as exc: + print(f"speed_bench_compare: failed to load inputs: {exc}", file=sys.stderr) + return 2 + + comparisons = compare_rows(baseline, speculative) + print(f"Comparison: baseline={args.baseline} speculative={args.speculative}") + print_comparison(comparisons) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/server/chat-llama2.sh b/tools/server/chat-llama2.sh deleted file mode 100755 index 450445f17e3..00000000000 --- a/tools/server/chat-llama2.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env bash - -API_URL="${API_URL:-http://127.0.0.1:8080}" - -CHAT=( - "Hello, Assistant." - "Hello. How may I help you today?" -) - -INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." - -trim() { - shopt -s extglob - set -- "${1##+([[:space:]])}" - printf "%s" "${1%%+([[:space:]])}" -} - -trim_trailing() { - shopt -s extglob - printf "%s" "${1%%+([[:space:]])}" -} - -format_prompt() { - if [[ "${#CHAT[@]}" -eq 0 ]]; then - echo -n "[INST] <>\n${INSTRUCTION}\n<>" - else - LAST_INDEX=$(( ${#CHAT[@]} - 1 )) - echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]" - fi -} - -tokenize() { - curl \ - --silent \ - --request POST \ - --url "${API_URL}/tokenize" \ - --header "Content-Type: application/json" \ - --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ - | jq '.tokens[]' -} - -N_KEEP=$(tokenize "[INST] <>\n${INSTRUCTION}\n<>" | wc -l) - -chat_completion() { - PROMPT="$(trim_trailing "$(format_prompt "$1")")" - DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{ - prompt: ., - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: $n_keep, - n_predict: 1024, - stop: ["[INST]"], - stream: true - }')" - - # Create a temporary file to hold the Python output - TEMPFILE=$(mktemp) - - exec 3< <(curl \ - --silent \ - --no-buffer \ - --request POST \ - --url "${API_URL}/completion" \ - --header "Content-Type: application/json" \ - --data-raw "${DATA}") - - python -c " -import json -import sys - -answer = '' -while True: - line = sys.stdin.readline() - if not line: - break - if line.startswith('data: '): - json_content = line[6:].strip() - content = json.loads(json_content)['content'] - sys.stdout.write(content) - sys.stdout.flush() - answer += content - -answer = answer.rstrip('\n') - -# Write the answer to the temporary file -with open('$TEMPFILE', 'w') as f: - f.write(answer) - " <&3 - - exec 3<&- - - # Read the answer from the temporary file - ANSWER=$(cat $TEMPFILE) - - # Clean up the temporary file - rm $TEMPFILE - - printf "\n" - - CHAT+=("$1" "$(trim "$ANSWER")") -} - -while true; do - echo -en "\033[0;32m" # Green color - read -r -e -p "> " QUESTION - echo -en "\033[0m" # Reset color - chat_completion "${QUESTION}" -done diff --git a/tools/server/chat.mjs b/tools/server/chat.mjs deleted file mode 100644 index 4fef5655a89..00000000000 --- a/tools/server/chat.mjs +++ /dev/null @@ -1,131 +0,0 @@ -import * as readline from 'node:readline' -import { stdin, stdout } from 'node:process' -import { readFileSync } from 'node:fs' -import { SchemaConverter } from './public_legacy/json-schema-to-grammar.mjs' - -const args = process.argv.slice(2); -const grammarJsonSchemaFile = args.find( - (_, index) => args[index - 1] === "--grammar-json-schema" -); - -const no_cached_prompt = args.find( - (_, index) => args[index - 1] === "--no-cache-prompt" -) ?? "false"; - -const grammarFile = args.find((_, index) => args[index - 1] === "--grammar"); - -// Example usage: function,arguments -const grammarJsonSchemaPropOrder = args.find( - (_, index) => args[index - 1] === "--grammar-json-schema-prop-order" -); -const propOrder = grammarJsonSchemaPropOrder - ? grammarJsonSchemaPropOrder - .split(",") - .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {}) - : {}; - -let grammar = null -if (grammarJsonSchemaFile) { - let schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8')) - const converter = new SchemaConverter({prop_order: propOrder, allow_fetch: true}) - schema = await converter.resolveRefs(schema, grammarJsonSchemaFile) - converter.visit(schema, '') - grammar = converter.formatGrammar() -} -if (grammarFile) { - grammar = readFileSync(grammarFile, 'utf-8') -} - -// for cached prompt -let slot_id = -1; - -const API_URL = 'http://127.0.0.1:8080' - -const chat = [ - { - human: "Hello, Assistant.", - assistant: "Hello. How may I help you today?" - }, - { - human: "Please tell me the largest city in Europe.", - assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia." - }, -] - -const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.` - -function format_prompt(question) { - return `${instruction}\n${ - chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n") - }\n### Human: ${question}\n### Assistant:` -} - -async function tokenize(content) { - const result = await fetch(`${API_URL}/tokenize`, { - method: 'POST', - body: JSON.stringify({ content }) - }) - - if (!result.ok) { - return [] - } - - return await result.json().tokens -} - -const n_keep = await tokenize(instruction).length - -async function chat_completion(question) { - const result = await fetch(`${API_URL}/completion`, { - method: 'POST', - body: JSON.stringify({ - prompt: format_prompt(question), - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: n_keep, - n_predict: 256, - cache_prompt: no_cached_prompt === "false", - slot_id: slot_id, - stop: ["\n### Human:"], // stop completion after generating this - grammar, - stream: true, - }) - }) - - if (!result.ok) { - return - } - - let answer = '' - - for await (var chunk of result.body) { - const t = Buffer.from(chunk).toString('utf8') - if (t.startsWith('data: ')) { - const message = JSON.parse(t.substring(6)) - slot_id = message.slot_id - answer += message.content - process.stdout.write(message.content) - if (message.stop) { - if (message.truncated) { - chat.shift() - } - break - } - } - } - - process.stdout.write('\n') - chat.push({ human: question, assistant: answer.trimStart() }) -} - -const rl = readline.createInterface({ input: stdin, output: stdout }); - -const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => { - rl.question(query, options, resolve) -}); - -while(true) { - const question = await readlineQuestion(rl, '> ') - await chat_completion(question) -} diff --git a/tools/server/chat.sh b/tools/server/chat.sh deleted file mode 100755 index 84cea2d56a0..00000000000 --- a/tools/server/chat.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env bash - -API_URL="${API_URL:-http://127.0.0.1:8080}" - -CHAT=( - "Hello, Assistant." - "Hello. How may I help you today?" - "Please tell me the largest city in Europe." - "Sure. The largest city in Europe is Moscow, the capital of Russia." -) - -INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." - -trim() { - shopt -s extglob - set -- "${1##+([[:space:]])}" - printf "%s" "${1%%+([[:space:]])}" -} - -trim_trailing() { - shopt -s extglob - printf "%s" "${1%%+([[:space:]])}" -} - -format_prompt() { - echo -n "${INSTRUCTION}" - printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1" -} - -tokenize() { - curl \ - --silent \ - --request POST \ - --url "${API_URL}/tokenize" \ - --header "Content-Type: application/json" \ - --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ - | jq '.tokens[]' -} - -N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l) - -chat_completion() { - PROMPT="$(trim_trailing "$(format_prompt "$1")")" - DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{ - prompt: ., - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: $n_keep, - n_predict: 256, - cache_prompt: true, - stop: ["\n### Human:"], - stream: true - }')" - - ANSWER='' - - while IFS= read -r LINE; do - if [[ $LINE = data:* ]]; then - CONTENT="$(echo "${LINE:5}" | jq -r '.content')" - printf "%s" "${CONTENT}" - ANSWER+="${CONTENT}" - fi - done < <(curl \ - --silent \ - --no-buffer \ - --request POST \ - --url "${API_URL}/completion" \ - --header "Content-Type: application/json" \ - --data-raw "${DATA}") - - printf "\n" - - CHAT+=("$1" "$(trim "$ANSWER")") -} - -while true; do - read -r -e -p "> " QUESTION - chat_completion "${QUESTION}" -done diff --git a/tools/server/main.cpp b/tools/server/main.cpp new file mode 100644 index 00000000000..7f17c56a8c2 --- /dev/null +++ b/tools/server/main.cpp @@ -0,0 +1,5 @@ +int llama_server(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_server(argc, argv); +} diff --git a/tools/server/server-chat.cpp b/tools/server/server-chat.cpp index f276f8da58f..02858a2a028 100644 --- a/tools/server/server-chat.cpp +++ b/tools/server/server-chat.cpp @@ -257,8 +257,11 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { for (json resp_tool : response_body.at("tools")) { json chatcmpl_tool; - if (json_value(resp_tool, "type", std::string()) != "function") { - throw std::invalid_argument("'type' of tool must be 'function'"); + const std::string type = json_value(resp_tool, "type", std::string()); + if (type != "function") { + // Non-function Responses tools have no Chat Completions equivalent. + SRV_WRN("unsupported Responses tool type '%s' skipped\n", type.c_str()); + continue; } resp_tool.erase("type"); chatcmpl_tool["type"] = "function"; @@ -270,7 +273,9 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { chatcmpl_tools.push_back(chatcmpl_tool); } chatcmpl_body.erase("tools"); - chatcmpl_body["tools"] = chatcmpl_tools; + if (!chatcmpl_tools.empty()) { + chatcmpl_body["tools"] = chatcmpl_tools; + } } if (response_body.contains("max_output_tokens")) { diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 3fb87c856cc..4d23f33f135 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1589,19 +1589,29 @@ json oaicompat_chat_params_parse(json & body, /* openai api auto caps = common_chat_templates_get_caps(opt.tmpls.get()); common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(messages); - inputs.tools = common_chat_tools_parse_oaicompat(tools); - inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice); - inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump(); - inputs.grammar = grammar; - inputs.use_jinja = opt.use_jinja; - inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]); - inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); - const bool continue_final_message = json_value(body, "continue_final_message", false); - if (continue_final_message && inputs.add_generation_prompt) { + inputs.messages = common_chat_msgs_parse_oaicompat(messages); + inputs.tools = common_chat_tools_parse_oaicompat(tools); + inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice); + inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump(); + inputs.grammar = grammar; + inputs.use_jinja = opt.use_jinja; + inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]); + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); + inputs.continue_final_message = body.contains("continue_final_message") ? + common_chat_continuation_parse(body.at("continue_final_message")) : + COMMON_CHAT_CONTINUATION_NONE; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_NONE && opt.prefill_assistant + && !inputs.messages.empty() && inputs.messages.back().role == "assistant") { + if (inputs.messages.size() >= 2 && inputs.messages[inputs.messages.size() - 2].role == "assistant") { + throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list."); + } + inputs.continue_final_message = COMMON_CHAT_CONTINUATION_AUTO; + inputs.add_generation_prompt = false; + } + if (inputs.continue_final_message != COMMON_CHAT_CONTINUATION_NONE && inputs.add_generation_prompt) { throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true."); } - inputs.reasoning_format = opt.reasoning_format; + inputs.reasoning_format = opt.reasoning_format; if (body.contains("reasoning_format")) { inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get()); } @@ -1630,32 +1640,6 @@ json oaicompat_chat_params_parse(json & body, /* openai api throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)"); } - // if the assistant message appears at the end of list, we do not add end-of-turn token - // for ex. this can be useful to modify the reasoning process in reasoning models - // continue_final_message is the explicit opt in alias from the vLLM/transformers API, - // equivalent to the prefill_assistant heuristic - bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" - && (continue_final_message || opt.prefill_assistant); - common_chat_msg last_message; - if (prefill_assistant_message) { - last_message = inputs.messages.back(); - inputs.messages.pop_back(); - - /* sanity check, max one assistant message at the end of the list */ - if (!inputs.messages.empty() && inputs.messages.back().role == "assistant") { - throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list."); - } - - // reject reasoning prefill on channel based templates that do not expose explicit thinking tags - if (!last_message.reasoning_content.empty() && inputs.enable_thinking) { - auto probe_params = common_chat_templates_apply(opt.tmpls.get(), inputs); - if (probe_params.supports_thinking && probe_params.thinking_end_tag.empty()) { - throw std::invalid_argument("Assistant prefill with reasoning_content is not supported yet for this template."); - } - } - - inputs.add_generation_prompt = true; - } inputs.force_pure_content = opt.force_pure_content; // Qwen3-ASR expects audio markers to be lifted into a fixed ASR prompt shape. @@ -1673,53 +1657,6 @@ json oaicompat_chat_params_parse(json & body, /* openai api chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs); } - /* Append assistant prefilled message */ - if (prefill_assistant_message) { - const bool thinking_active = chat_params.supports_thinking && !chat_params.thinking_end_tag.empty(); - const bool has_reasoning = !last_message.reasoning_content.empty(); - const bool has_content = !last_message.content.empty() || !last_message.content_parts.empty(); - const bool mid_reasoning = has_reasoning && !has_content; - - // some templates inject thinking_start in generation_prompt, others let the model emit it - const bool gp_has_think = thinking_active - && chat_params.generation_prompt.find(chat_params.thinking_start_tag) != std::string::npos; - - // open the thinking block when reasoning is present and the template did not inject it - if (has_reasoning) { - if (thinking_active && !gp_has_think) { - chat_params.prompt += chat_params.thinking_start_tag; - } - chat_params.prompt += last_message.reasoning_content; - } - - if (thinking_active) { - if (mid_reasoning) { - // model continues inside the thinking block, keep generation_prompt open on think - if (!gp_has_think) { - chat_params.generation_prompt += chat_params.thinking_start_tag; - } - } else { - // close thinking block when reasoning is followed by content, or when the template forced it open - if (has_reasoning || gp_has_think) { - chat_params.prompt += chat_params.thinking_end_tag; - } - // strip thinking_start from generation_prompt so the parser routes model output as content - auto pos = chat_params.generation_prompt.rfind(chat_params.thinking_start_tag); - if (pos != std::string::npos) { - chat_params.generation_prompt = chat_params.generation_prompt.substr(0, pos); - } - } - } - - if (!last_message.content_parts.empty()) { - for (auto & p : last_message.content_parts) { - chat_params.prompt += p.text; - } - } else { - chat_params.prompt += last_message.content; - } - } - llama_params["chat_format"] = static_cast(chat_params.format); llama_params["prompt"] = chat_params.prompt; if (!chat_params.grammar.empty()) { @@ -1742,6 +1679,16 @@ json oaicompat_chat_params_parse(json & body, /* openai api llama_params["chat_parser"] = chat_params.parser; } + llama_params["message_spans"] = json::array(); + + for (const auto & span : chat_params.message_spans) { + llama_params["message_spans"].push_back({ + { "role", span.role }, + { "pos", span.pos }, + { "len", span.len }, + }); + } + // Reasoning budget: pass parameters through to sampling layer { int reasoning_budget = opt.reasoning_budget; @@ -1754,6 +1701,7 @@ json oaicompat_chat_params_parse(json & body, /* openai api llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag; llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag; llama_params["reasoning_budget_message"] = opt.reasoning_budget_message; + llama_params["reasoning_control"] = json_value(body, "reasoning_control", false); } } @@ -1781,6 +1729,9 @@ json oaicompat_chat_params_parse(json & body, /* openai api return llama_params; } +json convert_responses_to_chatcmpl(const json & response_body); +json convert_anthropic_to_oai(const json & body); + json convert_responses_to_chatcmpl(const json & response_body) { if (!response_body.contains("input")) { throw std::invalid_argument("'input' is required"); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index fba422a5a9b..1372da3135e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3,6 +3,7 @@ #include "build-info.h" #include "common.h" +#include "fit.h" #include "llama.h" #include "log.h" #include "mtmd-helper.h" @@ -37,6 +38,21 @@ using json = nlohmann::ordered_json; constexpr int HTTP_POLLING_SECONDS = 1; +static uint32_t server_n_outputs_max(const common_params & params) { + const uint32_t n_batch = params.n_batch; + + if (params.embedding || + (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED && params.pooling_type != LLAMA_POOLING_TYPE_NONE)) { + return n_batch; + } + + const uint32_t n_outputs_per_seq = 1 + common_speculative_n_max(¶ms.speculative); + + const uint64_t n_outputs = (uint64_t) params.n_parallel * n_outputs_per_seq; + + return std::max(1, std::min(n_batch, n_outputs)); +} + // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, @@ -154,9 +170,9 @@ struct server_slot { SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size()); - llama_memory_seq_rm(llama_get_memory(ctx_tgt), id, -1, -1); + common_context_seq_rm(ctx_tgt, id, -1, -1); if (ctx_dft) { - llama_memory_seq_rm(llama_get_memory(ctx_dft), id, -1, -1); + common_context_seq_rm(ctx_dft, id, -1, -1); } prompt.tokens.clear(); @@ -263,8 +279,19 @@ struct server_slot { (ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size()); } + bool need_embd() const { + GGML_ASSERT(task); + return task->need_embd() || (spec && common_speculative_need_embd(spec)); + } + + bool need_embd_pre_norm() const { + GGML_ASSERT(task); + return spec && common_speculative_need_embd_pre_norm(spec); + } + // if the context does not have a memory module then all embeddings have to be computed within a single ubatch // also we cannot split if the pooling would require any past tokens + // (MTP supports splitting โ€” uses task->need_embd() not need_embd()) bool can_split() const { GGML_ASSERT(task); @@ -503,6 +530,10 @@ struct server_slot { t_token_generation, n_decoded, t_gen_per_tok, n_gen_per_sec); SLT_INF(*this, " total = %10.2f ms / %5d tokens\n", t_total_ms, n_total_tokens); + SLT_INF(*this, + " graphs reused = %10d\n", + llama_perf_context(ctx_tgt).n_reused); + if (n_draft_total > 0) { const float draft_ratio = (float) n_draft_accepted / n_draft_total; SLT_CNT(*this, "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", draft_ratio, @@ -525,8 +556,11 @@ struct server_slot { const auto & ptask = task ? task : task_prev; if (ptask) { - res["id_task"] = ptask->id; - res["params"] = ptask->params.to_json(only_metrics); + res["id_task"] = ptask->id; + res["n_prompt_tokens"] = (int32_t) prompt.tokens.size(); + res["n_prompt_tokens_processed"] = n_prompt_tokens_processed; + res["n_prompt_tokens_cache"] = n_prompt_tokens_cache; + res["params"] = ptask->params.to_json(only_metrics); res["next_token"] = { { { "has_next_token", has_next_token }, @@ -548,12 +582,12 @@ struct server_slot { void copy_state_to(server_slot & other) const { GGML_ASSERT(state == SLOT_STATE_DONE_PROMPT); - llama_memory_seq_rm(llama_get_memory(ctx_tgt), other.id, -1, -1); - llama_memory_seq_cp(llama_get_memory(ctx_tgt), id, other.id, -1, -1); + common_context_seq_rm(ctx_tgt, other.id, -1, -1); + common_context_seq_cp(ctx_tgt, id, other.id, -1, -1); if (ctx_dft) { - llama_memory_seq_rm(llama_get_memory(ctx_dft), other.id, -1, -1); - llama_memory_seq_cp(llama_get_memory(ctx_dft), id, other.id, -1, -1); + common_context_seq_rm(ctx_dft, other.id, -1, -1); + common_context_seq_cp(ctx_dft, id, other.id, -1, -1); } other.n_decoded = n_decoded; @@ -703,7 +737,8 @@ struct server_context_impl { server_metrics metrics; - json json_webui_settings = json::object(); + json json_ui_settings = json::object(); // Primary: new name + json json_webui_settings = json::object(); // Deprecated: use json_ui_settings instead (kept for compat) // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -731,6 +766,10 @@ struct server_context_impl { } void destroy() { + spec.reset(); + ctx_dft.reset(); + model_dft.reset(); + llama_init.reset(); ctx_tgt = nullptr; @@ -778,6 +817,127 @@ struct server_context_impl { SRV_INF("loading model '%s'\n", params.model.path.c_str()); params_base = params; + params_base.n_outputs_max = server_n_outputs_max(params_base); + + std::string & mmproj_path = params_base.mmproj.path; + bool has_mmproj = !mmproj_path.empty(); + mtmd_context_params mparams = mtmd_context_params_default(); + if (has_mmproj) { + mparams.use_gpu = params_base.mmproj_use_gpu; + mparams.print_timings = false; + mparams.n_threads = params_base.cpuparams.n_threads; + mparams.flash_attn_type = params_base.flash_attn_type; + mparams.warmup = params_base.warmup; + mparams.image_min_tokens = params_base.image_min_tokens; + mparams.image_max_tokens = params_base.image_max_tokens; + mparams.media_marker = get_media_marker(); + } + + // optionally get the memory usage of mmproj + if (has_mmproj && params_base.fit_params) { + auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); + if (!mmproj_mem.empty()) { + size_t total = 0; + for (auto & [dev, size] : mmproj_mem) { + total += size; + } + SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); + GGML_ASSERT(!params_base.fit_params_target.empty()); + for (auto & [dev, size] : mmproj_mem) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + if (ggml_backend_dev_get(i) == dev) { + if (i < params_base.fit_params_target.size()) { + SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev)); + params_base.fit_params_target[i] += size; + } + break; + } + } + } + } else { + SRV_ERR("%s", "[mtmd] failed to get memory usage of mmproj\n"); + } + } + + // optionally reserve VRAM for the draft / MTP context before fitting the target model + if (params_base.fit_params) { + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + const bool has_draft = params_base.speculative.has_dft(); + + if (has_draft || spec_mtp) { + common_params params_dft = params_base; + bool measure_model_bytes = true; + + if (has_draft) { + const auto & params_spec = params_base.speculative.draft; + params_dft.devices = params_spec.devices; + params_dft.model = params_spec.mparams; + params_dft.n_gpu_layers = params_spec.n_gpu_layers; + params_dft.cache_type_k = params_spec.cache_type_k; + params_dft.cache_type_v = params_spec.cache_type_v; + params_dft.tensor_buft_overrides = params_spec.tensor_buft_overrides; + } else { + // MTP draft context lives on the target model, only context+compute are new + measure_model_bytes = false; + } + + params_dft.n_outputs_max = params_base.n_parallel; + + auto mparams_dft = common_model_params_to_llama(params_dft); + auto cparams_dft = common_context_params_to_llama(params_dft); + if (spec_mtp) { + cparams_dft.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + cparams_dft.type_k = params_base.speculative.draft.cache_type_k; + cparams_dft.type_v = params_base.speculative.draft.cache_type_v; + } + cparams_dft.n_rs_seq = 0; + + std::vector devs; + uint32_t hp_ngl = 0; + uint32_t hp_nct = 0; + uint32_t hp_nex = 0; + try { + auto dmd = common_get_device_memory_data( + params_dft.model.path.c_str(), &mparams_dft, &cparams_dft, + devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); + + GGML_ASSERT(!params_base.fit_params_target.empty()); + size_t total = 0; + + std::vector tgt_devices = params.devices; + + if (tgt_devices.empty()) { + for(size_t i = 0; i < ggml_backend_dev_count(); ++i) { + tgt_devices.push_back(ggml_backend_dev_get(i)); + } + } + + for (size_t j = 0; j < devs.size(); ++j) { + const size_t bytes = + (measure_model_bytes ? dmd[j].mb.model : 0) + + dmd[j].mb.context + + dmd[j].mb.compute; + total += bytes; + for (size_t i = 0; i < tgt_devices.size(); i++) { + if (tgt_devices[i] == devs[j]) { + SRV_DBG("[spec] adding %.2f MiB to fit_params_target for device %s\n", + bytes / (1024.0 * 1024.0), ggml_backend_dev_name(devs[j])); + params_base.fit_params_target[i] += bytes; + break; + } + } + } + SRV_INF("[spec] estimated memory usage of %s is %.2f MiB\n", + has_draft ? "draft model" : "MTP context", + total / (1024.0 * 1024.0)); + } catch (const std::exception & e) { + SRV_ERR("[spec] failed to measure %s memory: %s\n", + has_draft ? "draft model" : "MTP context", e.what()); + } + } + } llama_init = common_init_from_params(params_base); @@ -825,15 +985,46 @@ struct server_context_impl { } auto cparams = common_context_params_to_llama(params_dft); + + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + if (spec_mtp) { + cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + } + + // note: for small models maybe we can set this to the maximum possible draft from all speculative types + // the extra memory for small models is likely negligible? + cparams.n_rs_seq = 0; ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); - } + } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end()) { + SRV_INF("creating MTP draft context against the target model '%s'\n", + params_base.model.path.c_str()); + + auto cparams_mtp = common_context_params_to_llama(params_base); + cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + cparams_mtp.type_k = params_base.speculative.draft.cache_type_k; + cparams_mtp.type_v = params_base.speculative.draft.cache_type_v; + cparams_mtp.n_rs_seq = 0; + cparams_mtp.n_outputs_max = params_base.n_parallel; + + ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); + if (ctx_dft == nullptr) { + SRV_ERR("%s", "failed to create MTP context\n"); + return false; + } - const std::string & mmproj_path = params_base.mmproj.path; + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + + params_base.speculative.draft.ctx_tgt = ctx_tgt; + params_base.speculative.draft.ctx_dft = ctx_dft.get(); + } server_vision_backend_mode selected_backend = SERVER_VISION_BACKEND_NONE; #if defined(LLAMA_SERVER_SMT_VISION) @@ -868,16 +1059,6 @@ struct server_context_impl { if (!is_resume) { mtmd_helper_log_set(common_log_default_callback, nullptr); } - mtmd_context_params mparams = mtmd_context_params_default(); - - mparams.use_gpu = params_base.mmproj_use_gpu; - mparams.print_timings = false; - mparams.n_threads = params_base.cpuparams.n_threads; - mparams.flash_attn_type = params_base.flash_attn_type; - mparams.warmup = params_base.warmup; - mparams.image_min_tokens = params_base.image_min_tokens; - mparams.image_max_tokens = params_base.image_max_tokens; - mparams.media_marker = get_media_marker(); mctx = mtmd_init_from_file(mmproj_path.c_str(), model_tgt, mparams); if (mctx == nullptr) { @@ -1045,6 +1226,13 @@ struct server_context_impl { } SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); + if (params_base.n_ctx_checkpoints > 0) { + SRV_INF("context checkpoints enabled, max = %d, min spacing = %d\n", + params_base.n_ctx_checkpoints, params_base.checkpoint_min_step); + } else { + SRV_INF("%s", "context checkpoints disabled\n"); + } + if (!params_base.model_alias.empty()) { // backward compat: use first alias as model name model_name = *params_base.model_alias.begin(); @@ -1096,13 +1284,18 @@ struct server_context_impl { } } - // populate webui settings + // populate UI settings (from either new ui_config_json or deprecated webui_config_json) { - if (!params_base.webui_config_json.empty()) { + const std::string & cfg = !params_base.ui_config_json.empty() + ? params_base.ui_config_json + : params_base.webui_config_json; + if (!cfg.empty()) { try { - json_webui_settings = json::parse(params_base.webui_config_json); + json json_settings = json::parse(cfg); + json_ui_settings = json_settings; + json_webui_settings = json_settings; // deprecated: keep in sync } catch (const std::exception & e) { - SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + SRV_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); return false; } } @@ -1173,6 +1366,20 @@ struct server_context_impl { return nullptr; } + server_slot * get_slot_by_cmpl_id(const std::string & cmpl_id) { + if (cmpl_id.empty()) { + return nullptr; + } + + for (server_slot & slot : slots) { + if (slot.is_processing() && slot.task && slot.task->params.oaicompat_cmpl_id == cmpl_id) { + return &slot; + } + } + + return nullptr; + } + server_slot * get_available_slot(const server_task & task) { server_slot * ret = nullptr; @@ -1679,7 +1886,7 @@ struct server_context_impl { return true; } - void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) { + void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress, bool is_begin = false) { auto res = std::make_unique(); res->id = slot.task->id; @@ -1691,6 +1898,9 @@ struct server_context_impl { res->progress.cache = slot.n_prompt_tokens_cache; res->progress.processed = slot.prompt.tokens.size(); res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; + } + if (is_begin) { + res->is_begin = true; } else { res->content = tkn.text_to_send; res->tokens = { tkn.tok }; @@ -2045,8 +2255,38 @@ struct server_context_impl { break; } } - } - break; + } break; + case SERVER_TASK_TYPE_CONTROL: + { + auto res = std::make_unique(); + res->id = task.id; + + server_slot * slot = get_slot_by_cmpl_id(task.params.control_cmpl_id); + if (slot == nullptr) { + res->success = false; + res->message = "no active completion for this id"; + queue_results.send(std::move(res)); + break; + } + + if (task.params.control_action == "reasoning_end") { + // the budget sampler only exists when reasoning control was armed + if (!slot->task->params.sampling.reasoning_control) { + res->success = false; + res->message = "reasoning control not enabled for this completion"; + queue_results.send(std::move(res)); + break; + } + // act on the live slot mid generation, never defer + common_sampler_reasoning_budget_force(slot->smpl.get()); + res->success = true; + } else { + res->success = false; + res->message = "unknown control action"; + } + + queue_results.send(std::move(res)); + } break; case SERVER_TASK_TYPE_NEXT_RESPONSE: { // do nothing @@ -2334,14 +2574,12 @@ struct server_context_impl { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, n_keep, n_keep + n_discard); - llama_memory_seq_add(llama_get_memory(ctx_tgt), slot.id, n_keep + n_discard, slot.prompt.n_tokens(), - -n_discard); + common_context_seq_rm (ctx_tgt, slot.id, n_keep , n_keep + n_discard); + common_context_seq_add(ctx_tgt, slot.id, n_keep + n_discard, slot.prompt.n_tokens(), -n_discard); if (ctx_dft) { - llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), slot.id, n_keep, n_keep + n_discard); - llama_memory_seq_add(llama_get_memory(ctx_dft.get()), slot.id, n_keep + n_discard, - slot.prompt.tokens.pos_next(), -n_discard); + common_context_seq_rm (ctx_dft.get(), slot.id, n_keep , n_keep + n_discard); + common_context_seq_add(ctx_dft.get(), slot.id, n_keep + n_discard, slot.prompt.tokens.pos_next(), -n_discard); } // add generated tokens to cache @@ -2447,15 +2685,23 @@ struct server_context_impl { slot.n_draft_total += draft.size(); // TODO: avoid restoring the draft context and re-evaluating the drafted tokens when not needed [TAG_SPEC_AVOID_DRAFT_REEVAL] + const bool use_ckpt_dft = ctx_dft_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; + if (ctx_dft) { - ckpt.load_dft(ctx_dft.get(), slot.id, - LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + if (use_ckpt_dft) { + ckpt.load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + } - llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), slot.id, ckpt.pos_max + 1, -1); + common_context_seq_rm(ctx_dft.get(), slot.id, ckpt.pos_max + 1, -1); } if (!draft.empty()) { - const bool use_ckpt_tgt = ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; + const bool use_ckpt_tgt = + ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || + (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS && draft.size() > llama_n_rs_seq(ctx_tgt)); + + const bool use_ckpt_dft = + (ctx_dft_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS && draft.size() > llama_n_rs_seq(ctx_dft.get())); if (use_ckpt_tgt) { //const int64_t t_start = ggml_time_us(); @@ -2472,6 +2718,10 @@ struct server_context_impl { ckpt.pos_min, ckpt.pos_max, slot.prompt.n_tokens(), (float) ckpt.size() / 1024 / 1024, (float) ckpt.data_dft.size() / 1024 / 1024); } + + if (use_ckpt_dft) { + ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + } } } @@ -2649,15 +2899,12 @@ struct server_context_impl { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, head_p, head_c); - llama_memory_seq_add(llama_get_memory(ctx_tgt), slot.id, head_c, - head_c + n_match, kv_shift); + common_context_seq_rm (ctx_tgt, slot.id, head_p, head_c); + common_context_seq_add(ctx_tgt, slot.id, head_c, head_c + n_match, kv_shift); if (ctx_dft) { - llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), slot.id, head_p, - head_c); - llama_memory_seq_add(llama_get_memory(ctx_dft.get()), slot.id, head_c, - head_c + n_match, kv_shift); + common_context_seq_rm (ctx_dft.get(), slot.id, head_p, head_c); + common_context_seq_add(ctx_dft.get(), slot.id, head_c, head_c + n_match, kv_shift); } for (size_t i = 0; i < n_match; i++) { @@ -2683,11 +2930,11 @@ struct server_context_impl { llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, pos_next - n_swa); + const auto pos_min_thold = std::max(0, pos_next - n_swa - 1); // note: disallow with multimodal contexts for now // https://github.com/ggml-org/llama.cpp/issues/17043 - if (!slot.prompt.tokens.has_mtmd && n_past > 0 && n_past < slot.prompt.n_tokens()) { + if (!slot.prompt.tokens.has_mtmd && n_past > 0 && n_past <= slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); if (pos_min == -1) { SLT_ERR(slot, @@ -2769,8 +3016,7 @@ struct server_context_impl { if (!do_reset) { // restore the context checkpoint - - it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); @@ -2828,10 +3074,15 @@ struct server_context_impl { slot.prompt.tokens.keep_first(n_past); - // send initial 0% progress update if needed // this is to signal the client that the request has started processing - if (slot.task->params.stream && slot.task->params.return_progress) { - send_partial_response(slot, {}, true); + if (slot.task->params.stream) { + if (slot.task->params.return_progress) { + // send initial 0% progress update if needed + send_partial_response(slot, {}, true); + } else { + // otherwise, for streaming without progress, signal HTTP to send the headers (i.e. 200 status) + send_partial_response(slot, {}, false, true); + } } } @@ -2851,57 +3102,9 @@ struct server_context_impl { SLT_TRC(slot, "cached n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); - if (!llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, p0, -1)) { - SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); - - slot.prompt_clear(true); - - // there is no common part left - slot.n_prompt_tokens_cache = 0; - } else { - if (ctx_dft && !llama_memory_seq_rm(llama_get_memory(ctx_dft.get()), slot.id, p0, -1)) { - GGML_ABORT("failed to truncate draft context\n"); - } - } - - // check if we should process the image - if (slot.prompt.n_tokens() < slot.task->n_tokens() && - input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { - // process the image - size_t n_tokens_out = 0; - const int64_t t_prefill_start = ggml_time_us(); - int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, smt_ctx, slot.prompt.n_tokens(), - slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); - const double t_prefill_ms = (ggml_time_us() - t_prefill_start) / 1e3; - if (res != 0) { - SLT_ERR(slot, "failed to process image, res = %d\n", res); - send_error(slot, "failed to process image", ERROR_TYPE_SERVER); - slot.release(); - continue; - } - - slot.n_prompt_tokens_processed += n_tokens_out; - - // add the image chunk to cache - { - if (input_tokens.is_smt()) { - const auto & chunk = input_tokens.find_smt_chunk(slot.prompt.n_tokens()); - // accumulate encoder and prefill timing - if (chunk.type == server_smt_media_type::audio) { - slot.t_audio_encode_ms += chunk.t_encode_ms; - slot.t_audio_prefill_ms += t_prefill_ms; - slot.n_audio_tokens += chunk.n_tokens; - } else { - slot.t_vision_encode_ms += chunk.t_encode_ms; - slot.t_vision_prefill_ms += t_prefill_ms; - slot.n_vision_tokens += chunk.n_tokens; - } - slot.prompt.tokens.push_back(chunk); // copy - } else { - const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); - slot.prompt.tokens.push_back(chunk.get()); // copy - } - } + common_context_seq_rm(ctx_tgt, slot.id, p0, -1); + if (ctx_dft) { + common_context_seq_rm(ctx_dft.get(), slot.id, p0, -1); } // If using an alora, there may be uncached tokens that come @@ -2930,8 +3133,11 @@ struct server_context_impl { // checkpoints are created only if: // - the model does not support partial sequence removal // - the model uses SWA (and we are not using `swa_full`) - do_checkpoint = - do_checkpoint && ((ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) || (n_swa > 0)); + // - the model supports partial sequence removal but only up to a fixed bound + do_checkpoint = do_checkpoint && ( + ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || + ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS || + n_swa > 0); bool has_mtmd = false; @@ -2939,9 +3145,11 @@ struct server_context_impl { while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { // process the image - size_t n_tokens_out = 0; - int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, smt_ctx, slot.prompt.n_tokens(), - slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); + size_t n_tokens_out = 0; + const int64_t t_prefill_start = ggml_time_us(); + int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, smt_ctx, slot.prompt.n_tokens(), + slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); + const double t_prefill_ms = (ggml_time_us() - t_prefill_start) / 1e3; if (res != 0) { SLT_ERR(slot, "failed to process image, res = %d\n", res); send_error(slot, "failed to process image", ERROR_TYPE_SERVER); @@ -2964,13 +3172,31 @@ struct server_context_impl { // add the image chunk to cache { - const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); - slot.prompt.tokens.push_back(chunk.get()); // copy + if (input_tokens.is_smt()) { + const auto & chunk = input_tokens.find_smt_chunk(slot.prompt.n_tokens()); + // accumulate encoder and prefill timing + if (chunk.type == server_smt_media_type::audio) { + slot.t_audio_encode_ms += chunk.t_encode_ms; + slot.t_audio_prefill_ms += t_prefill_ms; + slot.n_audio_tokens += chunk.n_tokens; + } else { + slot.t_vision_encode_ms += chunk.t_encode_ms; + slot.t_vision_prefill_ms += t_prefill_ms; + slot.n_vision_tokens += chunk.n_tokens; + } + slot.prompt.tokens.push_back(chunk); // copy + } else { + const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); + slot.prompt.tokens.push_back(chunk.get()); // copy + } } has_mtmd = true; } + const int32_t n_before_user = slot.task->params.n_before_user; + const bool n_before_user_known = n_before_user > 0; + // add prompt tokens for processing in the current batch while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) { // get next token to process @@ -2988,13 +3214,25 @@ struct server_context_impl { break; } - // embedding requires all tokens in the batch to be output - common_batch_add(batch, cur_tok, slot.prompt.tokens.pos_next(), { slot.id }, - slot.task->need_embd()); + // embedding requires all tokens in the batch to be output; + // MTP also wants logits at every prompt position so the + // streaming hook can mirror t_h_pre_norm into ctx_dft. + common_batch_add(batch, + cur_tok, + slot.prompt.tokens.pos_next(), + { slot.id }, + slot.need_embd()); slot.prompt.tokens.push_back(cur_tok); slot.n_prompt_tokens_processed++; + // stop the prompt batch exactly before the latest user input, so a checkpoint + // can be created after the previous messages + if (n_before_user_known && + slot.prompt.n_tokens() == n_before_user) { + break; + } + // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. // create checkpoints that many tokens before the end of the prompt: // - 4 + n_ubatch @@ -3020,6 +3258,8 @@ struct server_context_impl { // the number of tokens added to the batch for the current slot const auto n_tokens_cur = batch.n_tokens - n_tokens_prev; + const bool near_prompt_end = slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch; + // entire prompt has been processed if (slot.prompt.n_tokens() == slot.task->n_tokens()) { slot.state = SLOT_STATE_DONE_PROMPT; @@ -3034,48 +3274,50 @@ struct server_context_impl { slot.init_sampler(); } else { - if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) { - // near the end of the prompt - do_checkpoint = do_checkpoint && true; - } else { - // only do non-end checkpoints if the "checkpoint every n tokens" option is set - do_checkpoint = do_checkpoint && params_base.checkpoint_every_nt > 0; - - if (do_checkpoint) { - llama_pos last_checkpoint = 0; - if (!slot.prompt.checkpoints.empty()) { - last_checkpoint = slot.prompt.checkpoints.back().n_tokens; - } - - do_checkpoint = - do_checkpoint && slot.prompt.n_tokens() - batch.n_tokens - last_checkpoint >= - params_base.checkpoint_every_nt; - - if (do_checkpoint) { - SLT_INF(slot, - "%d tokens since last checkpoint at %d, creating new checkpoint during " - "processing at position %d\n", - params_base.checkpoint_every_nt, last_checkpoint, slot.prompt.n_tokens()); - } - } + // skip ordinary mid-prompt checkpoints + if (!n_before_user_known && !near_prompt_end) { + do_checkpoint = false; } } const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), slot.id); - // no need for empty or small checkpoints - do_checkpoint = do_checkpoint && (pos_min >= 0 && slot.prompt.n_tokens() >= 64); + // checkpoints are created before the current batch is decoded, so + // their token position is the batch start rather than the prompt end + const int32_t n_tokens_start = slot.prompt.n_tokens() - n_tokens_cur; + + { + const bool is_on_user = + n_before_user_known && + n_tokens_start == n_before_user; + + const bool is_after_user = + n_before_user_known && + n_tokens_start > n_before_user; + + const bool is_allowed = + !n_before_user_known || + is_on_user || + (is_after_user && near_prompt_end); + + if (do_checkpoint && !is_allowed) { + do_checkpoint = false; + } + } + + // nothing to checkpoint yet + // TODO: is this check needed? + if (do_checkpoint && pos_min < 0) { + do_checkpoint = false; + } // do not checkpoint after mtmd chunks do_checkpoint = do_checkpoint && !has_mtmd; // no need to create checkpoints that are too close together - do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || - slot.prompt.n_tokens() - n_tokens_cur > - slot.prompt.checkpoints.back().n_tokens + 64); - SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", - pos_min, pos_max); + do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step); + SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max); // note: we create the checkpoint before calling llama_decode(), so the current batch is not // yet processed and therefore it is not part of the checkpoint. @@ -3112,7 +3354,7 @@ struct server_context_impl { slot_batched->lora[alora_disabled_id].scale = alora_scale; } - llama_set_embeddings(ctx_tgt, slot_batched->task->need_embd()); + llama_set_embeddings(ctx_tgt, slot_batched->need_embd()); } if (batch.n_tokens == 0) { @@ -3376,13 +3618,8 @@ struct server_context_impl { // verify and try to accept the draft { - const bool use_ckpt_tgt = ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL; - - // only save the sampler sampler state if we use checkpoints - common_sampler_ptr smpl_save; - if (use_ckpt_tgt) { - smpl_save.reset(common_sampler_clone(slot.smpl.get())); - } + // save the sampler sampler state in case we need to restore it + common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get())); GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1); auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx_tgt, slot.spec_i_batch, @@ -3391,8 +3628,14 @@ struct server_context_impl { GGML_ASSERT(accepted.size() >= 1); + const uint32_t n_rollback = slot.spec_draft.size() + 1 - accepted.size(); + + const bool use_ckpt_tgt = + ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || + (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS && n_rollback > llama_n_rs_seq(ctx_tgt)); + // check for partial draft acceptance - if (accepted.size() < slot.spec_draft.size() + 1) { + if (n_rollback > 0) { if (use_ckpt_tgt) { if (trace > 0) { SLT_INF(slot, "accepted %2zu/%2zu draft tokens (restore checkpoint)\n", @@ -3411,14 +3654,14 @@ struct server_context_impl { ckpt.load_tgt(slot.ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - llama_memory_seq_rm(llama_get_memory(slot.ctx_tgt), slot.id, ckpt.pos_max + 1, -1); + common_context_seq_rm(slot.ctx_tgt, slot.id, ckpt.pos_max + 1, -1); } if (slot.ctx_dft) { ckpt.load_dft(slot.ctx_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - llama_memory_seq_rm(llama_get_memory(slot.ctx_dft), slot.id, ckpt.pos_max + 1, -1); + common_context_seq_rm(slot.ctx_dft, slot.id, ckpt.pos_max + 1, -1); } slot.prompt.tokens.keep_first(ckpt.n_tokens); @@ -3441,7 +3684,6 @@ struct server_context_impl { const auto ids = std::move(slot.spec_draft); - slot.n_decoded += ids.size(); slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; // update how many tokens out of those tested were accepted @@ -3455,9 +3697,9 @@ struct server_context_impl { SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft); - llama_memory_seq_rm(llama_get_memory(slot.ctx_tgt), slot.id, slot.prompt.tokens.pos_next(), -1); + common_context_seq_rm(slot.ctx_tgt, slot.id, slot.prompt.tokens.pos_next(), -1); if (slot.ctx_dft) { - llama_memory_seq_rm(llama_get_memory(slot.ctx_dft), slot.id, slot.prompt.tokens.pos_next(), -1); + common_context_seq_rm(slot.ctx_dft, slot.id, slot.prompt.tokens.pos_next(), -1); } for (size_t i = 0; i < ids.size(); ++i) { @@ -3470,6 +3712,8 @@ struct server_context_impl { // TODO: set result.probs + slot.n_decoded += 1; + if (!process_token(result, slot)) { slot.print_timings(); send_final_response(slot); @@ -3542,7 +3786,8 @@ server_context_meta server_context::get_meta() const { /* has_mtmd */ impl->has_multimodal(), /* has_inp_image */ impl->chat_params.allow_image, /* has_inp_audio */ impl->chat_params.allow_audio, - /* json_webui_settings */ impl->json_webui_settings, + /* json_ui_settings */ impl->json_ui_settings, + /* json_webui_settings */ impl->json_webui_settings, // Deprecated /* slot_n_ctx */ impl->get_slot_n_ctx(), /* pooling_type */ llama_pooling_type(impl->ctx_tgt), @@ -3603,6 +3848,61 @@ void server_context::on_sleeping_changed(std::function callback) { impl->queue_tasks.on_sleeping_state(std::move(callback)); } +// compute the number of tokens before the last user message in the prompt +static int32_t prompt_get_n_before_user( + const json & message_spans, + const std::string & prompt, + const std::vector & files, + const llama_vocab * vocab, + mtmd_context * mctx, + server_smt_vision_context * smt_ctx) { + int32_t result = -1; + int32_t byte_pos = -1; + + for (const auto & span : message_spans) { + const std::string role = json_value(span, "role", std::string()); + + if (role == "user") { + byte_pos = json_value(span, "pos", -1); + } + } + + if (byte_pos >= 0) { + GGML_ASSERT((size_t) byte_pos <= prompt.size()); + + const std::string prefix = prompt.substr(0, (size_t) byte_pos); + + const std::string marker = get_media_marker(); + size_t n_prefix_media = 0; + for (size_t pos = 0; (pos = prefix.find(marker, pos)) != std::string::npos; pos += marker.size()) { + n_prefix_media++; + } + + GGML_ASSERT(n_prefix_media <= files.size()); + + if (n_prefix_media > 0) { + // TODO: this makes a copy - avoid it + std::vector prefix_files(files.begin(), files.begin() + n_prefix_media); + + if (mctx != nullptr) { + result = (int32_t) process_mtmd_prompt(mctx, prefix, prefix_files).size(); + } else if (smt_ctx != nullptr) { + result = (int32_t) process_smt_prompt(smt_ctx, vocab, prefix, prefix_files).size(); + } else { + result = (int32_t) tokenize_input_prompts(vocab, nullptr, nullptr, prefix, true, true)[0].size(); + } + } else { + result = (int32_t) tokenize_input_prompts(vocab, nullptr, nullptr, prefix, true, true)[0].size(); + } + + SRV_TRC("message_spans: last user message: byte_pos=%d, media=%zu, n_before_user=%d\n", + byte_pos, n_prefix_media, result); + } + + return result; +} + + // // server_routes // @@ -3648,9 +3948,26 @@ std::unique_ptr server_routes::handle_completions_impl(con task.id = rd.get_new_id(); - task.tokens = std::move(inputs[i]); - task.params = server_task::params_from_json_cmpl(ctx_server.vocab, params, meta->slot_n_ctx, - meta->logit_bias_eog, data); + task.tokens = std::move(inputs[i]); + task.params = server_task::params_from_json_cmpl( + ctx_server.vocab, + params, + meta->slot_n_ctx, + meta->logit_bias_eog, + data); + + const auto message_spans = json_value(data, "message_spans", json::array()); + if (prompt.is_string() && message_spans.is_array()) { + task.params.n_before_user = + prompt_get_n_before_user( + message_spans, + prompt.get(), + files, + ctx_server.vocab, + ctx_server.mctx, + ctx_server.smt_ctx); + } + task.id_slot = json_value(data, "id_slot", -1); // OAI-compat @@ -3728,7 +4045,9 @@ std::unique_ptr server_routes::handle_completions_impl(con // next responses are streamed // to be sent immediately json first_result_json = first_result->to_json(); - if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + if (first_result_json == nullptr) { + res->data = ""; // simply send HTTP headers and status code + } else if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { res->data = format_anthropic_sse(first_result_json); } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) { res->data = format_oai_resp_sse(first_result_json); @@ -4055,29 +4374,33 @@ void server_routes::init_routes() { std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use"); json props = { - {"default_generation_settings", default_generation_settings_for_props}, - { "total_slots", params.n_parallel }, - { "model_alias", meta->model_name }, - { "model_path", meta->model_path }, - { "media_backend", meta->media_backend }, - { "vision_backend", meta->media_backend }, - { "modalities", - json{ - { "vision", meta->has_inp_image }, - { "audio", meta->has_inp_audio }, - } }, - { "media_marker", get_media_marker() }, - { "endpoint_slots", params.endpoint_slots }, - { "endpoint_props", params.endpoint_props }, - { "endpoint_metrics", params.endpoint_metrics }, - { "webui", params.webui }, - { "webui_settings", meta->json_webui_settings }, - { "chat_template", tmpl_default }, - { "chat_template_caps", meta->chat_template_caps }, - { "bos_token", meta->bos_token_str }, - { "eos_token", meta->eos_token_str }, - { "build_info", meta->build_info }, - { "is_sleeping", queue_tasks.is_sleeping() }, + { "default_generation_settings", default_generation_settings_for_props }, + { "total_slots", params.n_parallel }, + { "model_alias", meta->model_name }, + { "model_path", meta->model_path }, + { "media_backend", meta->media_backend }, + { "vision_backend", meta->media_backend }, + { "modalities", json { + {"vision", meta->has_inp_image}, + {"audio", meta->has_inp_audio}, + } }, + { "media_marker", get_media_marker() }, + { "endpoint_slots", params.endpoint_slots }, + { "endpoint_props", params.endpoint_props }, + { "endpoint_metrics", params.endpoint_metrics }, + // New keys + { "ui", params.ui }, + { "ui_settings", meta->json_ui_settings }, + // Deprecated: use ui/ui_settings instead (kept for backward compat) + { "webui", params.webui }, + { "webui_settings", meta->json_webui_settings }, + { "chat_template", tmpl_default }, + { "chat_template_caps", meta->chat_template_caps }, + { "bos_token", meta->bos_token_str }, + { "eos_token", meta->eos_token_str }, + { "build_info", meta->build_info }, + { "is_sleeping", queue_tasks.is_sleeping() }, + { "cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy }, }; if (params.use_jinja) { if (!tmpl_tools.empty()) { @@ -4202,6 +4525,43 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_CHAT); }; + this->post_control = [this](const server_http_req & req) { + auto res = create_response(); + const json body = json::parse(req.body); + + const std::string cmpl_id = json_value(body, "id", std::string()); + const std::string action = json_value(body, "action", std::string()); + if (cmpl_id.empty()) { + res->error(format_error_response("missing completion id", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + if (action != "reasoning_end") { + res->error(format_error_response("unknown control action", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + auto & rd = res->rd; + { + server_task task(SERVER_TASK_TYPE_CONTROL); + task.id = rd.get_new_id(); + task.params.control_cmpl_id = cmpl_id; + task.params.control_action = action; + rd.post_task(std::move(task)); + } + + auto result = rd.next(req.should_stop); + if (!result) { + GGML_ASSERT(req.should_stop()); + return res; + } + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + res->ok(result->to_json()); + return res; + }; + this->post_responses_oai = [this](const server_http_req & req) { auto res = create_response(); std::vector files; @@ -4692,7 +5052,7 @@ std::unique_ptr server_routes::handle_embeddings_impl(cons } } - int embd_normalize = 2; // default to Euclidean/L2 norm + int embd_normalize = params.embd_normalize; if (body.count("embd_normalize") != 0) { embd_normalize = body.at("embd_normalize"); if (meta->pooling_type == LLAMA_POOLING_TYPE_NONE) { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 96d0513ca90..77a935d47b9 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -22,7 +22,8 @@ struct server_context_meta { bool has_mtmd; bool has_inp_image; bool has_inp_audio; - json json_webui_settings; + json json_ui_settings; // Primary: new name + json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat) int slot_n_ctx; enum llama_pooling_type pooling_type; @@ -110,6 +111,7 @@ struct server_routes { server_http_context::handler_t post_completions; server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_control; server_http_context::handler_t post_responses_oai; server_http_context::handler_t post_transcriptions_oai; server_http_context::handler_t post_anthropic_messages; diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index af4536fdd77..34a20c9d22d 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -1,23 +1,16 @@ #include "common.h" #include "server-http.h" #include "server-common.h" +#include "ui.h" #include -#include #include #include +#include #include #include -#ifdef LLAMA_BUILD_WEBUI -// auto generated files (see README.md for details) -#include "index.html.hpp" -#include "bundle.js.hpp" -#include "bundle.css.hpp" -#include "loading.html.hpp" -#endif - // // HTTP implementation using cpp-httplib // @@ -28,7 +21,7 @@ class server_http_context::Impl { }; server_http_context::server_http_context() - : pimpl(std::make_unique()) + : pimpl(std::make_unique()) {} server_http_context::~server_http_context() = default; @@ -69,7 +62,7 @@ struct gcp_params { } static std::string getenv(const char * name, const std::string & default_value, bool ensure_leading_slash = false) { - const char * value = std::getenv(name); + const auto * value = std::getenv(name); if (value == nullptr || value[0] == '\0') { return default_value; } @@ -101,14 +94,15 @@ bool server_http_context::init(const common_params & params) { auto & srv = pimpl->srv; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { + if (!params.ssl_file_key.empty() && !params.ssl_file_cert.empty()) { SRV_INF("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); - srv.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) + srv = std::make_unique( + params.ssl_file_cert.c_str(), params.ssl_file_key.c_str() ); + is_ssl = true; } else { SRV_INF("%s", "running without SSL\n"); - srv.reset(new httplib::Server()); + srv = std::make_unique(); } #else if (params.ssl_file_key != "" && params.ssl_file_cert != "") { @@ -156,7 +150,7 @@ bool server_http_context::init(const common_params & params) { // set timeouts and change hostname and port srv->set_read_timeout (params.timeout_read); srv->set_write_timeout(params.timeout_write); - srv->set_socket_options([reuse_port = params.reuse_port](socket_t sock) { + srv->set_socket_options([reuse_port = params.reuse_port](const socket_t sock) { httplib::set_socket_opt(sock, SOL_SOCKET, SO_REUSEADDR, 1); if (reuse_port) { #ifdef SO_REUSEPORT @@ -168,8 +162,8 @@ bool server_http_context::init(const common_params & params) { }); if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - std::string substr = key.substr(std::max((int)(key.length() - 4), 0)); + const auto key = params.api_keys[0]; + const std::string substr = key.substr(std::max(static_cast(key.length() - 4), 0)); SRV_INF("api_keys: ****%s\n", substr.c_str()); } else if (params.api_keys.size() > 1) { SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size()); @@ -209,7 +203,7 @@ bool server_http_context::init(const common_params & params) { } // remove the "Bearer " prefix if needed - std::string prefix = "Bearer "; + static std::string prefix = "Bearer "; if (req_api_key.substr(0, prefix.size()) == prefix) { req_api_key = req_api_key.substr(prefix.size()); } @@ -238,16 +232,18 @@ bool server_http_context::init(const common_params & params) { }; auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { - (void)req; // suppress unused parameter warning when LLAMA_BUILD_WEBUI is not defined - bool ready = is_ready.load(); - if (!ready) { -#ifdef LLAMA_BUILD_WEBUI - auto tmp = string_split(req.path, '.'); - if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) { - res.status = 503; - res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); - return false; + if (!is_ready.load()) { +#if defined(LLAMA_UI_HAS_ASSETS) + if (const auto tmp = string_split(req.path, '.'); + req.path == "/" || (!tmp.empty() && tmp.back() == "html")) { + if (const llama_ui_asset * a = llama_ui_find_asset("loading.html")) { + res.status = 503; + res.set_content(reinterpret_cast(a->data), a->size, "text/html; charset=utf-8"); + return false; + } } +#else + (void)req; #endif // no endpoints are allowed to be accessed when the server is not ready // this is to prevent any data races or inconsistent states @@ -287,17 +283,17 @@ bool server_http_context::init(const common_params & params) { return httplib::Server::HandlerResponse::Unhandled; }); - int n_threads_http = params.n_threads_http; + auto n_threads_http = params.n_threads_http; if (n_threads_http < 1) { // +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future - n_threads_http = std::max(params.n_parallel + 4, (int32_t) std::thread::hardware_concurrency() - 1); + n_threads_http = std::max(params.n_parallel + 4, static_cast(std::thread::hardware_concurrency() - 1)); } SRV_INF("using %d threads for HTTP server\n", n_threads_http); srv->new_task_queue = [n_threads_http] { // spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads // when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request // ref: https://github.com/yhirose/cpp-httplib/pull/2368 - size_t max_threads = (size_t)n_threads_http + 1024; + const auto max_threads = static_cast(n_threads_http + 1024); return new httplib::ThreadPool(n_threads_http, max_threads); }; @@ -305,35 +301,47 @@ bool server_http_context::init(const common_params & params) { // Web UI setup // - if (!params.webui) { - SRV_INF("%s", "the WebUI is disabled\n"); + // Use new `params.ui` field (backed by old `params.webui` for compat) + if (!params.ui) { + SRV_INF("%s", "The UI is disabled\n"); + SRV_INF("%s", "Use --ui/--no-ui (or deprecated --webui/--no-webui) to enable/disable\n"); } else { // register static assets routes if (!params.public_path.empty()) { // Set the base directory for serving static files - bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path); - if (!is_found) { + if (const auto is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path); !is_found) { SRV_ERR("static assets path not found: %s\n", params.public_path.c_str()); - return 1; + return false; } } else { -#ifdef LLAMA_BUILD_WEBUI - // using embedded static index.html - srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) { - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); - return false; - }); - srv->Get(params.api_prefix + "/bundle.js", [](const httplib::Request & /*req*/, httplib::Response & res) { - res.set_content(reinterpret_cast(bundle_js), bundle_js_len, "application/javascript; charset=utf-8"); - return false; - }); - srv->Get(params.api_prefix + "/bundle.css", [](const httplib::Request & /*req*/, httplib::Response & res) { - res.set_content(reinterpret_cast(bundle_css), bundle_css_len, "text/css; charset=utf-8"); - return false; - }); +#if defined(LLAMA_UI_HAS_ASSETS) + auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) { + return [name, mime, with_isolation_headers](const httplib::Request & req, httplib::Response & res) { + const llama_ui_asset * a = llama_ui_find_asset(name.c_str()); + if (!a) { + res.status = 404; + return false; + } + res.set_header("ETag", a->etag); + // Check If-None-Match for conditional GET (304 Not Modified) + if (const std::string & inm = req.get_header_value("If-None-Match"); + !inm.empty() && (inm == a->etag || inm == std::string("W/") + a->etag)) { + res.status = 304; + return false; + } + if (with_isolation_headers) { + // COEP and COOP headers, required by pyodide (python interpreter) + res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); + } + res.set_content(reinterpret_cast(a->data), a->size, mime); + return false; + }; + }; + + srv->Get(params.api_prefix + "/", serve_asset("index.html", "text/html; charset=utf-8", true)); + srv->Get(params.api_prefix + "/bundle.js", serve_asset("bundle.js", "application/javascript; charset=utf-8", false)); + srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8", false)); #endif } } @@ -343,9 +351,9 @@ bool server_http_context::init(const common_params & params) { bool server_http_context::start() { // Bind and listen - auto & srv = pimpl->srv; - bool was_bound = false; - bool is_sock = false; + const auto & srv = pimpl->srv; + auto was_bound = false; + auto is_sock = false; if (string_ends_with(std::string(hostname), ".sock")) { is_sock = true; SRV_INF("%s", "setting address family to AF_UNIX\n"); @@ -357,7 +365,7 @@ bool server_http_context::start() { SRV_INF("%s", "binding port with default address family\n"); // bind HTTP listen port if (port == 0) { - int bound_port = srv->bind_to_any_port(hostname); + const auto bound_port = srv->bind_to_any_port(hostname); was_bound = (bound_port >= 0); if (was_bound) { port = bound_port; @@ -373,11 +381,11 @@ bool server_http_context::start() { } // run the HTTP server in a thread - thread = std::thread([this]() { pimpl->srv->listen_after_bind(); }); + thread = std::thread([this] { pimpl->srv->listen_after_bind(); }); srv->wait_until_ready(); - listening_address = is_sock ? string_format("unix://%s", hostname.c_str()) - : string_format("http://%s:%d", hostname.c_str(), port); + listening_address = is_sock ? string_format("unix://%s", hostname.c_str()) + : string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port); return true; } @@ -430,13 +438,13 @@ static void process_handler_response(server_http_req_ptr && request, server_http if (response->is_stream()) { res.status = response->status; set_headers(res, response->headers); - std::string content_type = response->content_type; + const std::string content_type = response->content_type; // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it - std::shared_ptr q_ptr = std::move(request); - std::shared_ptr r_ptr = std::move(response); - const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { + std::shared_ptr q_ptr = std::move(request); + std::shared_ptr r_ptr = std::move(response); + const auto chunked_content_provider = [response = r_ptr](size_t, const httplib::DataSink & sink) -> bool { std::string chunk; - bool has_next = response->next(chunk); + const bool has_next = response->next(chunk); if (!chunk.empty()) { if (!sink.write(chunk.data(), chunk.size())) { return false; @@ -547,7 +555,7 @@ static std::string path_to_gcp_format(const std::string & path) { if (c == '/' || c == '-' || c == '_') { cap = true; } else { - result += cap ? (char)std::toupper(c) : (char)c; + result += static_cast(cap ? std::toupper(c) : c); cap = false; } } @@ -571,7 +579,7 @@ static json parse_gcp_predict_response(const server_http_res_ptr & res) { } } -void server_http_context::register_gcp_compat() { +void server_http_context::register_gcp_compat() const { const gcp_params gcp; if (!gcp.enabled) { @@ -592,7 +600,7 @@ void server_http_context::register_gcp_compat() { } if (!gcp.path_health.empty()) { - auto health_handler = handlers.find("/health"); + const auto health_handler = handlers.find("/health"); GGML_ASSERT(health_handler != handlers.end()); get(gcp.path_health, health_handler->second); } diff --git a/tools/server/server-http.h b/tools/server/server-http.h index 66ee555f501..fede8c8f30a 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -73,7 +73,8 @@ struct server_http_context { std::string path_prefix; std::string hostname; - int port; + int port = 8080; + bool is_ssl = false; server_http_context(); ~server_http_context(); @@ -87,7 +88,7 @@ struct server_http_context { // Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict) // Must be called AFTER all other API routes are registered - void register_gcp_compat(); + void register_gcp_compat() const; // for debugging std::string listening_address; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 698489a11e2..49b0e423f46 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str // TODO: maybe validate preset before rendering ? // render args args = preset.to_args(bin_path); + + // unified binary dispatches by subcommand, re-inject it right after the + // binary path so the child starts as 'llama serve ...' not 'llama ...' + const char * app_cmd = std::getenv("LLAMA_APP_CMD"); + if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) { + args.insert(args.begin() + 1, app_cmd); + } } void server_model_meta::update_caps() { @@ -172,7 +180,8 @@ void server_model_meta::update_caps() { "LLAMA_ARG_HF_REPO", "LLAMA_ARG_HF_REPO_FILE", }); - params.offline = true; // avoid any unwanted network call during capability detection + params.offline = true; + // params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time common_params_handle_models(params, LLAMA_EXAMPLE_SERVER); if (params.mmproj.path.empty()) { multimodal = { false, false }; @@ -363,18 +372,19 @@ void server_models::load_models() { // FIRST LOAD: add all models, then unlock for autoloading for (const auto & [name, preset] : final_presets) { server_model_meta meta{ - /* preset */ preset, - /* name */ name, - /* aliases */ {}, - /* tags */ {}, - /* port */ 0, - /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0, - /* args */ std::vector(), - /* loaded_info */ {}, - /* exit_code */ 0, - /* stop_timeout */ DEFAULT_STOP_TIMEOUT, - /* multimodal */ mtmd_caps{false, false}, + /* preset */ preset, + /* name */ name, + /* aliases */ {}, + /* tags */ {}, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* loaded_info */ {}, + /* exit_code */ 0, + /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + /* multimodal */ mtmd_caps{false, false}, + /* need_download */ false, }; add_model(std::move(meta)); } @@ -516,18 +526,19 @@ void server_models::load_models() { for (const auto & [name, preset] : final_presets) { if (mapping.find(name) == mapping.end()) { server_model_meta meta{ - /* preset */ preset, - /* name */ name, - /* aliases */ {}, - /* tags */ {}, - /* port */ 0, - /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0, - /* args */ std::vector(), - /* loaded_info */ {}, - /* exit_code */ 0, - /* stop_timeout */ DEFAULT_STOP_TIMEOUT, - /* multimodal */ mtmd_caps{false, false}, + /* preset */ preset, + /* name */ name, + /* aliases */ {}, + /* tags */ {}, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* loaded_info */ {}, + /* exit_code */ 0, + /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + /* multimodal */ mtmd_caps{false, false}, + /* need_download */ false, }; add_model(std::move(meta)); newly_added.push_back(name); @@ -798,9 +809,10 @@ void server_models::load(const std::string & name) { std::thread log_thread([&]() { // read stdout/stderr and forward to main server log // also handle status report from child process + std::vector vec_buf(128 * 1024); // large buffer for storing info + char * buffer = vec_buf.data(); if (stdout_file) { - char buffer[128 * 1024]; // large buffer for storing info - while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { + while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { @@ -1152,15 +1164,19 @@ void server_models_routes::init_routes() { {"role", "router"}, {"max_instances", params.models_max}, {"models_autoload", params.models_autoload}, - // this is a dummy response to make sure webui doesn't break + // this is a dummy response to make sure the UI doesn't break {"model_alias", "llama-server"}, {"model_path", "none"}, {"default_generation_settings", { {"params", json{}}, {"n_ctx", 0}, }}, - {"webui_settings", webui_settings}, + // New key + {"ui_settings", ui_settings}, + // Deprecated: use ui_settings instead (kept for backward compat) + {"webui_settings", webui_settings}, {"build_info", std::string(llama_build_info())}, + {"cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy}, }); return res; } @@ -1250,14 +1266,15 @@ void server_models_routes::init_routes() { }; json model_info = json { - {"id", meta.name}, - {"aliases", meta.aliases}, - {"tags", meta.tags}, - {"object", "model"}, // for OAI-compat - {"owned_by", "llamacpp"}, // for OAI-compat - {"created", t}, // for OAI-compat - {"status", status}, - {"architecture", architecture}, + {"id", meta.name}, + {"aliases", meta.aliases}, + {"tags", meta.tags}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat + {"status", status}, + {"architecture", architecture}, + {"need_download", meta.need_download}, // TODO: add other fields, may require reading GGUF metadata }; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index f1206c71448..2198589a7aa 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -67,6 +67,7 @@ struct server_model_meta { int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown mtmd_caps multimodal; // multimodal capabilities + bool need_download = false; // whether the model needs to be downloaded before loading bool is_ready() const { return status == SERVER_MODEL_STATUS_LOADED; @@ -175,15 +176,22 @@ struct server_models { struct server_models_routes { common_params params; - json webui_settings = json::object(); + json ui_settings = json::object(); // Primary: new name + json webui_settings = json::object(); // Deprecated: use ui_settings (kept for compat) server_models models; server_models_routes(const common_params & params, int argc, char ** argv) : params(params), models(params, argc, argv) { - if (!this->params.webui_config_json.empty()) { + // Support both new ui_config_json and deprecated webui_config_json + const std::string & cfg = !this->params.ui_config_json.empty() + ? this->params.ui_config_json + : this->params.webui_config_json; + if (!cfg.empty()) { try { - webui_settings = json::parse(this->params.webui_config_json); + json json_settings = json::parse(cfg); + ui_settings = json_settings; + webui_settings = json_settings; // Deprecated: keep in sync } catch (const std::exception & e) { - LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + LOG_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); throw; } } diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp index d5fceb1b131..588e1a82b18 100644 --- a/tools/server/server-queue.cpp +++ b/tools/server/server-queue.cpp @@ -381,8 +381,10 @@ server_task_result_ptr server_response_reader::next(const std::function if (result == nullptr) { // timeout, check stop condition if (should_stop()) { - SRV_WRN("%s", "stopping wait for next result due to should_stop condition (adjust the --timeout argument if needed)\n"); - SRV_WRN("%s", "ref: https://github.com/ggml-org/llama.cpp/pull/22907\n"); + const int64_t time_elapsed_ms = ggml_time_ms() - time_start_ms; + if (time_elapsed_ms > 30000) { + SRV_WRN("%s", "request cancelled after 30s, potentially a client-side timeout; please check your client's code\n"); + } return nullptr; } } else { diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 35f010401fc..8ce32c69fb0 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -169,6 +169,8 @@ struct server_response_reader { bool cancelled = false; int polling_interval_seconds; + const int64_t time_start_ms = ggml_time_ms(); + // tracking generation state and partial tool calls // only used by streaming completions std::vector states; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index cbc40a35fc4..33de2e4d9ca 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -144,6 +144,17 @@ json task_params::to_json(bool only_metrics) const { // // task_result_state // +task_result_state::task_result_state(const common_chat_parser_params & chat_parser_params) + : chat_parser_params(chat_parser_params) + , oai_resp_id("resp_" + random_string()) + , oai_resp_reasoning_id("rs_" + random_string()) + , oai_resp_message_id("msg_" + random_string()) { + if (chat_parser_params.is_continuation && !chat_parser_params.echo) { + // initialize chat_msg to avoid emitting a delta containing the assistant prefill + chat_msg = common_chat_parse("", true, chat_parser_params); + } +} + common_chat_msg task_result_state::update_chat_msg( const std::string & text_added, bool is_partial, @@ -421,6 +432,11 @@ task_params server_task::params_from_json_cmpl( if (data.contains("chat_parser")) { params.chat_parser_params.parser.load(data.at("chat_parser").get()); } + if (data.contains("continue_final_message")) { + auto continuation = common_chat_continuation_parse(data.at("continue_final_message")); + params.chat_parser_params.is_continuation = continuation != COMMON_CHAT_CONTINUATION_NONE; + } + params.chat_parser_params.echo = json_value(data, "echo", false); } { @@ -483,6 +499,7 @@ task_params server_task::params_from_json_cmpl( const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string()); const auto message = json_value(data, "reasoning_budget_message", std::string()); params.sampling.reasoning_budget_tokens = budget; + params.sampling.reasoning_control = json_value(data, "reasoning_control", false); if (!start_tag.empty()) { params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true); @@ -1406,6 +1423,9 @@ void server_task_result_cmpl_partial::update(task_result_state & state) { json server_task_result_cmpl_partial::to_json() { GGML_ASSERT(is_updated && "update() must be called before to_json()"); + if (is_begin) { + return nullptr; // simply signal to HTTP handler to send the headers and status code + } switch (res_type) { case TASK_RESPONSE_TYPE_NONE: return to_json_non_oaicompat(); diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 64bdecd794f..bdadcff7652 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -19,6 +19,7 @@ enum server_task_type { SERVER_TASK_TYPE_RERANK, SERVER_TASK_TYPE_INFILL, SERVER_TASK_TYPE_CANCEL, + SERVER_TASK_TYPE_CONTROL, SERVER_TASK_TYPE_NEXT_RESPONSE, SERVER_TASK_TYPE_METRICS, SERVER_TASK_TYPE_SLOT_SAVE, @@ -47,7 +48,7 @@ enum stop_type { }; struct task_params { - bool stream = true; + bool stream = false; bool include_usage = false; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt bool return_tokens = false; @@ -61,6 +62,9 @@ struct task_params { int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled) + // number of prompt tokens before the latest user message + int32_t n_before_user = -1; + int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit @@ -81,6 +85,10 @@ struct task_params { std::string oaicompat_model; std::string oaicompat_cmpl_id; + // realtime control (SERVER_TASK_TYPE_CONTROL) + std::string control_action; + std::string control_cmpl_id; + // per-request parameters for chat parsing common_chat_parser_params chat_parser_params; @@ -112,11 +120,7 @@ struct task_result_state { const std::string oai_resp_message_id; std::string oai_resp_fc_id; // function call ID for current args delta - task_result_state(const common_chat_parser_params & chat_parser_params) - : chat_parser_params(chat_parser_params) - , oai_resp_id("resp_" + random_string()) - , oai_resp_reasoning_id("rs_" + random_string()) - , oai_resp_message_id("msg_" + random_string()) {} + task_result_state(const common_chat_parser_params & chat_parser_params); // parse partial tool calls and update the internal state common_chat_msg update_chat_msg( @@ -419,6 +423,8 @@ struct server_task_result_cmpl_partial : server_task_result { bool post_sampling_probs; bool is_progress = false; + bool is_begin = false; // whether to send 200 status to HTTP client (begin of SSE stream) + // ref: https://github.com/ggml-org/llama.cpp/pull/23884 completion_token_output prob_output; result_timings timings; result_prompt_progress progress; @@ -550,6 +556,19 @@ struct server_task_result_slot_erase : server_task_result { virtual json to_json() override; }; +struct server_task_result_control : server_task_result { + bool success = false; + std::string message; // optional detail when success is false + + virtual json to_json() override { + json out = json { { "success", success } }; + if (!message.empty()) { + out["message"] = message; + } + return out; + } +}; + struct server_task_result_get_lora : server_task_result { struct lora { common_adapter_lora_info info; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 823ae5bda36..769e80a802f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -71,7 +71,10 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t }; } -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_server(int argc, char ** argv); + +int llama_server(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); // own arguments required by this example @@ -86,7 +89,10 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - common_params_print_info(params); + // router server never loads a model and must not touch the GPU + // skip device enumeration so the CUDA primary context stays uncreated + const bool is_router_server = params.model.path.empty(); + common_params_print_info(params, !is_router_server); // validate batch size for embeddings // embeddings require all tokens to be processed in a single ubatch @@ -126,7 +132,6 @@ int main(int argc, char ** argv) { server_routes routes(params, ctx_server); server_tools tools; - bool is_router_server = params.model.path.empty(); std::optional models_routes{}; if (is_router_server) { // setup server instances manager @@ -144,6 +149,7 @@ int main(int argc, char ** argv) { routes.post_completions = models_routes->proxy_post; routes.post_completions_oai = models_routes->proxy_post; routes.post_chat_completions = models_routes->proxy_post; + routes.post_control = models_routes->proxy_post; routes.post_responses_oai = models_routes->proxy_post; routes.post_transcriptions_oai = models_routes->proxy_post; routes.post_anthropic_messages = models_routes->proxy_post; @@ -180,6 +186,7 @@ int main(int argc, char ** argv) { ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/v1/chat/completions/control", ex_wrapper(routes.post_control)); ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai)); ctx_http.post("/responses", ex_wrapper(routes.post_responses_oai)); ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); @@ -208,7 +215,8 @@ int main(int argc, char ** argv) { ctx_http.register_gcp_compat(); // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP) - if (params.webui_mcp_proxy) { + // Supports both new ui_mcp_proxy and deprecated webui_mcp_proxy fields + if (params.ui_mcp_proxy || params.webui_mcp_proxy) { SRV_WRN("%s", "-----------------\n"); SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n"); SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n"); diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 243e4160578..f80e46133c7 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -158,11 +158,12 @@ def test_chat_template(): @pytest.mark.parametrize("prefill,re_prefill", [ ("Whill", "Whill"), - ([{"type": "text", "text": "Wh"}, {"type": "text", "text": "ill"}], "Whill"), + ([{"type": "text", "text": "Wh"}, {"type": "text", "text": "ill"}], "Wh\n\nill"), ]) def test_chat_template_assistant_prefill(prefill, re_prefill): global server - server.chat_template = "llama3" + server.jinja = True + server.chat_template_file = "../../../models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja" server.debug = True # to get the "__verbose" object in the response server.start() res = server.make_request("POST", "/chat/completions", data={ @@ -175,14 +176,15 @@ def test_chat_template_assistant_prefill(prefill, re_prefill): }) assert res.status_code == 200 assert "__verbose" in res.body - assert res.body["__verbose"]["prompt"] == f" <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}" + assert res.body["__verbose"]["prompt"].endswith(f"<|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}") def test_chat_template_continue_final_message_vllm_compat(): """continue_final_message is the vLLM/transformers explicit alias for the prefill_assistant heuristic. Both must produce the same prompt.""" global server - server.chat_template = "llama3" + server.jinja = True + server.chat_template_file = "../../../models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja" server.debug = True server.start() res = server.make_request("POST", "/chat/completions", data={ @@ -197,7 +199,7 @@ def test_chat_template_continue_final_message_vllm_compat(): }) assert res.status_code == 200 assert "__verbose" in res.body - assert res.body["__verbose"]["prompt"] == " <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill" + assert res.body["__verbose"]["prompt"].endswith("<|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill") def test_chat_template_continue_final_message_mutual_exclusion(): diff --git a/tools/server/webui/scripts/install-git-hooks.sh b/tools/server/webui/scripts/install-git-hooks.sh deleted file mode 100755 index 8aa1014ba36..00000000000 --- a/tools/server/webui/scripts/install-git-hooks.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# Script to install pre-commit hook for webui -# Pre-commit: formats, checks, and builds webui - -REPO_ROOT=$(git rev-parse --show-toplevel) -PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit" - -echo "Installing pre-commit hook for webui..." - -# Create the pre-commit hook -cat > "$PRE_COMMIT_HOOK" << 'EOF' -#!/bin/bash - -# Check if there are any changes in the webui directory -if git diff --cached --name-only | grep -q "^tools/server/webui/"; then - REPO_ROOT=$(git rev-parse --show-toplevel) - cd "$REPO_ROOT/tools/server/webui" - - # Check if package.json exists - if [ ! -f "package.json" ]; then - echo "Error: package.json not found in tools/server/webui" - exit 1 - fi - - echo "Formatting and checking webui code..." - - # Run the format command - npm run format - if [ $? -ne 0 ]; then - echo "Error: npm run format failed" - exit 1 - fi - - # Run the lint command - npm run lint - if [ $? -ne 0 ]; then - echo "Error: npm run lint failed" - exit 1 - fi - - # Run the check command - npm run check - if [ $? -ne 0 ]; then - echo "Error: npm run check failed" - exit 1 - fi - - echo "โœ… Webui code formatted and checked successfully" - - # Build the webui - echo "Building webui..." - npm run build - if [ $? -ne 0 ]; then - echo "โŒ npm run build failed" - exit 1 - fi - - echo "โœ… Webui built successfully" -fi - -exit 0 -EOF - -# Make hook executable -chmod +x "$PRE_COMMIT_HOOK" - -if [ $? -eq 0 ]; then - echo "โœ… Git hook installed successfully!" - echo " Pre-commit: $PRE_COMMIT_HOOK" - echo "" - echo "The hook will automatically:" - echo " โ€ข Format, lint and check webui code before commits" - echo " โ€ข Build webui" -else - echo "โŒ Failed to make hook executable" - exit 1 -fi diff --git a/tools/server/webui/scripts/post-build.sh b/tools/server/webui/scripts/post-build.sh deleted file mode 100755 index 55e46d5d5c6..00000000000 --- a/tools/server/webui/scripts/post-build.sh +++ /dev/null @@ -1,3 +0,0 @@ -rm -rf ../public/_app; -rm ../public/favicon.svg; -rm -f ../public/index.html.gz; # deprecated, but may still be generated by older versions of the build process diff --git a/tools/server/webui/src/lib/constants/api-endpoints.ts b/tools/server/webui/src/lib/constants/api-endpoints.ts deleted file mode 100644 index f89ebe42130..00000000000 --- a/tools/server/webui/src/lib/constants/api-endpoints.ts +++ /dev/null @@ -1,13 +0,0 @@ -export const API_MODELS = { - LIST: '/v1/models', - LOAD: '/models/load', - UNLOAD: '/models/unload' -}; - -export const API_TOOLS = { - LIST: '/tools', - EXECUTE: '/tools' -}; - -/** CORS proxy endpoint path */ -export const CORS_PROXY_ENDPOINT = '/cors-proxy'; diff --git a/tools/server/webui/src/lib/constants/localstorage-keys.ts b/tools/server/webui/src/lib/constants/localstorage-keys.ts deleted file mode 100644 index a04194c46f6..00000000000 --- a/tools/server/webui/src/lib/constants/localstorage-keys.ts +++ /dev/null @@ -1,6 +0,0 @@ -export const ALWAYS_ALLOWED_TOOLS_LOCALSTORAGE_KEY = 'LlamaCppWebui.alwaysAllowedTools'; -export const CONFIG_LOCALSTORAGE_KEY = 'LlamaCppWebui.config'; -export const DISABLED_TOOLS_LOCALSTORAGE_KEY = 'LlamaCppWebui.disabledTools'; -export const FAVORITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favoriteModels'; -export const MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = 'LlamaCppWebui.mcpDefaultEnabled'; -export const USER_OVERRIDES_LOCALSTORAGE_KEY = 'LlamaCppWebui.userOverrides'; diff --git a/tools/server/webui/src/lib/hooks/is-mobile.svelte.ts b/tools/server/webui/src/lib/hooks/is-mobile.svelte.ts deleted file mode 100644 index 6454fc5b58a..00000000000 --- a/tools/server/webui/src/lib/hooks/is-mobile.svelte.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { DEFAULT_MOBILE_BREAKPOINT } from '$lib/constants'; -import { MediaQuery } from 'svelte/reactivity'; - -export class IsMobile extends MediaQuery { - constructor(breakpoint: number = DEFAULT_MOBILE_BREAKPOINT) { - super(`max-width: ${breakpoint - 1}px`); - } -} diff --git a/tools/server/webui/vitest-setup-client.ts b/tools/server/webui/vitest-setup-client.ts deleted file mode 100644 index 570b9f0e1a0..00000000000 --- a/tools/server/webui/vitest-setup-client.ts +++ /dev/null @@ -1,2 +0,0 @@ -/// -/// diff --git a/tools/ui/.env.example b/tools/ui/.env.example new file mode 100644 index 00000000000..9a995b746f6 --- /dev/null +++ b/tools/ui/.env.example @@ -0,0 +1,2 @@ +VITE_PUBLIC_APP_NAME='llama-ui' +# VITE_DEBUG='true' diff --git a/tools/server/webui/.gitignore b/tools/ui/.gitignore similarity index 93% rename from tools/server/webui/.gitignore rename to tools/ui/.gitignore index 051d884b08e..22ed6125f4c 100644 --- a/tools/server/webui/.gitignore +++ b/tools/ui/.gitignore @@ -25,4 +25,4 @@ vite.config.ts.timestamp-* *storybook.log storybook-static -*.code-workspace \ No newline at end of file +*.code-workspace diff --git a/tools/server/webui/.npmrc b/tools/ui/.npmrc similarity index 100% rename from tools/server/webui/.npmrc rename to tools/ui/.npmrc diff --git a/tools/server/webui/.prettierignore b/tools/ui/.prettierignore similarity index 64% rename from tools/server/webui/.prettierignore rename to tools/ui/.prettierignore index 7d74fe24687..14b01e961bd 100644 --- a/tools/server/webui/.prettierignore +++ b/tools/ui/.prettierignore @@ -7,3 +7,9 @@ bun.lockb # Miscellaneous /static/ + +# Build output +/dist/ +/build/ +/.svelte-kit/ +test-results diff --git a/tools/server/webui/.prettierrc b/tools/ui/.prettierrc similarity index 100% rename from tools/server/webui/.prettierrc rename to tools/ui/.prettierrc diff --git a/tools/server/webui/.storybook/decorators/ModeWatcherDecorator.svelte b/tools/ui/.storybook/decorators/ModeWatcherDecorator.svelte similarity index 100% rename from tools/server/webui/.storybook/decorators/ModeWatcherDecorator.svelte rename to tools/ui/.storybook/decorators/ModeWatcherDecorator.svelte diff --git a/tools/server/webui/.storybook/decorators/TooltipProviderDecorator.svelte b/tools/ui/.storybook/decorators/TooltipProviderDecorator.svelte similarity index 100% rename from tools/server/webui/.storybook/decorators/TooltipProviderDecorator.svelte rename to tools/ui/.storybook/decorators/TooltipProviderDecorator.svelte diff --git a/tools/server/webui/.storybook/main.ts b/tools/ui/.storybook/main.ts similarity index 100% rename from tools/server/webui/.storybook/main.ts rename to tools/ui/.storybook/main.ts diff --git a/tools/server/webui/.storybook/preview.ts b/tools/ui/.storybook/preview.ts similarity index 100% rename from tools/server/webui/.storybook/preview.ts rename to tools/ui/.storybook/preview.ts diff --git a/tools/server/webui/.storybook/vitest.setup.ts b/tools/ui/.storybook/vitest.setup.ts similarity index 100% rename from tools/server/webui/.storybook/vitest.setup.ts rename to tools/ui/.storybook/vitest.setup.ts diff --git a/tools/ui/CMakeLists.txt b/tools/ui/CMakeLists.txt new file mode 100644 index 00000000000..60d9020da38 --- /dev/null +++ b/tools/ui/CMakeLists.txt @@ -0,0 +1,104 @@ +set(TARGET llama-ui) + +set(LLAMA_UI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets") + +# Backward compat: forward old var to new one +if(DEFINED LLAMA_BUILD_WEBUI) + set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI}) + message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead") +endif() +if(DEFINED LLAMA_USE_PREBUILT_WEBUI) + set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI}) + message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead") +endif() +if(DEFINED LLAMA_WEBUI_HF_BUCKET) + set(LLAMA_UI_HF_BUCKET ${LLAMA_WEBUI_HF_BUCKET}) + message(DEPRECATION "LLAMA_WEBUI_HF_BUCKET is deprecated, use LLAMA_UI_HF_BUCKET instead") +endif() + +# Resolve HF asset version: explicit env var > derived from build number > unset +if(DEFINED ENV{HF_WEBUI_VERSION}) + set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}") + message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead") +elseif(DEFINED ENV{HF_UI_VERSION}) + set(HF_UI_VERSION "$ENV{HF_UI_VERSION}") +elseif(DEFINED LLAMA_BUILD_NUMBER) + set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}") +else() + set(HF_UI_VERSION "") +endif() + +if(NOT "${HF_UI_VERSION}" STREQUAL "" AND NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$") + message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$") +endif() + +set(UI_CPP "${CMAKE_CURRENT_BINARY_DIR}/ui.cpp") +set(UI_H "${CMAKE_CURRENT_BINARY_DIR}/ui.h") + +if(CMAKE_CROSSCOMPILING) + find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH) + if(NOT HOST_CXX_COMPILER) + message(FATAL_ERROR "UI: no host C++ compiler (g++/clang++) found to build llama-ui-embed; set -DHOST_CXX_COMPILER=") + endif() + message(STATUS "UI: building llama-ui-embed with host compiler ${HOST_CXX_COMPILER}") + + if(CMAKE_HOST_WIN32) + set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed-host.exe") + else() + set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed-host") + endif() + + add_custom_command( + OUTPUT "${LLAMA_UI_EMBED_EXE}" + COMMAND "${HOST_CXX_COMPILER}" -O2 -std=c++17 + -o "${LLAMA_UI_EMBED_EXE}" "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp" + COMMENT "Building llama-ui-embed (host)" + VERBATIM + ) + + # phony target to tie it into the dependency graph + add_custom_target(llama-ui-embed DEPENDS "${LLAMA_UI_EMBED_EXE}") +else() + add_executable(llama-ui-embed embed.cpp) + target_compile_features(llama-ui-embed PRIVATE cxx_std_17) + set_target_properties(llama-ui-embed PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + ) + set(LLAMA_UI_EMBED_EXE "$") +endif() + +# Run the provisioning script every build so source changes in tools/ui/ are +# always picked up. The script uses copy_if_different for ui.cpp/ui.h, so the +# library only recompiles when contents actually change. +add_custom_target(llama-ui-assets ALL + BYPRODUCTS ${UI_CPP} ${UI_H} + COMMAND ${CMAKE_COMMAND} + "-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}" + "-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}" + "-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}" + "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}" + "-DHF_VERSION=${HF_UI_VERSION}" + "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}" + "-DBUILD_UI=${LLAMA_BUILD_UI}" + "-DLLAMA_UI_EMBED=${LLAMA_UI_EMBED_EXE}" + -P "${PROJECT_SOURCE_DIR}/scripts/ui-assets.cmake" + COMMENT "Provisioning UI assets" + VERBATIM +) + +add_dependencies(llama-ui-assets llama-ui-embed) + +set_source_files_properties(${UI_CPP} ${UI_H} PROPERTIES GENERATED TRUE) + +add_library(${TARGET} STATIC ${UI_CPP} ${UI_H}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) +add_dependencies(${TARGET} llama-ui-assets) + +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${TARGET} PUBLIC + ${CMAKE_CURRENT_BINARY_DIR} +) diff --git a/tools/server/webui/README.md b/tools/ui/README.md similarity index 95% rename from tools/server/webui/README.md rename to tools/ui/README.md index 40742b00e1a..4334f968162 100644 --- a/tools/server/webui/README.md +++ b/tools/ui/README.md @@ -2,7 +2,7 @@ A modern, feature-rich web interface for llama-server built with SvelteKit. This UI provides an intuitive chat interface with advanced file handling, conversation management, and comprehensive model interaction capabilities. -The WebUI supports two server operation modes: +Llama UI supports two server operation modes: - **MODEL mode** - Single model operation (standard llama-server) - **ROUTER mode** - Multi-model operation with dynamic model loading/unloading @@ -88,7 +88,7 @@ The WebUI supports two server operation modes: ### 1. Install Dependencies ```bash -cd tools/server/webui +cd tools/ui npm install ``` @@ -112,7 +112,7 @@ npm run dev This starts: -- **Vite dev server** at `http://localhost:5173` - The main WebUI +- **Vite dev server** at `http://localhost:5173` - The main UI frontend app - **Storybook** at `http://localhost:6006` - Component documentation The Vite dev server proxies API requests to `http://localhost:8080` (default llama-server port): @@ -186,7 +186,7 @@ npm run build The build process: 1. **Vite Build** - Bundles all TypeScript, Svelte, and CSS -2. **Static Adapter** - Outputs to `../public` (llama-server's static file directory) +2. **Static Adapter** - Outputs to `../../build/tools/ui/dist` (llama-server's static file directory) 3. **Post-Build Script** - Cleans up intermediate files 4. **Custom Plugin** - Creates `index.html` with: - Inlined favicon as base64 @@ -194,7 +194,7 @@ The build process: - Deterministic output (zeroed timestamps) ```text -tools/server/webui/ โ†’ build โ†’ tools/server/public/ +tools/ui/ โ†’ build โ†’ build/tools/ui/dist/ โ”œโ”€โ”€ src/ โ”œโ”€โ”€ index.html (served by llama-server) โ”œโ”€โ”€ static/ โ””โ”€โ”€ (favicon inlined) โ””โ”€โ”€ ... @@ -205,8 +205,8 @@ tools/server/webui/ โ†’ build โ†’ tools/server/public/ ```javascript // svelte.config.js adapter: adapter({ - pages: '../public', // Output directory - assets: '../public', // Static assets + pages: '../../build/tools/ui/dist', // Output directory + assets: '../../build/tools/ui/dist', // Static assets fallback: 'index.html', // SPA fallback strict: true }), @@ -217,20 +217,19 @@ output: { ### Integration with llama-server -The WebUI is embedded directly into the llama-server binary: +llama-ui is embedded directly into the llama-server binary: -1. `npm run build` outputs `index.html` to `tools/server/public/` +1. `npm run build` outputs `index.html` to `build/tools/ui/dist/` 2. llama-server compiles this into the binary at build time -3. When accessing `/`, llama-server serves the gzipped HTML -4. All assets are inlined (CSS, JS, fonts, favicon) +3. When accessing `/`, llama-server serves the bundled HTML -This results in a **single portable binary** with the full WebUI included. +This results in a **single portable binary** with the full Llama UI included. --- ## Architecture -The WebUI follows a layered architecture with unidirectional data flow: +Llama UI follows a layered architecture with unidirectional data flow: ```text Routes โ†’ Components โ†’ Hooks โ†’ Stores โ†’ Services โ†’ Storage/API @@ -659,7 +658,7 @@ npm run check # TypeScript type checking ## Project Structure ```text -tools/server/webui/ +tools/ui/ โ”œโ”€โ”€ src/ โ”‚ โ”œโ”€โ”€ lib/ โ”‚ โ”‚ โ”œโ”€โ”€ components/ # UI components (app/, ui/) @@ -682,6 +681,6 @@ tools/server/webui/ ## Related Documentation -- [llama.cpp Server README](../README.md) - Full server documentation -- [Multimodal Documentation](../../../docs/multimodal.md) - Image and audio support -- [Function Calling](../../../docs/function-calling.md) - Tool use capabilities +- [llama.cpp Server README](../server/README.md) - Full server documentation +- [Multimodal Documentation](../../docs/multimodal.md) - Image and audio support +- [Function Calling](../../docs/function-calling.md) - Tool use capabilities diff --git a/tools/server/webui/components.json b/tools/ui/components.json similarity index 100% rename from tools/server/webui/components.json rename to tools/ui/components.json diff --git a/tools/server/webui/docs/architecture/high-level-architecture-simplified.md b/tools/ui/docs/architecture/high-level-architecture-simplified.md similarity index 100% rename from tools/server/webui/docs/architecture/high-level-architecture-simplified.md rename to tools/ui/docs/architecture/high-level-architecture-simplified.md diff --git a/tools/server/webui/docs/architecture/high-level-architecture.md b/tools/ui/docs/architecture/high-level-architecture.md similarity index 100% rename from tools/server/webui/docs/architecture/high-level-architecture.md rename to tools/ui/docs/architecture/high-level-architecture.md diff --git a/tools/server/webui/docs/flows/chat-flow.md b/tools/ui/docs/flows/chat-flow.md similarity index 100% rename from tools/server/webui/docs/flows/chat-flow.md rename to tools/ui/docs/flows/chat-flow.md diff --git a/tools/server/webui/docs/flows/conversations-flow.md b/tools/ui/docs/flows/conversations-flow.md similarity index 100% rename from tools/server/webui/docs/flows/conversations-flow.md rename to tools/ui/docs/flows/conversations-flow.md diff --git a/tools/server/webui/docs/flows/data-flow-simplified-model-mode.md b/tools/ui/docs/flows/data-flow-simplified-model-mode.md similarity index 100% rename from tools/server/webui/docs/flows/data-flow-simplified-model-mode.md rename to tools/ui/docs/flows/data-flow-simplified-model-mode.md diff --git a/tools/server/webui/docs/flows/data-flow-simplified-router-mode.md b/tools/ui/docs/flows/data-flow-simplified-router-mode.md similarity index 100% rename from tools/server/webui/docs/flows/data-flow-simplified-router-mode.md rename to tools/ui/docs/flows/data-flow-simplified-router-mode.md diff --git a/tools/server/webui/docs/flows/database-flow.md b/tools/ui/docs/flows/database-flow.md similarity index 100% rename from tools/server/webui/docs/flows/database-flow.md rename to tools/ui/docs/flows/database-flow.md diff --git a/tools/server/webui/docs/flows/mcp-flow.md b/tools/ui/docs/flows/mcp-flow.md similarity index 100% rename from tools/server/webui/docs/flows/mcp-flow.md rename to tools/ui/docs/flows/mcp-flow.md diff --git a/tools/server/webui/docs/flows/models-flow.md b/tools/ui/docs/flows/models-flow.md similarity index 100% rename from tools/server/webui/docs/flows/models-flow.md rename to tools/ui/docs/flows/models-flow.md diff --git a/tools/server/webui/docs/flows/server-flow.md b/tools/ui/docs/flows/server-flow.md similarity index 100% rename from tools/server/webui/docs/flows/server-flow.md rename to tools/ui/docs/flows/server-flow.md diff --git a/tools/server/webui/docs/flows/settings-flow.md b/tools/ui/docs/flows/settings-flow.md similarity index 98% rename from tools/server/webui/docs/flows/settings-flow.md rename to tools/ui/docs/flows/settings-flow.md index 40ad3bd94d7..260713a17b8 100644 --- a/tools/server/webui/docs/flows/settings-flow.md +++ b/tools/ui/docs/flows/settings-flow.md @@ -58,8 +58,8 @@ sequenceDiagram end end - alt serverStore.props has webuiSettings - settingsStore->>settingsStore: Apply webuiSettings from server + alt serverStore.props has uiSettings + settingsStore->>settingsStore: Apply uiSettings from server Note right of settingsStore: Server-provided UI settings
(e.g. showRawOutputSwitch) end diff --git a/tools/ui/embed.cpp b/tools/ui/embed.cpp new file mode 100644 index 00000000000..cf0d437fd6d --- /dev/null +++ b/tools/ui/embed.cpp @@ -0,0 +1,167 @@ +// llama-ui-embed: generate ui.cpp / ui.h that embed UI assets as C arrays. +// +// Usage: +// llama-ui-embed [ ]... + +#include +#include +#include +#include +#include +#include +#include +#include + +// Computes FNV-1a hash of the data +static uint64_t fnv_hash(const uint8_t * data, size_t len) { + const uint64_t fnv_prime = 0x100000001b3ULL; + uint64_t hash = 0xcbf29ce484222325ULL; + + for (size_t i = 0; i < len; ++i) { + hash ^= data[i]; + hash *= fnv_prime; + } + return hash; +} + +static bool read_file(const std::string & path, std::vector & out) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + fprintf(stderr, "embed: cannot open %s\n", path.c_str()); + return false; + } + const auto sz = f.tellg(); + if (sz < 0) { + return false; + } + f.seekg(0); + out.resize(static_cast(sz)); + if (sz > 0 && !f.read(reinterpret_cast(out.data()), sz)) { + return false; + } + return true; +} + +static void append_bytes_hex(std::string & out, const std::vector & bytes) { + static const char hex[] = "0123456789abcdef"; + out.reserve(out.size() + bytes.size() * 5); + for (unsigned char b : bytes) { + out += '0'; + out += 'x'; + out += hex[b >> 4]; + out += hex[b & 0xf]; + out += ','; + } +} + +static bool write_if_different(const std::string & path, const std::string & content) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (f) { + const auto sz = f.tellg(); + if (sz >= 0 && static_cast(sz) == content.size()) { + std::string existing(static_cast(sz), '\0'); + f.seekg(0); + if (sz == 0 || f.read(existing.data(), sz)) { + if (existing == content) { + return true; + } + } + } + } + + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out) { + fprintf(stderr, "embed: cannot write %s\n", path.c_str()); + return false; + } + if (!content.empty()) { + out.write(content.data(), static_cast(content.size())); + } + return out.good(); +} + +static std::string fmt(const char * pattern, ...) { + char tmp[512]; + va_list ap; + va_start(ap, pattern); + const int n = vsnprintf(tmp, sizeof(tmp), pattern, ap); + va_end(ap); + return (n > 0) ? std::string(tmp, static_cast(n)) : std::string(); +} + +int main(int argc, char ** argv) { + if (argc < 3 || ((argc - 3) % 2) != 0) { + fprintf(stderr, "usage: %s [ ]...\n", argv[0]); + return 1; + } + + const std::string out_cpp = argv[1]; + const std::string out_h = argv[2]; + const int n_assets = (argc - 3) / 2; + + std::string h; + h += "#pragma once\n\n#include \n\n"; + if (n_assets > 0) { + h += "#define LLAMA_UI_HAS_ASSETS 1\n\n"; + } + h += + "struct llama_ui_asset {\n" + " const char * name;\n" + " const unsigned char * data;\n" + " size_t size;\n" + " const char * etag;\n" + "};\n\n" + "const llama_ui_asset * llama_ui_find_asset(const char * name);\n"; + + std::string cpp; + cpp += "#include \"ui.h\"\n\n#include \n\n"; + + if (n_assets > 0) { + for (int i = 0; i < n_assets; i++) { + const char * path = argv[3 + i * 2 + 1]; + std::vector bytes; + if (!read_file(path, bytes)) { + return 1; + } + cpp += fmt("static const unsigned char asset_%d_data[] = {", i); + if (bytes.empty()) { + cpp += '0'; + } else { + append_bytes_hex(cpp, bytes); + } + const auto hash = fnv_hash(bytes.data(), bytes.size()); + + cpp += fmt("};\nstatic const size_t asset_%d_size = %zu;\n", + i, bytes.size()); + cpp += fmt("static const char asset_%d_etag[] = \"\\\"0x%016" PRIx64 "\\\"\";\n\n", + i, hash); + } + + cpp += "static const llama_ui_asset g_assets[] = {\n"; + for (int i = 0; i < n_assets; i++) { + cpp += fmt(" { \"%s\", asset_%d_data, asset_%d_size, asset_%d_etag },\n", + argv[3 + i * 2], i, i, i); + } + cpp += "};\n\n"; + + cpp += + "const llama_ui_asset * llama_ui_find_asset(const char * name) {\n" + " for (const auto & a : g_assets) {\n" + " if (strcmp(a.name, name) == 0) {\n" + " return &a;\n" + " }\n" + " }\n" + " return nullptr;\n" + "}\n"; + } else { + cpp += + "const llama_ui_asset * llama_ui_find_asset(const char *) {\n" + " return nullptr;\n" + "}\n"; + } + + bool ok = true; + ok = write_if_different(out_h, h) && ok; + ok = write_if_different(out_cpp, cpp) && ok; + return ok ? 0 : 1; +} diff --git a/tools/server/webui/eslint.config.js b/tools/ui/eslint.config.js similarity index 81% rename from tools/server/webui/eslint.config.js rename to tools/ui/eslint.config.js index cd20fb383a4..6b3b1b5c048 100644 --- a/tools/server/webui/eslint.config.js +++ b/tools/ui/eslint.config.js @@ -20,16 +20,17 @@ export default ts.config( prettier, ...svelte.configs.prettier, { - languageOptions: { - globals: { ...globals.browser, ...globals.node } - }, + languageOptions: { globals: { ...globals.browser, ...globals.node } }, rules: { // typescript-eslint strongly recommend that you do not use the no-undef lint rule on TypeScript projects. // see: https://typescript-eslint.io/troubleshooting/faqs/eslint/#i-get-errors-from-the-no-undef-rule-about-global-variables-not-being-defined-even-though-there-are-no-typescript-errors 'no-undef': 'off', 'svelte/no-at-html-tags': 'off', // This app uses hash-based routing (#/) where resolve() from $app/paths does not apply - 'svelte/no-navigation-without-resolve': 'off' + 'svelte/no-navigation-without-resolve': 'off', + + // Enforce empty line at end of file + 'eol-last': 'error' } }, { @@ -44,8 +45,8 @@ export default ts.config( } }, { - // Exclude Storybook files from main ESLint rules - ignores: ['.storybook/**/*'] + // Exclude generated build output and Storybook files from ESLint + ignores: ['dist/**', 'build/**', '.svelte-kit/**', 'test-results/**', '.storybook/**/*'] }, storybook.configs['flat/recommended'] ); diff --git a/tools/server/webui/package-lock.json b/tools/ui/package-lock.json similarity index 99% rename from tools/server/webui/package-lock.json rename to tools/ui/package-lock.json index bf23307b82c..4d012c81990 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/ui/package-lock.json @@ -2307,9 +2307,9 @@ } }, "node_modules/@sveltejs/kit": { - "version": "2.59.1", - "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz", - "integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==", + "version": "2.60.1", + "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz", + "integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==", "dev": true, "license": "MIT", "dependencies": { @@ -2318,7 +2318,7 @@ "@types/cookie": "^0.6.0", "acorn": "^8.14.1", "cookie": "^0.6.0", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.2", "kleur": "^4.1.5", "magic-string": "^0.30.5", @@ -4296,9 +4296,9 @@ } }, "node_modules/devalue": { - "version": "5.6.4", - "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.4.tgz", - "integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==", + "version": "5.8.1", + "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz", + "integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==", "license": "MIT" }, "node_modules/devlop": { @@ -4856,12 +4856,12 @@ } }, "node_modules/express-rate-limit": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz", - "integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==", + "version": "8.5.2", + "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz", + "integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==", "license": "MIT", "dependencies": { - "ip-address": "10.1.0" + "ip-address": "^10.2.0" }, "engines": { "node": ">= 16" @@ -4909,9 +4909,9 @@ "license": "MIT" }, "node_modules/fast-uri": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz", - "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz", + "integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==", "funding": [ { "type": "github", @@ -5541,9 +5541,9 @@ } }, "node_modules/hono": { - "version": "4.12.14", - "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.14.tgz", - "integrity": "sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==", + "version": "4.12.19", + "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz", + "integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==", "license": "MIT", "engines": { "node": ">=16.9.0" @@ -5722,9 +5722,9 @@ "license": "MIT" }, "node_modules/ip-address": { - "version": "10.1.0", - "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz", - "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==", + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz", + "integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==", "license": "MIT", "engines": { "node": ">= 12" @@ -6008,9 +6008,9 @@ } }, "node_modules/katex": { - "version": "0.16.22", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.22.tgz", - "integrity": "sha512-XCHRdUw4lf3SKBaJe4EvgqIuWwkPSo9XoeO8GjQW94Bp7TWv9hNhzZjZ+OH9yf1UmLygb7DIT5GSFQiyt16zYg==", + "version": "0.16.47", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.47.tgz", + "integrity": "sha512-Eeo8Ys1doU1z+x8AZsPpQu+p/QcZBI5PeOo7QGQdy2x2m0MU/hYagBbGOmXwr5KVbEfVuWv9LpnQWeehogurjg==", "dev": true, "funding": [ "https://opencollective.com/katex", @@ -9245,9 +9245,9 @@ } }, "node_modules/svelte": { - "version": "5.55.1", - "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.1.tgz", - "integrity": "sha512-QjvU7EFemf6mRzdMGlAFttMWtAAVXrax61SZYHdkD6yoVGQ89VeyKfZD4H1JrV1WLmJBxWhFch9H6ig/87VGjw==", + "version": "5.55.7", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz", + "integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==", "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", @@ -9259,7 +9259,7 @@ "aria-query": "5.3.1", "axobject-query": "^4.1.0", "clsx": "^2.1.1", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.1", "esrap": "^2.2.4", "is-reference": "^3.0.3", @@ -10606,9 +10606,9 @@ "license": "ISC" }, "node_modules/ws": { - "version": "8.18.3", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", - "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", + "version": "8.20.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz", + "integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==", "dev": true, "license": "MIT", "engines": { diff --git a/tools/server/webui/package.json b/tools/ui/package.json similarity index 98% rename from tools/server/webui/package.json rename to tools/ui/package.json index 2338c38402a..5a1cec66645 100644 --- a/tools/server/webui/package.json +++ b/tools/ui/package.json @@ -5,7 +5,7 @@ "type": "module", "scripts": { "dev": "bash scripts/dev.sh", - "build": "vite build && ./scripts/post-build.sh", + "build": "vite build", "preview": "vite preview", "prepare": "svelte-kit sync || echo ''", "check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json", diff --git a/tools/server/webui/playwright.config.ts b/tools/ui/playwright.config.ts similarity index 70% rename from tools/server/webui/playwright.config.ts rename to tools/ui/playwright.config.ts index 26d3be535d1..178fd7ba856 100644 --- a/tools/server/webui/playwright.config.ts +++ b/tools/ui/playwright.config.ts @@ -2,7 +2,7 @@ import { defineConfig } from '@playwright/test'; export default defineConfig({ webServer: { - command: 'npm run build && http-server ../public -p 8181', + command: 'npm run build && http-server ../../build/tools/ui/dist -p 8181', port: 8181, timeout: 120000, reuseExistingServer: false diff --git a/tools/server/webui/scripts/dev.sh b/tools/ui/scripts/dev.sh similarity index 78% rename from tools/server/webui/scripts/dev.sh rename to tools/ui/scripts/dev.sh index 97c14b87328..7e1d3c15e5b 100644 --- a/tools/server/webui/scripts/dev.sh +++ b/tools/ui/scripts/dev.sh @@ -2,33 +2,37 @@ # Development script for llama-ui # -# This script starts the webui development servers (Storybook and Vite). +# This script starts the llama-ui development servers (Storybook and Vite). # Note: You need to start llama-server separately. # # Usage: # bash scripts/dev.sh # npm run dev -cd ../../../ +cd ../../ + +# Ensure node_modules are installed +if [ ! -d "tools/ui/node_modules" ]; then + echo "๐Ÿ“ฆ Installing npm dependencies..." + cd tools/ui && npm install && cd ../../ +fi # Check and install git hooks if missing check_and_install_hooks() { local hooks_missing=false # Check for required hooks - if [ ! -f ".git/hooks/pre-commit" ] || [ ! -f ".git/hooks/pre-push" ] || [ ! -f ".git/hooks/post-push" ]; then + if [ ! -f ".git/hooks/pre-commit" ] || [ ! -f ".git/hooks/pre-push" ]; then hooks_missing=true fi if [ "$hooks_missing" = true ]; then echo "๐Ÿ”ง Git hooks missing, installing them..." - cd tools/server/webui - if bash scripts/install-git-hooks.sh; then + if bash "$(dirname "$0")/git-hooks/install.sh"; then echo "โœ… Git hooks installed successfully" else echo "โš ๏ธ Failed to install git hooks, continuing anyway..." fi - cd ../../../ else echo "โœ… Git hooks already installed" fi @@ -48,7 +52,7 @@ trap cleanup SIGINT SIGTERM echo "๐Ÿš€ Starting development servers..." echo "๐Ÿ“ Note: Make sure to start llama-server separately if needed" -cd tools/server/webui +cd tools/ui # Use --insecure-http-parser to handle malformed HTTP responses from llama-server # (some responses have both Content-Length and Transfer-Encoding headers) storybook dev -p 6006 --ci & NODE_OPTIONS="--insecure-http-parser" vite dev --host 0.0.0.0 & diff --git a/tools/ui/scripts/git-hooks/install.sh b/tools/ui/scripts/git-hooks/install.sh new file mode 100755 index 00000000000..1a17dcad6d3 --- /dev/null +++ b/tools/ui/scripts/git-hooks/install.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +# Install git hooks for llama-ui +# Copies pre-commit and pre-push hooks into the repo's .git/hooks directory. + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)" +HOOKS_DIR="$REPO_ROOT/$(cd "$REPO_ROOT" && git rev-parse --git-path hooks)" + +# Verify package.json exists +if [ ! -f "$REPO_ROOT/tools/ui/package.json" ]; then + echo "โŒ package.json not found in tools/ui" + exit 1 +fi + +echo "Installing git hooks for llama-ui..." + +for hook in pre-commit pre-push; do + src="$SCRIPT_DIR/${hook}.sh" + dst="$HOOKS_DIR/$hook" + + if cp "$src" "$dst" && chmod +x "$dst"; then + echo " โœ… $hook" + else + echo " โŒ Failed to install $hook" + exit 1 + fi +done + +echo "" +echo "Pre-commit: format (staged) + type-check" +echo "Pre-push: lint + test" +echo "" +echo "Hooks stash unstaged changes temporarily and restore them after." +echo "Skip with: git commit --no-verify / git push --no-verify" diff --git a/tools/ui/scripts/git-hooks/pre-commit.sh b/tools/ui/scripts/git-hooks/pre-commit.sh new file mode 100755 index 00000000000..1fa83efde59 --- /dev/null +++ b/tools/ui/scripts/git-hooks/pre-commit.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# +# Pre-commit hook for llama-ui +# Runs: format (staged files only) + type-check +# Stashes unstaged changes temporarily and restores them after. + +# Only run when there are staged changes in tools/ui/ +if ! git diff --cached --name-only | grep -q "^tools/ui/"; then + exit 0 +fi + +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT/tools/ui" + +# Check that node_modules exists +if [ ! -d "node_modules" ]; then + echo "โŒ node_modules not found. Run 'npm install' first." + exit 1 +fi + +# Stash unstaged changes in tools/ui/ so they don't interfere +stash_name="pi-ui-precommit" +git stash push --keep-index -u -m "$stash_name" -- tools/ui/ 2>/dev/null || true + +echo "Running pre-commit checks for llama-ui..." + +# Format only staged files +staged_ui=$(git diff --cached --name-only -- tools/ui/) +if [ -n "$staged_ui" ]; then + echo "$staged_ui" | xargs npx --no-install prettier --write + format_ok=$? + # Re-stage formatted files + git add tools/ui/ +else + format_ok=0 +fi + +# Type-check the clean tree +npm run check +check_ok=$? + +# Restore stashed changes +if git stash list | grep -q "$stash_name"; then + git stash pop 2>/dev/null || true +fi + +if [ $format_ok -ne 0 ]; then + echo "โŒ Format failed" + exit 1 +fi +if [ $check_ok -ne 0 ]; then + echo "โŒ Type check failed" + exit 1 +fi + +echo "โœ… Pre-commit checks passed" +exit 0 diff --git a/tools/ui/scripts/git-hooks/pre-push.sh b/tools/ui/scripts/git-hooks/pre-push.sh new file mode 100755 index 00000000000..953d3a22430 --- /dev/null +++ b/tools/ui/scripts/git-hooks/pre-push.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Pre-push hook for llama-ui +# Runs: lint + test +# Ignores unstaged changes (stashes them temporarily and restores after). + +needs_check=false + +# Read refs from stdin: local_ref local_sha remote_ref remote_sha +while read local_ref local_sha remote_ref remote_sha; do + # New branch or force-push โ€” always check + if [ "$local_sha" = "0000000000000000000000000000000000000000" ] || \ + [ "$remote_sha" = "0000000000000000000000000000000000000000" ]; then + needs_check=true + continue + fi + + # Check for changes in tools/ui/ between remote and local + if git diff --name-only "$remote_sha...$local_sha" -- tools/ui/ | grep -q .; then + needs_check=true + fi +done + +if [ "$needs_check" = false ]; then + exit 0 +fi + +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT/tools/ui" + +# Check that node_modules exists +if [ ! -d "node_modules" ]; then + echo "โŒ node_modules not found. Run 'npm install' first." + exit 1 +fi + +# Stash unstaged changes so they don't interfere with checks +stash_name="pi-ui-prepush" +git stash push -u -m "$stash_name" -- tools/ui/ 2>/dev/null || true + +echo "Running pre-push checks for llama-ui..." + +# Lint +npm run lint +lint_ok=$? + +# Test +npm test +test_ok=$? + +# Restore stashed changes +if git stash list | grep -q "$stash_name"; then + git stash pop 2>/dev/null || true +fi + +if [ $lint_ok -ne 0 ]; then + echo "โŒ Lint failed" + exit 1 +fi +if [ $test_ok -ne 0 ]; then + echo "โŒ Tests failed" + exit 1 +fi + +echo "โœ… Pre-push checks passed" +exit 0 diff --git a/tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts similarity index 64% rename from tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts rename to tools/ui/scripts/vite-plugin-llama-cpp-build.ts index 0330a1dda12..01c714a241d 100644 --- a/tools/server/webui/scripts/vite-plugin-llama-cpp-build.ts +++ b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts @@ -1,4 +1,12 @@ -import { readFileSync, writeFileSync, existsSync, readdirSync, copyFileSync } from 'fs'; +import { + readFileSync, + writeFileSync, + existsSync, + readdirSync, + copyFileSync, + rmSync, + unlinkSync +} from 'fs'; import { resolve } from 'path'; import type { Plugin } from 'vite'; @@ -11,28 +19,28 @@ const GUIDE_FOR_FRONTEND = ` --> `.trim(); +const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist'; + export function llamaCppBuildPlugin(): Plugin { return { name: 'llamacpp:build', apply: 'build', closeBundle() { - // Ensure the SvelteKit adapter has finished writing to ../public setTimeout(() => { try { - const indexPath = resolve('../public/index.html'); + const outDir = resolve(OUTPUT_DIR); + const indexPath = resolve(outDir, 'index.html'); if (!existsSync(indexPath)) return; let content = readFileSync(indexPath, 'utf-8'); + // Inline favicon as base64 data URL const faviconPath = resolve('static/favicon.svg'); - if (existsSync(faviconPath)) { const faviconContent = readFileSync(faviconPath, 'utf-8'); const faviconBase64 = Buffer.from(faviconContent).toString('base64'); const faviconDataUrl = `data:image/svg+xml;base64,${faviconBase64}`; - content = content.replace(/href="[^"]*favicon\.svg"/g, `href="${faviconDataUrl}"`); - console.log('โœ“ Inlined favicon.svg as base64 data URL'); } @@ -48,17 +56,16 @@ export function llamaCppBuildPlugin(): Plugin { writeFileSync(indexPath, content, 'utf-8'); console.log('โœ“ Updated index.html'); - // Copy bundle.*.js -> ../public/bundle.js - const immutableDir = resolve('../public/_app/immutable'); - const bundleDir = resolve('../public/_app/immutable/assets'); + // Copy bundle.*.js -> bundle.js at output root + const immutableDir = resolve(outDir, '_app/immutable'); + const bundleDir = resolve(outDir, '_app/immutable/assets'); if (existsSync(immutableDir)) { const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/)); - if (jsFiles.length > 0) { - copyFileSync(resolve(immutableDir, jsFiles[0]), resolve('../public/bundle.js')); + copyFileSync(resolve(immutableDir, jsFiles[0]), resolve(outDir, 'bundle.js')); // Normalize __sveltekit_ to __sveltekit__ in bundle.js - const bundleJsPath = resolve('../public/bundle.js'); + const bundleJsPath = resolve(outDir, 'bundle.js'); let bundleJs = readFileSync(bundleJsPath, 'utf-8'); bundleJs = bundleJs.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__'); writeFileSync(bundleJsPath, bundleJs, 'utf-8'); @@ -66,17 +73,29 @@ export function llamaCppBuildPlugin(): Plugin { } } - // Copy bundle.*.css -> ../public/bundle.css + // Copy bundle.*.css -> bundle.css at output root if (existsSync(bundleDir)) { const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/)); - if (cssFiles.length > 0) { - copyFileSync(resolve(bundleDir, cssFiles[0]), resolve('../public/bundle.css')); + copyFileSync(resolve(bundleDir, cssFiles[0]), resolve(outDir, 'bundle.css')); console.log(`โœ“ Copied ${cssFiles[0]} -> bundle.css`); } } + + // Cleanup: remove _app directory, favicon.svg, and legacy index.html.gz + const appDir = resolve(outDir, '_app'); + if (existsSync(appDir)) { + rmSync(appDir, { recursive: true, force: true }); + console.log('โœ“ Removed _app directory'); + } + + const faviconOut = resolve(outDir, 'favicon.svg'); + if (existsSync(faviconOut)) { + unlinkSync(faviconOut); + console.log('โœ“ Removed favicon.svg'); + } } catch (error) { - console.error('Failed to update index.html:', error); + console.error('Failed to process build output:', error); } }, 100); } diff --git a/tools/ui/sources.cmake b/tools/ui/sources.cmake new file mode 100644 index 00000000000..de9dbf78b79 --- /dev/null +++ b/tools/ui/sources.cmake @@ -0,0 +1,15 @@ +# Inputs used to decide whether the npm build output is up-to-date. + +set(UI_SOURCE_GLOBS + src/* + static/* +) + +set(UI_SOURCE_FILES + package.json + package-lock.json + vite.config.ts + svelte.config.js + tsconfig.json + scripts/vite-plugin-llama-cpp-build.ts +) diff --git a/tools/server/webui/src/app.css b/tools/ui/src/app.css similarity index 94% rename from tools/server/webui/src/app.css rename to tools/ui/src/app.css index 6e29b70a35a..29b1d3c640b 100644 --- a/tools/server/webui/src/app.css +++ b/tools/ui/src/app.css @@ -1,5 +1,7 @@ @import 'tailwindcss'; - +@source '.'; +@plugin '@tailwindcss/forms'; +@plugin '@tailwindcss/typography'; @import 'tw-animate-css'; @custom-variant dark (&:is(.dark *)); @@ -39,6 +41,9 @@ --sidebar-ring: oklch(0.708 0 0); --code-background: oklch(0.985 0 0); --code-foreground: oklch(0.145 0 0); + --font-mono: + ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas, + 'Liberation Mono', Menlo, monospace; --layer-popover: 1000000; --chat-form-area-height: 8rem; @@ -171,6 +176,10 @@ *::-webkit-scrollbar-thumb:hover { background: hsl(var(--muted-foreground) / 0.5); } + + :where(code, pre, kbd, samp) { + font-family: var(--font-mono); + } } @layer utilities { diff --git a/tools/server/webui/src/app.d.ts b/tools/ui/src/app.d.ts similarity index 97% rename from tools/server/webui/src/app.d.ts rename to tools/ui/src/app.d.ts index f5af7323c8b..ec65952e9a7 100644 --- a/tools/server/webui/src/app.d.ts +++ b/tools/ui/src/app.d.ts @@ -39,6 +39,7 @@ import type { DatabaseMessage, DatabaseMessageExtra, DatabaseMessageExtraAudioFile, + DatabaseMessageExtraVideoFile, DatabaseMessageExtraImageFile, DatabaseMessageExtraTextFile, DatabaseMessageExtraPdfFile, @@ -102,6 +103,7 @@ declare global { DatabaseMessage, DatabaseMessageExtra, DatabaseMessageExtraAudioFile, + DatabaseMessageExtraVideoFile, DatabaseMessageExtraImageFile, DatabaseMessageExtraTextFile, DatabaseMessageExtraPdfFile, diff --git a/tools/server/webui/src/app.html b/tools/ui/src/app.html similarity index 100% rename from tools/server/webui/src/app.html rename to tools/ui/src/app.html diff --git a/tools/server/webui/src/lib/actions/fade-in-view.svelte.ts b/tools/ui/src/lib/actions/fade-in-view.svelte.ts similarity index 100% rename from tools/server/webui/src/lib/actions/fade-in-view.svelte.ts rename to tools/ui/src/lib/actions/fade-in-view.svelte.ts diff --git a/tools/server/webui/src/lib/components/app/SKILL.md b/tools/ui/src/lib/components/app/SKILL.md similarity index 100% rename from tools/server/webui/src/lib/components/app/SKILL.md rename to tools/ui/src/lib/components/app/SKILL.md diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte b/tools/ui/src/lib/components/app/actions/ActionIcon.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/ActionIcon.svelte rename to tools/ui/src/lib/components/app/actions/ActionIcon.svelte diff --git a/tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte b/tools/ui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte rename to tools/ui/src/lib/components/app/actions/ActionIconCopyToClipboard.svelte diff --git a/tools/server/webui/src/lib/components/app/actions/index.ts b/tools/ui/src/lib/components/app/actions/index.ts similarity index 100% rename from tools/server/webui/src/lib/components/app/actions/index.ts rename to tools/ui/src/lib/components/app/actions/index.ts diff --git a/tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte b/tools/ui/src/lib/components/app/badges/BadgeInfo.svelte similarity index 100% rename from tools/server/webui/src/lib/components/app/badges/BadgeInfo.svelte rename to tools/ui/src/lib/components/app/badges/BadgeInfo.svelte diff --git a/tools/server/webui/src/lib/components/app/badges/BadgesModality.svelte b/tools/ui/src/lib/components/app/badges/BadgesModality.svelte similarity index 73% rename from tools/server/webui/src/lib/components/app/badges/BadgesModality.svelte rename to tools/ui/src/lib/components/app/badges/BadgesModality.svelte index 841f1dd9fa9..d87184ea9bc 100644 --- a/tools/server/webui/src/lib/components/app/badges/BadgesModality.svelte +++ b/tools/ui/src/lib/components/app/badges/BadgesModality.svelte @@ -1,5 +1,5 @@ {#each modalities as modality (modality)} - {#if modality === ModelModality.VISION || modality === ModelModality.AUDIO} + {#if modality === ModelModality.VISION || modality === ModelModality.AUDIO || modality === ModelModality.VIDEO} - Vision + Vision (Image) + {:else if modality === ModelModality.VIDEO} +