diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile
index 955a2962ff4..8e830d46251 100644
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
+ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
## Build Image
diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml
new file mode 100644
index 00000000000..2a6642292e6
--- /dev/null
+++ b/.github/workflows/build-sycl.yml
@@ -0,0 +1,142 @@
+name: CI (sycl)
+
+on:
+ workflow_dispatch: # allows manual triggering
+ push:
+ branches:
+ - master
+ paths: [
+ '.github/workflows/build-sycl.yml',
+ '**/CMakeLists.txt',
+      '**/*.cmake',
+ '**/*.h',
+ '**/*.hpp',
+ '**/*.c',
+ '**/*.cpp'
+ ]
+
+ pull_request:
+ types: [opened, synchronize, reopened]
+ paths: [
+ '.github/workflows/build-sycl.yml',
+ 'ggml/src/ggml-sycl/**'
+ ]
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+ cancel-in-progress: true
+
+env:
+ GGML_NLOOP: 3
+ GGML_N_THREADS: 1
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
+
+jobs:
+
+ ubuntu-24-sycl:
+ strategy:
+ matrix:
+ build: [fp32, fp16]
+ include:
+ - build: fp32
+ fp16: OFF
+ - build: fp16
+ fp16: ON
+
+ runs-on: ubuntu-24.04
+
+ env:
+ ONEAPI_ROOT: /opt/intel/oneapi/
+ ONEAPI_INSTALLER_VERSION: "2025.3.3"
+
+ continue-on-error: true
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Use oneAPI Installation Cache
+ uses: actions/cache@v5
+ id: cache-sycl
+ with:
+ path: ${{ env.ONEAPI_ROOT }}
+ key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+ - name: Download & Install oneAPI
+ shell: bash
+ if: steps.cache-sycl.outputs.cache-hit != 'true'
+ run: |
+ cd /tmp
+ wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+ sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.21
+ with:
+ key: ubuntu-24-sycl-${{ matrix.build }}
+ evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+ - name: Build
+ id: cmake_build
+ run: |
+ source /opt/intel/oneapi/setvars.sh
+ cmake -B build \
+ -G "Ninja" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_SYCL=ON \
+ -DCMAKE_C_COMPILER=icx \
+ -DCMAKE_CXX_COMPILER=icpx \
+ -DLLAMA_OPENSSL=OFF \
+ -DGGML_NATIVE=OFF \
+ -DGGML_SYCL_F16=${{ matrix.fp16 }}
+ time cmake --build build --config Release -j $(nproc)
+
+ windows-latest-sycl:
+ runs-on: windows-2022
+
+ defaults:
+ run:
+ shell: bash
+
+ env:
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+ WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+ ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+ ONEAPI_INSTALLER_VERSION: "2025.3.3"
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Use oneAPI Installation Cache
+ uses: actions/cache@v5
+ id: cache-sycl
+ with:
+ path: ${{ env.ONEAPI_ROOT }}
+ key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+ - name: Download & Install oneAPI
+ shell: bash
+ if: steps.cache-sycl.outputs.cache-hit != 'true'
+ run: |
+ scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.21
+ with:
+ key: windows-latest-sycl
+ variant: ccache
+ evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+ # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+ - name: Build
+ id: cmake_build
+ run: examples/sycl/win-build-sycl.bat
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c7f00e3592b..21eb4d97b3e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -555,106 +555,6 @@ jobs:
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)
- ubuntu-22-sycl:
- runs-on: ubuntu-22.04
-
- continue-on-error: true
-
- steps:
- - uses: actions/checkout@v6
-
- - name: add oneAPI to apt
- shell: bash
- run: |
- cd /tmp
- wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
- - name: install oneAPI dpcpp compiler
- shell: bash
- run: |
- sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
-
- - name: install oneAPI MKL library
- shell: bash
- run: |
- sudo apt install intel-oneapi-mkl-devel
-
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: ccache
- uses: ggml-org/ccache-action@v1.2.21
- with:
- key: ubuntu-22-sycl
- evict-old-files: 1d
- save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
- - name: Build
- id: cmake_build
- run: |
- source /opt/intel/oneapi/setvars.sh
- cmake -B build \
- -DGGML_SYCL=ON \
- -DCMAKE_C_COMPILER=icx \
- -DCMAKE_CXX_COMPILER=icpx
- time cmake --build build --config Release -j $(nproc)
-
- ubuntu-22-sycl-fp16:
- runs-on: ubuntu-22.04
-
- continue-on-error: true
-
- steps:
- - uses: actions/checkout@v6
-
- - name: add oneAPI to apt
- shell: bash
- run: |
- cd /tmp
- wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
- - name: install oneAPI dpcpp compiler
- shell: bash
- run: |
- sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
-
- - name: install oneAPI MKL library
- shell: bash
- run: |
- sudo apt install intel-oneapi-mkl-devel
-
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: ccache
- uses: ggml-org/ccache-action@v1.2.21
- with:
- key: ubuntu-22-sycl-fp16
- evict-old-files: 1d
- save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
- - name: Build
- id: cmake_build
- run: |
- source /opt/intel/oneapi/setvars.sh
- cmake -B build \
- -G "Ninja" \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_SYCL=ON \
- -DCMAKE_C_COMPILER=icx \
- -DCMAKE_CXX_COMPILER=icpx \
- -DGGML_SYCL_F16=ON
- time cmake --build build --config Release -j $(nproc)
windows-latest:
runs-on: windows-2025
@@ -863,39 +763,6 @@ jobs:
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
- windows-latest-sycl:
- runs-on: windows-2022
-
- defaults:
- run:
- shell: bash
-
- env:
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
- WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
- ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: ccache
- uses: ggml-org/ccache-action@v1.2.21
- with:
- key: windows-latest-sycl
- variant: ccache
- evict-old-files: 1d
- save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
- - name: Install
- run: |
- scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
- # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
- - name: Build
- id: cmake_build
- run: examples/sycl/win-build-sycl.bat
windows-latest-hip:
runs-on: windows-2022
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index f1cc12cd452..89563c51c35 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -598,15 +598,29 @@ jobs:
shell: bash
env:
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+ ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
+ - name: Use oneAPI Installation Cache
+ uses: actions/cache@v5
+ id: cache-sycl
+ with:
+ path: ${{ env.ONEAPI_ROOT }}
+ key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+ - name: Download & Install oneAPI
+ shell: bash
+ if: steps.cache-sycl.outputs.cache-hit != 'true'
+ run: |
+ scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
@@ -614,10 +628,6 @@ jobs:
variant: ccache
evict-old-files: 1d
- - name: Install
- run: |
- scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
- name: Build
id: cmake_build
shell: cmd
@@ -670,6 +680,82 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
+ ubuntu-24-sycl:
+ strategy:
+ matrix:
+ build: [fp32, fp16]
+ include:
+ - build: fp32
+ fp16: OFF
+ - build: fp16
+ fp16: ON
+
+ runs-on: ubuntu-24.04
+
+ env:
+ ONEAPI_ROOT: /opt/intel/oneapi/
+ ONEAPI_INSTALLER_VERSION: "2025.3.3"
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Use oneAPI Installation Cache
+ uses: actions/cache@v5
+ id: cache-sycl
+ with:
+ path: ${{ env.ONEAPI_ROOT }}
+ key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+
+ - name: Download & Install oneAPI
+ shell: bash
+ if: steps.cache-sycl.outputs.cache-hit != 'true'
+ run: |
+ cd /tmp
+ wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+ sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.21
+ with:
+ key: ubuntu-24-sycl-${{ matrix.build }}
+ evict-old-files: 1d
+ save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+ - name: Build
+ id: cmake_build
+ run: |
+ source /opt/intel/oneapi/setvars.sh
+ cmake -B build \
+ -G "Ninja" \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DGGML_SYCL=ON \
+ -DCMAKE_C_COMPILER=icx \
+ -DCMAKE_CXX_COMPILER=icpx \
+ -DLLAMA_OPENSSL=OFF \
+ -DGGML_NATIVE=OFF \
+ -DGGML_SYCL_F16=${{ matrix.fp16 }}
+ time cmake --build build --config Release -j $(nproc)
+
+ - name: Determine tag name
+ id: tag
+ uses: ./.github/actions/get-tag-name
+
+ - name: Pack artifacts
+ id: pack_artifacts
+ run: |
+ cp LICENSE ./build/bin/
+ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v6
+ with:
+ path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+ name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+
ubuntu-22-rocm:
runs-on: ubuntu-22.04
@@ -1045,6 +1131,7 @@ jobs:
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-24-openvino
+ - ubuntu-24-sycl
- android-arm64
- macOS-cpu
- ios-xcode-build
@@ -1133,6 +1220,8 @@ jobs:
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
+ - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
+ - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
**Android:**
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
diff --git a/.gitignore b/.gitignore
index 15dc4014f43..6136524d75a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -145,3 +145,5 @@ poetry.toml
/.windsurf/
# emscripten
a.out.*
+
+AGENTS.local.md
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 090686b1531..93d5509e6af 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -746,7 +746,12 @@ def prepare_tensors(self):
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
- quant_config = json.load(f).get("quantization") or {}
+ hf_quant_config = json.load(f)
+ quant_config = hf_quant_config.get("quantization") or {}
+ producer = hf_quant_config.get("producer") or {}
+ producer_name = (producer.get("name") or "").lower()
+ if quant_method is None:
+ self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index d52c61acb66..1b86b3d4acb 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -31,6 +31,8 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.
## Recommended Release
+### Windows
+
The following releases are verified and recommended:
|Commit ID|Tag|Release|Verified Platform| Update date|
@@ -39,6 +41,13 @@ The following releases are verified and recommended:
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+### Ubuntu 24.04
+
+The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the llama.cpp binaries built with the SYCL backend. The target machine must already have the Intel GPU driver and oneAPI packages installed, and the oneAPI version must match the one used to build the package. For the exact version and installation steps, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
+
+It is recommended to run them inside an Intel oneAPI Docker container (for example, the image built from `.devops/intel.Dockerfile`).
+
+The FP32 and FP16 packages differ in accuracy and performance on LLMs. Please choose one according to your own test results.
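+
+For example, a typical run on a machine with the matching oneAPI runtime installed looks like this (model path is illustrative):
+
+```sh
+source /opt/intel/oneapi/setvars.sh
+./llama-cli -m models/llama-2-7b.Q4_0.gguf -ngl 99
+```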
## News
@@ -229,6 +238,7 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|Verified release|
|-|
+|2025.3.3 |
|2025.2.1|
|2025.1|
|2024.1|
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 48fbe208d90..52754e1b9d6 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads)
find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
- target_link_libraries(ggml-base PRIVATE m)
+ target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
endif()
endif()
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 27cb1683518..f17f7e2e0ce 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -931,13 +931,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
}
struct ggml_metal_event {
-    void * obj; // id<MTLEvent>
+    void * obj; // id<MTLSharedEvent>
atomic_int value;
};
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
     id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@@ -945,7 +945,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
}
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;
     id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
@@ -953,7 +953,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
}
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
-    id<MTLEvent> event = [dev->mtl_device newEvent];
+    id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
@@ -964,7 +964,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
}
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
-    id<MTLEvent> event = ev->obj;
+    id<MTLSharedEvent> event = ev->obj;
[event release];
free(ev);
@@ -973,14 +973,13 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
}
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
- @autoreleasepool {
-        id<MTLEvent> event = ev->obj;
-
-        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
- [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
- [cmd_buf commit];
- [cmd_buf waitUntilCompleted];
+    id<MTLSharedEvent> event = ev->obj;
+ const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
+ if (!res) {
+ GGML_ABORT("%s: failed to wait for event\n", __func__);
}
+
+ GGML_UNUSED(dev);
}
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3829da87903..36923160d72 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3808,6 +3808,51 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
}
}
+// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below.
+static bool ggml_sycl_mul_mat_id_mmvq_fused(
+ ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
+ const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
+{
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ if (ne12 != 1) return false;
+ if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false;
+ if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
+ if (!ggml_is_contiguous(src1)) return false;
+
+ // Reorder layout not supported; fall back.
+ const ggml_tensor_extra_gpu * src0_extra =
+ static_cast(src0->extra);
+ if (src0_extra && src0_extra->optimized_feature.reorder) return false;
+
+ const int64_t n_ids_per_group = ids->ne[0];
+ if (ids->ne[1] != 1) return false;
+ if (ne11 != 1 && ne11 != n_ids_per_group) return false;
+
+ const queue_ptr stream = ctx.stream();
+ const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING);
+ const int n_experts_used = (int) n_ids_per_group;
+ const int nrows = (int) src0->ne[1];
+
+    ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
+ (size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
+ char * src1_ddq = src1_q8_alloc.get();
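+    // Quantize the src1 activations to Q8_1 once up front; every selected expert dots against this buffer.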
+ quantize_row_q8_1_sycl(
+ (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
+ src1_padded_cols, stream);
+
+ const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
+ const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
+
+ return ggml_sycl_mul_mat_vec_q_id(
+ src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
+ (float *) dst->data, (int) ne10, nrows, n_experts_used,
+ /*expert_weight_stride=*/ src0->nb[2],
+ /*dst_row_stride=*/ dst->nb[1],
+ src1_row_stride, stream);
+}
+
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
ggml_tensor *dst) try {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -3823,6 +3868,12 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
const int64_t n_as = ne02;
const int64_t n_ids = ids->ne[0];
+ if (ne12 == 1) {
+ if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) {
+ return;
+ }
+ }
+
     std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data;
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 3a4577ecbbc..8fa2198f35a 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -1199,3 +1199,154 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
GGML_UNUSED(src1_ddf_i);
GGML_UNUSED(ctx);
}
+
+// src1_row_stride: 0 for shared src1 (gate/up proj), else per-expert stride (down proj).
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
+static void mul_mat_vec_q_moe(
+ const void * __restrict__ vx_base, const void * __restrict__ vy_base,
+ float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
+ const int ncols, const int nrows,
+ const size_t expert_weight_stride, const size_t dst_row_stride,
+ const size_t src1_row_stride,
+ const sycl::nd_item<3> & item_ct1) {
+
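+    // Each work-group slice along dim 1 handles one selected expert: ids_dev maps the
+    // slot index to the expert id, which selects that expert's weight matrix.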
+ const int expert_idx = item_ct1.get_group(1);
+ const int i02 = ids_dev[expert_idx];
+
+ const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
+ const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
+ float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
+
+ const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
+
+ if (row >= nrows) {
+ return;
+ }
+
+ const int blocks_per_row = ncols / qk;
+ constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi;
+
+ float tmp = 0.0f;
+
+ const block_q_t * x = (const block_q_t *) vx;
+ const block_q8_1 * y = (const block_q8_1 *) vy;
+
+ for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
+ const int ibx = row * blocks_per_row + i;
+ const int iby = i * (qk / QK8_1);
+
+ for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
+ const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr));
+ tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+ }
+ }
+
+#pragma unroll
+ for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+ }
+
+ if (item_ct1.get_local_id(2) == 0) {
+ dst[row] = tmp;
+ }
+}
+
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
+static void launch_mul_mat_vec_q_moe(
+ const void * vx_base, const void * vy, const int32_t * ids_dev,
+ float * dst_base, const int ncols, const int nrows, const int n_experts_used,
+ const size_t expert_weight_stride, const size_t dst_row_stride,
+ const size_t src1_row_stride,
+ dpct::queue_ptr stream) {
+ const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+ const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
+ const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
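+    // Grid layout: dim 1 walks the selected experts, dim 2 covers the weight rows in
+    // blocks of GGML_SYCL_MMV_Y, with one sub-group (WARP_SIZE lanes) per output row.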
+ stream->submit([&](sycl::handler & cgh) {
+ cgh.parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                mul_mat_vec_q_moe<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
+ vx_base, vy, dst_base, ids_dev, ncols, nrows,
+ expert_weight_stride, dst_row_stride, src1_row_stride, item);
+ });
+ });
+}
+
+bool ggml_sycl_mul_mat_vec_q_id(
+ enum ggml_type src0_type,
+ const void * vx_base,
+ const void * vy,
+ const int32_t * ids_dev,
+ float * dst_base,
+ int ncols,
+ int nrows,
+ int n_experts_used,
+ size_t expert_weight_stride,
+ size_t dst_row_stride,
+ size_t src1_row_stride,
+ dpct::queue_ptr stream) {
+ switch (src0_type) {
+ case GGML_TYPE_Q4_0:
+            launch_mul_mat_vec_q_moe<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q4_1:
+            launch_mul_mat_vec_q_moe<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q5_0:
+            launch_mul_mat_vec_q_moe<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q5_1:
+            launch_mul_mat_vec_q_moe<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q8_0:
+            launch_mul_mat_vec_q_moe<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q2_K:
+            launch_mul_mat_vec_q_moe<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q3_K:
+            launch_mul_mat_vec_q_moe<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q4_K:
+            launch_mul_mat_vec_q_moe<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q5_K:
+            launch_mul_mat_vec_q_moe<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_Q6_K:
+            launch_mul_mat_vec_q_moe<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_MXFP4:
+            launch_mul_mat_vec_q_moe<QK_MXFP4, QI_MXFP4, block_mxfp4, VDR_MXFP4_Q8_1_MMVQ, vec_dot_mxfp4_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ case GGML_TYPE_NVFP4:
+            launch_mul_mat_vec_q_moe<QK_NVFP4, QI_NVFP4, block_nvfp4, VDR_NVFP4_Q8_1_MMVQ, vec_dot_nvfp4_q8_1>(
+ vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
+ expert_weight_stride, dst_row_stride, src1_row_stride, stream);
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/ggml/src/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp
index 049b43d4535..d674dc1d61e 100644
--- a/ggml/src/ggml-sycl/mmvq.hpp
+++ b/ggml/src/ggml-sycl/mmvq.hpp
@@ -24,4 +24,20 @@ void ggml_sycl_op_mul_mat_vec_q(
const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream);
+// Requires standard (non-reorder) block layout for src0.
+// Returns false if src0_type isn't handled; caller should fall back.
+bool ggml_sycl_mul_mat_vec_q_id(
+ enum ggml_type src0_type,
+ const void * vx_base, // start of stacked expert weights
+ const void * vy, // pre-quantized src1 (Q8_1)
+ const int32_t * ids_dev, // device-side int32, length n_experts_used
+ float * dst_base,
+ int ncols,
+ int nrows,
+ int n_experts_used,
+ size_t expert_weight_stride, // bytes between experts in vx_base
+ size_t dst_row_stride, // bytes between dst rows
+ size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
+ dpct::queue_ptr stream);
+
#endif // GGML_SYCL_MMVQ_HPP
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
index 6593a9fe16b..efc5b8c97a7 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -281,6 +281,25 @@ struct ggml_webgpu_conv2d_pipeline_key_hash {
}
};
+/** Im2Col **/
+struct ggml_webgpu_im2col_pipeline_key {
+ ggml_type input_type;
+ ggml_type output_type;
+
+ bool operator==(const ggml_webgpu_im2col_pipeline_key & other) const {
+ return input_type == other.input_type && output_type == other.output_type;
+ }
+};
+
+struct ggml_webgpu_im2col_pipeline_key_hash {
+ size_t operator()(const ggml_webgpu_im2col_pipeline_key & key) const {
+ size_t seed = 0;
+ ggml_webgpu_hash_combine(seed, key.input_type);
+ ggml_webgpu_hash_combine(seed, key.output_type);
+ return seed;
+ }
+};
+
/** Gated Delta Net **/
struct ggml_webgpu_gated_delta_net_pipeline_key {
int type;
@@ -833,6 +852,8 @@ class ggml_webgpu_shader_lib {
soft_max_pipelines;
     std::unordered_map<ggml_webgpu_conv2d_pipeline_key, webgpu_pipeline, ggml_webgpu_conv2d_pipeline_key_hash>
conv2d_pipelines;
+    std::unordered_map<ggml_webgpu_im2col_pipeline_key, webgpu_pipeline, ggml_webgpu_im2col_pipeline_key_hash>
+ im2col_pipelines;
     std::unordered_map<ggml_webgpu_gated_delta_net_pipeline_key, webgpu_pipeline, ggml_webgpu_gated_delta_net_pipeline_key_hash>
         gated_delta_net_pipelines;

+    webgpu_pipeline get_im2col_pipeline(ggml_webgpu_shader_lib_context & context) {
+        ggml_webgpu_im2col_pipeline_key key = {};
+        key.input_type  = context.src1->type;
+ key.output_type = context.dst->type;
+
+ auto it = im2col_pipelines.find(key);
+ if (it != im2col_pipelines.end()) {
+ return it->second;
+ }
+
+        std::vector<std::string> defines;
+ std::string variant = "im2col";
+
+ auto push_type_defines = [&](const char * prefix, ggml_type type) {
+ std::string s_prefix = prefix;
+ if (type == GGML_TYPE_F32) {
+ defines.push_back(s_prefix + "_F32");
+ } else if (type == GGML_TYPE_F16) {
+ defines.push_back(s_prefix + "_F16");
+ } else {
+ GGML_ABORT("Unsupported type for IM2COL shader");
+ }
+ };
+
+ push_type_defines("INPUT", key.input_type);
+ push_type_defines("OUTPUT", key.output_type);
+
+ defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
+
+ auto processed = preprocessor.preprocess(wgsl_im2col, defines);
+ auto decisions = std::make_shared();
+ decisions->wg_size = context.max_wg_size;
+ webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
+ pipeline.context = decisions;
+ im2col_pipelines[key] = pipeline;
+ return im2col_pipelines[key];
+ }
+
private:
static webgpu_pipeline ggml_webgpu_create_pipeline(wgpu::Device & device,
std::string shader_code,
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 44e3bf82216..bcca2bd4627 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -979,25 +979,108 @@ static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
};
- uint32_t max_wg_size =
- std::min((uint32_t) WEBGPU_MAX_WG_SIZE, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupSizeX);
- uint32_t wg_size =
- std::min((uint32_t) ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup, max_wg_size);
-
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.src0 = src0;
shader_lib_ctx.src1 = src1;
shader_lib_ctx.dst = dst;
- shader_lib_ctx.max_wg_size = wg_size;
+ shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
webgpu_pipeline pipeline = ctx->shader_lib->get_conv2d_pipeline(shader_lib_ctx);
auto * decisions = static_cast(pipeline.context.get());
- uint32_t n_out = ggml_nelements(dst);
- uint32_t total_wg = CEIL_DIV(n_out, decisions->wg_size);
- uint32_t max_wg = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
- uint32_t wg_x = std::min(total_wg, max_wg);
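+    // If more work-groups are needed than one dispatch dimension allows, spill the remainder into y.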
+ uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
+ uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
+ uint32_t wg_y = CEIL_DIV(total_wg, wg_x);
+
+ return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
+}
+
+static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx,
+ ggml_tensor * src0,
+ ggml_tensor * src1,
+ ggml_tensor * dst) {
+ const int32_t s0 = ggml_get_op_params_i32(dst, 0);
+ const int32_t s1 = ggml_get_op_params_i32(dst, 1);
+ const int32_t p0 = ggml_get_op_params_i32(dst, 2);
+ const int32_t p1 = ggml_get_op_params_i32(dst, 3);
+ const int32_t d0 = ggml_get_op_params_i32(dst, 4);
+ const int32_t d1 = ggml_get_op_params_i32(dst, 5);
+ const bool is_2D = ggml_get_op_params_i32(dst, 6) == 1;
+
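+    // For the 1D case the H dimension collapses to 1 and the tensor indices shift down by one.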
+ const uint32_t KW = src0->ne[0];
+ const uint32_t KH = is_2D ? src0->ne[1] : 1;
+ const uint32_t IC = is_2D ? src0->ne[2] : src0->ne[1];
+
+ const uint32_t IW = src1->ne[0];
+ const uint32_t IH = is_2D ? src1->ne[1] : 1;
+ const uint32_t N = is_2D ? src1->ne[3] : src1->ne[2];
+
+ const uint32_t OW = dst->ne[1];
+ const uint32_t OH = is_2D ? dst->ne[2] : 1;
+
+ const uint32_t si0 = (uint32_t) (src1->nb[0] / ggml_type_size(src1->type));
+ const uint32_t si1 = is_2D ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) : 0;
+ const uint32_t si2 = is_2D ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) :
+ (uint32_t) (src1->nb[1] / ggml_type_size(src1->type));
+ const uint32_t si3 = is_2D ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) :
+ (uint32_t) (src1->nb[2] / ggml_type_size(src1->type));
+
+ const uint32_t so0 = (uint32_t) (dst->nb[0] / ggml_type_size(dst->type));
+ const uint32_t so1 = (uint32_t) (dst->nb[1] / ggml_type_size(dst->type));
+ const uint32_t so2 = is_2D ? (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)) : 0;
+ const uint32_t so3 = is_2D ? (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)) :
+ (uint32_t) (dst->nb[2] / ggml_type_size(dst->type));
+
+    std::vector<uint32_t> params = {
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
+ (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+
+ si0,
+ si1,
+ si2,
+ si3,
+ so0,
+ so1,
+ so2,
+ so3,
+
+ KW,
+ KH,
+ IC,
+
+ IW,
+ IH,
+ N,
+
+ OW,
+ OH,
+
+ (uint32_t) s0,
+ (uint32_t) s1,
+ (uint32_t) p0,
+ (uint32_t) p1,
+ (uint32_t) d0,
+ (uint32_t) d1,
+ };
+
+    std::vector<wgpu::BindGroupEntry> entries = {
+ ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1),
+ ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst),
+ };
+
+ ggml_webgpu_shader_lib_context shader_lib_ctx = {};
+ shader_lib_ctx.src0 = src0;
+ shader_lib_ctx.src1 = src1;
+ shader_lib_ctx.dst = dst;
+ shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
+
+ webgpu_pipeline pipeline = ctx->shader_lib->get_im2col_pipeline(shader_lib_ctx);
+
+ auto * decisions = static_cast(pipeline.context.get());
+
+ uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
+ uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
uint32_t wg_y = CEIL_DIV(total_wg, wg_x);
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
@@ -1988,8 +2071,8 @@ static std::optional ggml_webgpu_rms_norm_mul(webgpu_context
GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1");
}
- bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
- (ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
+ bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
+ (ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src);
uint32_t offset_merged_rn_src = 0;
@@ -2689,6 +2772,8 @@ static std::optional ggml_webgpu_encode(webgpu_context ctx,
return ggml_webgpu_sum_rows(ctx, src0, node);
case GGML_OP_CONV_2D:
return ggml_webgpu_conv_2d(ctx, src0, src1, node);
+ case GGML_OP_IM2COL:
+ return ggml_webgpu_im2col(ctx, src0, src1, node);
default:
return std::nullopt;
}
@@ -3455,7 +3540,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
ggml_backend_webgpu_device_context * dev_ctx = (ggml_backend_webgpu_device_context *) dev->context;
webgpu_context webgpu_ctx = std::make_shared();
webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx;
- webgpu_ctx->shader_lib = std::make_unique(dev_ctx->webgpu_global_ctx->device);
+ webgpu_ctx->shader_lib = std::make_unique(dev_ctx->webgpu_global_ctx->device);
webgpu_ctx->param_arena.init(
webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES,
webgpu_ctx->global_ctx->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN,
@@ -3705,12 +3790,12 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
break;
}
// Head dimensions must fit in workgroup memory with minimum tile sizes
- size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
- const bool has_mask = op->src[3] != nullptr;
- const bool kv_direct = src1->type == GGML_TYPE_F16 &&
- (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 &&
- (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0;
- const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes(
+ size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize;
+ const bool has_mask = op->src[3] != nullptr;
+ const bool kv_direct = src1->type == GGML_TYPE_F16 &&
+ (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 &&
+ (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0;
+ const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes(
ctx->webgpu_global_ctx->capabilities.sg_mat_m, ctx->webgpu_global_ctx->capabilities.sg_mat_n,
(uint32_t) src0->ne[0], (uint32_t) src2->ne[0], has_mask, kv_direct);
if (min_bytes > limit_bytes) {
@@ -3802,6 +3887,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
break;
+ case GGML_OP_IM2COL:
+ supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
+ (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+ break;
case GGML_OP_SSM_CONV:
supports_op = op->type == GGML_TYPE_F32;
break;
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl
new file mode 100644
index 00000000000..386ebab879f
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl
@@ -0,0 +1,101 @@
+#include "common_decls.tmpl"
+enable f16;
+
+@group(0) @binding(0)
+#if defined(INPUT_F32)
+var<storage, read_write> input: array<f32>;
+#elif defined(INPUT_F16)
+var<storage, read_write> input: array<f16>;
+#endif
+
+@group(0) @binding(1)
+#if defined(OUTPUT_F32)
+var<storage, read_write> output: array<f32>;
+#elif defined(OUTPUT_F16)
+var<storage, read_write> output: array<f16>;
+#endif
+
+struct Params {
+ offset_i: u32,
+ offset_o: u32,
+
+ // element strides
+ si0: u32, si1: u32, si2: u32, si3: u32,
+ so0: u32, so1: u32, so2: u32, so3: u32,
+
+ KW: u32, KH: u32, IC: u32,
+ IW: u32, IH: u32, N: u32,
+ OW: u32, OH: u32,
+
+ // stride
+ s0: u32, s1: u32,
+ // padding
+ p0: u32, p1: u32,
+ // dilation
+ d0: u32, d1: u32,
+}
+
+@group(0) @binding(2)
+var<uniform> params: Params;
+
+fn load_input(idx: u32) -> f32 {
+ #if defined(INPUT_F32)
+ return input[idx];
+ #elif defined(INPUT_F16)
+ return f32(input[idx]);
+ #endif
+}
+
+fn store_output(idx: u32, val: f32) {
+ #if defined(OUTPUT_F32)
+ output[idx] = val;
+ #elif defined(OUTPUT_F16)
+ output[idx] = f16(val);
+ #endif
+}
+
+@compute @workgroup_size(WG_SIZE)
+fn main(
+    @builtin(global_invocation_id) gid: vec3<u32>,
+    @builtin(num_workgroups) num_wg: vec3<u32>
+) {
+
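+    // Flatten the 2D dispatch (x, y) into a single linear output element index.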
+ let threads_per_group = u32(WG_SIZE);
+ let i_out = gid.x + (num_wg.x * threads_per_group) * gid.y;
+ let K = params.KW * params.KH * params.IC;
+ let M = params.OW * params.OH;
+ let total = K * M * params.N;
+
+ if (i_out >= total) {
+ return;
+ }
+
+ // decode (k, m, n)
+ var i = i_out;
+ let n = i / (K * M);
+ i = i % (K * M);
+ let m = i / K;
+ let k = i % K;
+
+ // decode (oh, ow)
+ let oh = m / params.OW;
+ let ow = m % params.OW;
+
+ // decode (kw, kh, ic)
+ let kw = k % params.KW;
+ let tmp = k / params.KW;
+ let kh = tmp % params.KH;
+ let ic = tmp / params.KH;
+
+ let iw_i32 = i32(ow * params.s0 + kw * params.d0) - i32(params.p0);
+ let ih_i32 = i32(oh * params.s1 + kh * params.d1) - i32(params.p1);
+
+ if (iw_i32 >= 0 && iw_i32 < i32(params.IW) && ih_i32 >= 0 && ih_i32 < i32(params.IH)) {
+ let iw = u32(iw_i32);
+ let ih = u32(ih_i32);
+ let in_idx = params.offset_i + iw * params.si0 + ih * params.si1 + ic * params.si2 + n * params.si3;
+ store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, load_input(in_idx));
+ } else {
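+        // The sampled position falls in the padding region; write zero.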
+ store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, 0.0);
+ }
+}