LongLeCE · pull · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile
@@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
 ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
 ARG UBUNTU_VERSION=24.04
 
-# Optional proxy build arguments - empty by default
+# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
+ARG IGC_VERSION=v2.30.1
+ARG IGC_VERSION_FULL=2_2.30.1+20950
+ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
+ARG IGDGMM_VERSION=22.9.0
+
+# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
+ARG NPU_DRIVER_VERSION=v1.32.0
+ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
+ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
+
+# Optional proxy build arguments
 ARG http_proxy=
 ARG https_proxy=
 
@@ -78,13 +90,46 @@ ARG http_proxy
 ARG https_proxy
 
 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl \
+    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
     && apt autoremove -y \
     && apt clean -y \
     && rm -rf /tmp/* /var/tmp/* \
     && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
     && find /var/cache -type f -delete
 
+# Install GPU drivers (IGC, compute-runtime OpenCL / Level Zero, GMM)
+# Only the .deb packages are fetched; the -dbgsym .ddeb debug-symbol packages are not needed in the image.
+ARG IGC_VERSION
+ARG IGC_VERSION_FULL
+ARG COMPUTE_RUNTIME_VERSION
+ARG COMPUTE_RUNTIME_VERSION_FULL
+ARG IGDGMM_VERSION
+RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
+    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
+    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+    && dpkg --install *.deb \
+    && rm -rf /tmp/neo/
+
+# Install NPU drivers
+ARG NPU_DRIVER_VERSION
+ARG NPU_DRIVER_FULL
+ARG LIBZE1_VERSION
+RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
+    && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
+    && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
+    && dpkg --install *.deb \
+    && rm -rf /tmp/npu/
+
+# Install the Level Zero loader (libze1) from a pinned PPA snapshot
+RUN cd /tmp \
+    && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
+    && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
+    && rm libze1_${LIBZE1_VERSION}_amd64.deb
+
 COPY --from=build /app/lib/ /app/
 
 ### Full (all binaries)

diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml
@@ -0,0 +1,120 @@
+name: CI (openvino)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push: # master only, and only when build-relevant files change
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-openvino.yml',
+      '**/CMakeLists.txt',
+      '**/*.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+    ]
+
+  pull_request: # only when this workflow or the OpenVINO backend sources change
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-openvino.yml',
+      'ggml/src/ggml-openvino/**'
+    ]
+
+concurrency: # pull-request runs share one group per ref; other events fall back to the unique run_id
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true # a newer run in the same group cancels the one in progress
+
+env: # workflow-level environment, applied to every job in this file
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+
+jobs:
+  ubuntu-24-openvino: # builds and tests the OpenVINO backend once per matrix entry (CPU, GPU)
+    name: ubuntu-24-openvino-${{ matrix.openvino_device }}
+
+    concurrency: # one group per variant and branch; a run already in progress is not cancelled
+      group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
+      cancel-in-progress: false
+
+    strategy:
+      matrix:
+        include:
+          - variant: cpu
+            runner: '"ubuntu-24.04"'
+            openvino_device: "CPU"
+          - variant: gpu
+            runner: '["self-hosted","Linux","Intel","OpenVINO"]'
+            openvino_device: "GPU"
+
+    runs-on: ${{ fromJSON(matrix.runner) }} # matrix.runner is JSON text: a single label or a list of labels
+
+    env:
+      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+      OPENVINO_VERSION_MAJOR: "2026.0"
+      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        if: runner.environment == 'github-hosted' # only on GitHub-hosted runners
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} # true only for pushes to master
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
+          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+
+      - name: Use OpenVINO Toolkit Cache
+        if: runner.environment == 'github-hosted' # only on GitHub-hosted runners
+        uses: actions/cache@v5
+        id: cache-openvino
+        with:
+          path: ./openvino_toolkit
+          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+      - name: Setup OpenVINO Toolkit
+        if: steps.cache-openvino.outputs.cache-hit != 'true' # also true when the cache step above was skipped
+        uses: ./.github/actions/linux-setup-openvino
+        with:
+          path: ./openvino_toolkit
+          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+          version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+      - name: Install OpenVINO dependencies
+        run: |
+          cd ./openvino_toolkit
+          chmod +x ./install_dependencies/install_openvino_dependencies.sh
+          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source ./openvino_toolkit/setupvars.sh
+          cmake -B build/ReleaseOV -G Ninja \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENVINO=ON
+          time cmake --build build/ReleaseOV --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        # TODO: fix and re-enable the `test-llama-archs` test below
+        run: |
+          cd ${{ github.workspace }}
+          if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
+            export GGML_OPENVINO_DEVICE=GPU
+          fi
+          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
@@ -265,6 +265,10 @@ jobs:
   ggml-ci-intel-openvino-gpu-low-perf:
     runs-on: [self-hosted, Linux, Intel, OpenVINO]
 
+    concurrency:
+      group: openvino-gpu-${{ github.head_ref || github.ref }}
+      cancel-in-progress: false
+
     env:
       # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
       OPENVINO_VERSION_MAJOR: "2026.0"

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -656,86 +656,6 @@ jobs:
             -DGGML_SYCL_F16=ON
           time cmake --build build --config Release -j $(nproc)
 
-  ubuntu-24-openvino:
-      name: ubuntu-24-openvino-${{ matrix.openvino_device }}
-      strategy:
-        matrix:
-          include:
-            - variant: cpu
-              runner: '"ubuntu-24.04"'
-              openvino_device: "CPU"
-            - variant: gpu
-              runner: '["self-hosted","Linux","X64","Intel"]'
-              openvino_device: "GPU"
-
-      runs-on: ${{ fromJSON(matrix.runner) }}
-
-      env:
-        # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-        OPENVINO_VERSION_MAJOR: "2026.0"
-        OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
-      steps:
-        - name: Clone
-          id: checkout
-          uses: actions/checkout@v6
-
-        - name: ccache
-          if: runner.environment == 'github-hosted'
-          uses: ggml-org/ccache-action@v1.2.21
-          with:
-            key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
-            evict-old-files: 1d
-            save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-        - name: Dependencies
-          id: depends
-          run: |
-            sudo apt-get update
-            sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
-            sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
-
-        - name: Use OpenVINO Toolkit Cache
-          if: runner.environment == 'github-hosted'
-          uses: actions/cache@v5
-          id: cache-openvino
-          with:
-            path: ./openvino_toolkit
-            key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-        - name: Setup OpenVINO Toolkit
-          if: steps.cache-openvino.outputs.cache-hit != 'true'
-          uses: ./.github/actions/linux-setup-openvino
-          with:
-            path: ./openvino_toolkit
-            version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-            version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-        - name: Install OpenVINO dependencies
-          run: |
-            cd ./openvino_toolkit
-            chmod +x ./install_dependencies/install_openvino_dependencies.sh
-            echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-        - name: Build
-          id: cmake_build
-          run: |
-            source ./openvino_toolkit/setupvars.sh
-            cmake -B build/ReleaseOV -G Ninja \
-              -DCMAKE_BUILD_TYPE=Release \
-              -DGGML_OPENVINO=ON
-            time cmake --build build/ReleaseOV --config Release -j $(nproc)
-
-        - name: Test
-          id: cmake_test
-          # TODO: fix and re-enable the `test-llama-archs` test below
-          run: |
-            cd ${{ github.workspace }}
-            if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
-              export GGML_OPENVINO_DEVICE=GPU
-            fi
-            ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
-
   windows-latest:
     runs-on: windows-2025
 

diff --git a/common/arg.cpp b/common/arg.cpp
@@ -3902,6 +3902,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
+    add_opt(common_arg(
+        {"--spec-default"},
+        string_format("enable default speculative decoding config"),
+        [](common_params & params) {
+            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.ngram_size_n = 24;
+            params.speculative.n_min = 48;
+            params.speculative.n_max = 64;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
     return ctx_arg;
 }
 

diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -749,6 +749,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
 
                     mod.reset();
                     n_low = 0;
+                    i_last = 0;
                 }
             } else {
                 n_low = 0;

diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md
@@ -244,7 +244,6 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
 - `-fa 1` is required when running llama-bench with the OpenVINO backend.
   - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
 - `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
-- For Intel GPU, NPU detection in containers, GPU, NPU user-space drivers/libraries must be present inside the image. We will include in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
 
 > [!NOTE]
 > The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -274,8 +273,6 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
 Run llama.cpp with OpenVINO backend Docker container.
 Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
 
-> [!NOTE]
-> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
 
 ```bash
 #  Run Docker container

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -19,7 +19,6 @@
 #include <iomanip>
 #include <map>
 #include <memory>
-#include <mutex>
 #include <openvino/core/dimension.hpp>
 #include <openvino/core/except.hpp>
 #include <openvino/core/node.hpp>
@@ -207,8 +206,22 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
         break;
     }
     case GGML_OP_ROPE: {
+        const int mode = node->op_params[2];  // rope mode; selects the upper 16 bits of op_case
+        switch (mode) {
+        case GGML_ROPE_TYPE_NEOX: {
+            op_case = 0x00010000;
+            break;
+        }
+        case GGML_ROPE_TYPE_IMROPE: {
+            op_case = 0x00020000;
+            break;
+        }
+        default:
+            op_case = 0x00000000;  // any other mode leaves the upper bits clear
+            break;
+        }
         if (node->src[0]->op == GGML_OP_VIEW) {
-            op_case = 2;
+            op_case = (op_case | 0x00000002);  // keep the rope-type bits and flag the VIEW source in the low bits
         }
         break;
     }
@@ -573,9 +586,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 }
 
 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
-    static std::mutex weights_mutex;
-    std::lock_guard<std::mutex> lock(weights_mutex);
-
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     auto * nodes = cgraph->nodes;
     auto n_nodes = cgraph->n_nodes;