diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index f1221aac..793fb987 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -13,7 +13,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing CUDA")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")'
+                Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -76,7 +76,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing Metal")
                 using Pkg
-                Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")'
+                Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -108,9 +108,8 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing oneAPI")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
-                Pkg.develop(; name="AcceleratedKernels")'
-      sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
+                Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew")
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -141,11 +140,10 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing AMDGPU")
                 using Pkg
-                Pkg.develop(; name="AcceleratedKernels")'
-      sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.9, 0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e '
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")
+                Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew")
                 println("--- :julia: Instantiating project")
                 Pkg.develop(; path=pwd())' || exit 3
 
@@ -176,7 +174,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing OpenCL")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics")
+                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")
                 Pkg.develop(; name="SPIRVIntrinsics")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index cdb4dd96..3881da55 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -833,7 +833,7 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims)
+SharedMemory(::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(T, dims)
 
 __synchronize() = KI.barrier()
 
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 1811ad2f..d462bcc0 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -103,19 +103,119 @@ Returns the unique group ID.
 function get_group_id end
 
 """
-    localmemory(T, dims)
+    get_sub_group_size()::UInt32
+
+Returns the number of work-items in the sub-group.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_size()::UInt32
+    ```
+"""
+function get_sub_group_size end
+
+"""
+    get_max_sub_group_size()::UInt32
+
+Returns the maximum sub-group size for sub-groups in the current workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_max_sub_group_size()::UInt32
+    ```
+"""
+function get_max_sub_group_size end
+
+"""
+    get_num_sub_groups()::UInt32
+
+Returns the number of sub-groups in the current workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_num_sub_groups()::UInt32
+    ```
+"""
+function get_num_sub_groups end
+
+"""
+    get_sub_group_id()::UInt32
+
+Returns the sub-group ID within the work-group.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_id()::UInt32
+    ```
+"""
+function get_sub_group_id end
+
+"""
+    get_sub_group_local_id()::UInt32
+
+Returns the work-item ID within the current sub-group.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_local_id()::UInt32
+    ```
+"""
+function get_sub_group_local_id end
+
+
+"""
+    localmemory(::Type{T}, dims)
 
 Declare memory that is local to a workgroup.
 
 !!! note
     Backend implementations **must** implement:
     ```
-    @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims}
+    @device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ```
     As well as the on-device functionality.
 """
 localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
 
+"""
+    shfl_down(val::T, offset::Integer) where T
+
+Read `val` from a lane with higher id given by `offset`.
+When writing kernels using this function, it should be
+assumed that it is not synchronized.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override shfl_down(val::T, offset::Integer) where T
+    ```
+    As well as the on-device functionality.
+"""
+function shfl_down end
+
+"""
+    shfl_down_types(::Backend)::Vector{DataType}
+
+Returns a vector of `DataType`s supported on `backend`
+
+!!! note
+    Backend implementations **must** implement this function
+    only if they support `shfl_down` for any types.
+"""
+shfl_down_types(::Backend) = DataType[]
+
+
 """
     barrier()
 
@@ -139,6 +239,29 @@ function barrier()
     error("Group barrier used outside kernel or not captured")
 end
 
+"""
+    sub_group_barrier()
+
+After a `sub_group_barrier()` call, all read and writes to global and local memory
+from each thread in the sub-group are visible in from all other threads in the
+sub-group.
+
+This does **not** guarantee that a write from a thread in a certain sub-group will
+be visible to a thread in a different sub-group.
+
+!!! note
+    `sub_group_barrier()` must be encountered by all workitems of a sub-group executing the kernel or by none at all.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override sub_group_barrier()
+    ```
+"""
+function sub_group_barrier()
+    error("Sub-group barrier used outside kernel or not captured")
+end
+
 """
     _print(args...)
 
@@ -174,7 +297,7 @@ kernel on the host.
 
     Backends must also implement the on-device kernel launch functionality.
 """
-struct Kernel{B, Kern}
+struct Kernel{B,Kern}
     backend::B
     kern::Kern
 end
@@ -220,6 +343,22 @@ kernel launch with too big a workgroup is attempted.
 """
 function max_work_group_size end
 
+"""
+    sub_group_size(backend)::Int
+
+Returns a reasonable sub-group size supported by the currently
+active device for the specified backend. This would typically
+be 32, or 64 for devices that don't support 32.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    sub_group_size(backend::NewBackend)::Int
+    ```
+    As well as the on-device functionality.
+"""
+function sub_group_size end
+
 """
     multiprocessor_count(backend::NewBackend)::Int
 
@@ -299,7 +438,7 @@ There are a few keyword arguments that influence the behavior of `KI.@kernel`:
 """
 macro kernel(backend, ex...)
     call = ex[end]
-    kwargs = map(ex[1:(end - 1)]) do kwarg
+    kwargs = map(ex[1:(end-1)]) do kwarg
         if kwarg isa Symbol
             :($kwarg = $kwarg)
         elseif Meta.isexpr(kwarg, :(=))
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 97548c47..c1da42ec 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -1,21 +1,61 @@
 import KernelAbstractions.KernelIntrinsics as KI
 
+struct KernelData
+    global_size::Int
+    global_id::Int
+    local_size::Int
+    local_id::Int
+    num_groups::Int
+    group_id::Int
+end
 function test_intrinsics_kernel(results)
-    # Test all intrinsics return NamedTuples with x, y, z fields
-    global_size = KI.get_global_size()
-    global_id = KI.get_global_id()
-    local_size = KI.get_local_size()
-    local_id = KI.get_local_id()
-    num_groups = KI.get_num_groups()
-    group_id = KI.get_group_id()
-
-    if UInt32(global_id.x) <= UInt32(global_size.x)
-        results[1, global_id.x] = global_id.x
-        results[2, global_id.x] = local_id.x
-        results[3, global_id.x] = group_id.x
-        results[4, global_id.x] = global_size.x
-        results[5, global_id.x] = local_size.x
-        results[6, global_id.x] = num_groups.x
+    i = KI.get_global_id().x
+
+    if i <= length(results)
+        @inbounds results[i] = KernelData(KI.get_global_size().x,
+                                          KI.get_global_id().x,
+                                          KI.get_local_size().x,
+                                          KI.get_local_id().x,
+                                          KI.get_num_groups().x,
+                                          KI.get_group_id().x)
+    end
+    return
+end
+struct SubgroupData
+    sub_group_size::UInt32
+    max_sub_group_size::UInt32
+    num_sub_groups::UInt32
+    sub_group_id::UInt32
+    sub_group_local_id::UInt32
+end
+function test_subgroup_kernel(results)
+    i = KI.get_global_id().x
+
+    if i <= length(results)
+        @inbounds results[i] = SubgroupData(KI.get_sub_group_size(),
+                                            KI.get_max_sub_group_size(),
+                                            KI.get_num_sub_groups(),
+                                            KI.get_sub_group_id(),
+                                            KI.get_sub_group_local_id())
+    end
+    return
+end
+
+function shfl_down_test_kernel(a, b, ::Val{N}) where N
+    idx = KI.get_sub_group_local_id()
+
+    val = a[idx]
+
+    offset = 0x00000001
+    while offset < N
+        val += KI.shfl_down(val, offset)
+        offset <<= 1
+    end
+
+    KI.sub_group_barrier()
+
+    if idx == 1
+        b[idx] = val
     end
     return
 end
@@ -82,43 +122,91 @@ function intrinsics_testsuite(backend, AT)
             @test KI.multiprocessor_count(backend()) isa Int
 
             # Test with small kernel
-            N = 16
-            results = AT(zeros(Int, 6, N))
+            workgroupsize = 4
+            numworkgroups = 4
+            N = workgroupsize * numworkgroups
+            results = AT(Vector{KernelData}(undef, N))
             kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results)
 
             @test KI.kernel_max_work_group_size(kernel) isa Int
             @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1
 
-            kernel(results, workgroupsize = 4, numworkgroups = 4)
+            kernel(results; workgroupsize, numworkgroups)
             KernelAbstractions.synchronize(backend())
 
             host_results = Array(results)
 
             # Verify results make sense
-            for i in 1:N
-                global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]
+            for (i, k_data) in enumerate(host_results)
 
                 # Global IDs should be 1-based and sequential
-                @test global_id_x == i
+                @test k_data.global_id == i
 
                 # Global size should match our ndrange
-                @test global_size_x == N
+                @test k_data.global_size == N
+
+                @test k_data.local_size == workgroupsize
+
+                @test k_data.num_groups == numworkgroups
+
+                # Group ID should be 1-based
+                expected_group = div(i - 1, numworkgroups) + 1
+                @test k_data.group_id == expected_group
+
+                # Local ID should be 1-based within group
+                expected_local = ((i - 1) % workgroupsize) + 1
+                @test k_data.local_id == expected_local
+            end
+        end
 
-                # Local size should be 4 (our workgroupsize)
-                @test local_size_x == 4
+        @testset "Subgroups" begin
+            @test KI.sub_group_size(backend()) isa Int
 
-                # Number of groups should be ceil(N/4) = 4
-                @test num_groups_x == 4
+            # Test with small kernel
+            sg_size = KI.sub_group_size(backend())
+            sg_n = 2
+            workgroupsize = sg_size * sg_n
+            numworkgroups = 2
+            N = workgroupsize * numworkgroups
+
+            results = AT(Vector{SubgroupData}(undef, N))
+            kernel = KI.@kernel backend() launch = false test_subgroup_kernel(results)
+
+            kernel(results; workgroupsize, numworkgroups)
+            KernelAbstractions.synchronize(backend())
+
+            host_results = Array(results)
+
+            # Verify results make sense
+            for (i, sg_data) in enumerate(host_results)
+                @test sg_data.sub_group_size == sg_size
+                @test sg_data.max_sub_group_size == sg_size
+                @test sg_data.num_sub_groups == sg_n
 
                 # Group ID should be 1-based
-                expected_group = div(i - 1, 4) + 1
-                @test group_id_x == expected_group
+                div(((i - 1) % workgroupsize), sg_n) + 1
+                expected_sub_group = div(((i - 1) % workgroupsize), sg_size) + 1
+                @test sg_data.sub_group_id == expected_sub_group
 
                 # Local ID should be 1-based within group
-                expected_local = ((i - 1) % 4) + 1
-                @test local_id_x == expected_local
+                expected_sg_local = ((i - 1) % sg_size) + 1
+                @test sg_data.sub_group_local_id == expected_sg_local
             end
         end
+        @testset "shfl_down(::$T)" for T in KI.shfl_down_types(backend())
+            N = KI.sub_group_size(backend())
+            a = ones(T, N)
+            # a = zeros(T, N)
+            # rand!(a, (1:4))
+
+            dev_a = AT(a)
+            dev_b = AT(zeros(T, N))
+
+            KI.@kernel backend() workgroupsize=N shfl_down_test_kernel(dev_a, dev_b, Val(N))
+
+            b = Array(dev_b)
+            @test sum(a) ≈ b[1]
+        end
     end
     return nothing
 end
diff --git a/test/localmem.jl b/test/localmem.jl
index 9a34d97c..7894ee61 100644
--- a/test/localmem.jl
+++ b/test/localmem.jl
@@ -47,9 +47,26 @@ end
     end
 end
 
+@kernel function many_localmem(A)
+    N = @uniform prod(@groupsize())
+    @uniform begin
+        N2 = prod(@groupsize())
+    end
+    I = @index(Global, Linear)
+    i = @index(Local, Linear)
+    lmem1 = @localmem Int (N,) # Ok iff groupsize is static
+    lmem2 = @localmem Int (N,) # Ok iff groupsize is static
+    @inbounds begin
+        lmem1[i] = i-1
+        lmem2[i] = 1
+        @synchronize
+        A[I] = lmem1[N2 - i + 1] + lmem2[N2 - i + 1]
+    end
+end
+
 function localmem_testsuite(backend, ArrayT)
     @testset "kernels" begin
-        @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16))
+        @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indices(backend(), 16), many_localmem(backend(), 16))
             A = ArrayT{Int}(undef, 64)
             kernel!(A, ndrange = size(A))
             synchronize(backend())