From 664480cd332fb67b9fc9417c9507dd0a8c237703 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 19:44:23 +0530 Subject: [PATCH 01/28] cuBlas level 1 method - scal supported --- deps/onemkl.cpp | 111 ++++++++++++++++++++++++++----------------- deps/onemkl.h | 15 ++++-- lib/mkl/libonemkl.jl | 17 +++++++ lib/mkl/wrappers.jl | 18 ++++++- test/onemkl.jl | 30 ++++++++++++ 5 files changed, 144 insertions(+), 47 deletions(-) create mode 100644 test/onemkl.jl diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index ba9654ba..ca7c9e98 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -8,25 +8,25 @@ // https://spec.oneapi.io/versions/1.0-rev-1/elements/oneMKL/source/domains/blas/gemm.html oneapi::mkl::transpose convert(onemklTranspose val) { - switch (val) { - case ONEMKL_TRANSPOSE_NONTRANS: - return oneapi::mkl::transpose::nontrans; - case ONEMKL_TRANSPOSE_TRANS: - return oneapi::mkl::transpose::trans; - case ONEMLK_TRANSPOSE_CONJTRANS: - return oneapi::mkl::transpose::conjtrans; - } + switch (val) { + case ONEMKL_TRANSPOSE_NONTRANS: + return oneapi::mkl::transpose::nontrans; + case ONEMKL_TRANSPOSE_TRANS: + return oneapi::mkl::transpose::trans; + case ONEMLK_TRANSPOSE_CONJTRANS: + return oneapi::mkl::transpose::conjtrans; + } } extern "C" int onemklHgemm(syclQueue_t device_queue, onemklTranspose transA, onemklTranspose transB, int64_t m, int64_t n, - int64_t k, sycl::half alpha, const sycl::half *A, int64_t lda, - const sycl::half *B, int64_t ldb, sycl::half beta, sycl::half *C, - int64_t ldc) { - oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), - convert(transB), m, n, k, alpha, A, - lda, B, ldb, beta, C, ldc); - return 0; + int64_t k, sycl::half alpha, const sycl::half *A, + int64_t lda, const sycl::half *B, int64_t ldb, + sycl::half beta, sycl::half *C, int64_t ldc) { + oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), + convert(transB), m, n, k, alpha, A, lda, + B, ldb, beta, C, ldc); + return 0; } extern "C" int onemklSgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -34,10 +34,10 @@ extern "C" int onemklSgemm(syclQueue_t device_queue, onemklTranspose transA, int64_t k, float alpha, const float *A, int64_t lda, const float *B, int64_t ldb, float beta, float *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), - convert(transB), m, n, k, alpha, A, - lda, B, ldb, beta, C, ldc); - return 0; + oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), + convert(transB), m, n, k, alpha, A, lda, + B, ldb, beta, C, ldc); + return 0; } extern "C" int onemklDgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -45,10 +45,10 @@ extern "C" int onemklDgemm(syclQueue_t device_queue, onemklTranspose transA, int64_t k, double alpha, const double *A, int64_t lda, const double *B, int64_t ldb, double beta, double *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), - convert(transB), m, n, k, alpha, A, - lda, B, ldb, beta, C, ldc); - return 0; + oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), + convert(transB), m, n, k, alpha, A, lda, + B, ldb, beta, C, ldc); + return 0; } extern "C" int onemklCgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -58,12 +58,12 @@ extern "C" int onemklCgemm(syclQueue_t device_queue, onemklTranspose transA, const float _Complex *B, int64_t ldb, float _Complex beta, float _Complex *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm( - device_queue->val, convert(transA), convert(transB), m, n, k, alpha, - reinterpret_cast *>(A), lda, - reinterpret_cast *>(B), ldb, beta, - reinterpret_cast *>(C), ldc); - return 0; + oneapi::mkl::blas::column_major::gemm( + device_queue->val, convert(transA), convert(transB), m, n, k, alpha, + reinterpret_cast *>(A), lda, + reinterpret_cast *>(B), ldb, beta, + reinterpret_cast *>(C), ldc); + return 0; } extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -73,29 +73,54 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm( - device_queue->val, convert(transA), convert(transB), m, n, k, alpha, - reinterpret_cast *>(A), lda, - reinterpret_cast *>(B), ldb, beta, - reinterpret_cast *>(C), ldc); - return 0; + oneapi::mkl::blas::column_major::gemm( + device_queue->val, convert(transA), convert(transB), m, n, k, alpha, + reinterpret_cast *>(A), lda, + reinterpret_cast *>(B), ldb, beta, + reinterpret_cast *>(C), ldc); + return 0; +} + +// Support Level-1: SCAL primitive +extern "C" void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, + double *x, int64_t incx) { + oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); } +extern "C" void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, + float *x, int64_t incx) { + oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); +} + +extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, + float alpha, float _Complex *x, + int64_t incx) { + oneapi::mkl::blas::column_major::scal( + device_queue->val, n, alpha, reinterpret_cast *>(x), + incx); +} + +extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, + double alpha, double _Complex *x, + int64_t incx) { + oneapi::mkl::blas::column_major::scal( + device_queue->val, n, alpha, reinterpret_cast *>(x), + incx); +} // other -// oneMKL keeps a cache of SYCL queues and tries to destroy them when unloading the library. -// that is incompatible with oneAPI.jl destroying queues before that, so expose a function -// to manually wipe the device cache when we're destroying queues. +// oneMKL keeps a cache of SYCL queues and tries to destroy them when unloading +// the library. that is incompatible with oneAPI.jl destroying queues before +// that, so expose a function to manually wipe the device cache when we're +// destroying queues. namespace oneapi { namespace mkl { namespace gpu { int clean_gpu_caches(); } -} -} +} // namespace mkl +} // namespace oneapi -extern "C" void onemklDestroy() { - oneapi::mkl::gpu::clean_gpu_caches(); -} +extern "C" void onemklDestroy() { oneapi::mkl::gpu::clean_gpu_caches(); } diff --git a/deps/onemkl.h b/deps/onemkl.h index 7e7e065b..56932d7c 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -10,9 +10,9 @@ extern "C" { #endif typedef enum { - ONEMKL_TRANSPOSE_NONTRANS, - ONEMKL_TRANSPOSE_TRANS, - ONEMLK_TRANSPOSE_CONJTRANS + ONEMKL_TRANSPOSE_NONTRANS, + ONEMKL_TRANSPOSE_TRANS, + ONEMLK_TRANSPOSE_CONJTRANS } onemklTranspose; // XXX: how to expose half in C? @@ -39,6 +39,15 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); +void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, + int64_t incx); +void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, + int64_t incx); +void onemklCscal(syclQueue_t device_queue, int64_t n, + float alpha, float _Complex *x, int64_t incx); +void onemklZscal(syclQueue_t device_queue, int64_t n, + double alpha, double _Complex *x, int64_t incx); + void onemklDestroy(); #ifdef __cplusplus } diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 042b5e3a..044f23ba 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -41,3 +41,20 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld B::ZePtr{ComplexF64}, ldb::Int64, beta::ComplexF64, C::ZePtr{ComplexF64}, ldc::Int64)::Cint end + +function onemklDscal(device_queue, n, alpha, x, incx) + @ccall liboneapi_support.onemklDscal(device_queue::syclQueue_t, n::Int64, alpha::Cdouble, x::ZePtr{Cdouble}, incx::Int64)::Cvoid +end + +function onemklSscal(device_queue, n, alpha, x, incx) + @ccall liboneapi_support.onemklSscal(device_queue::syclQueue_t, n::Int64, alpha::Cfloat, x::ZePtr{Cfloat}, incx::Int64)::Cvoid +end + +function onemklZscal(device_queue, n, alpha, x, incx) + @ccall liboneapi_support.onemklZscal(device_queue::syclQueue_t, n::Int64, alpha::ComplexF64, x::ZePtr{ComplexF64}, incx::Int64)::Cvoid +end + +function onemklCscal(device_queue, n, alpha, x, incx) + @ccall liboneapi_support.onemklCscal(device_queue::syclQueue_t, n::Int64, alpha::ComplexF32, x::ZePtr{ComplexF32}, incx::Int64)::Cvoid +end + diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 64da7c37..3419c057 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -14,7 +14,23 @@ function Base.convert(::Type{onemklTranspose}, trans::Char) end end - +# level 1 +## scal +for (fname, elty) in + ((:onemklDscal,:Float64), + (:onemklSscal,:Float32), + (:onemklZscal,:ComplexF64), + (:onemklCscal,:ComplexF32)) + @eval begin + function scal!(n::Integer, + alpha::Number, + x::StridedArray{$elty}) + queue = global_queue(context(x), device(x)) + $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) + x + end + end +end # # BLAS diff --git a/test/onemkl.jl b/test/onemkl.jl new file mode 100644 index 00000000..ac032401 --- /dev/null +++ b/test/onemkl.jl @@ -0,0 +1,30 @@ +#using oneAPI.oneMKL +#using LinearAlgebra + +m = 20 +n = 35 +k = 13 + +const eltypes=[Float32, ComplexF32] +const float64_supported = oneL0.module_properties(device()).fp64flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 == oneL0.ZE_DEVICE_MODULE_FLAG_FP64 +if (float64_supported) + append!(eltypes, [Float64, ComplexF64]) +end + +######################## +@testset "level 1" begin + @testset for T in eltypes + A = oneArray(rand(T, m)) + B = A + alpha = 2 + B = B.* 2 + println("CPU Result") + @show B + + oneMKL.scal!(m, alpha, A) + println("GPU Result") + @show A + + @test Array(A) == Array(B) + end +end From 5180b9b79a5bef237e593515cdd9f674365afb39 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 22:13:03 +0530 Subject: [PATCH 02/28] scal test case updated in onemkl.jl --- test/onemkl.jl | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index ac032401..38e2851e 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -1,11 +1,11 @@ -#using oneAPI.oneMKL -#using LinearAlgebra +using oneAPI.oneMKL +using LinearAlgebra m = 20 n = 35 k = 13 -const eltypes=[Float32, ComplexF32] +eltypes=[Float32, ComplexF32] const float64_supported = oneL0.module_properties(device()).fp64flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 == oneL0.ZE_DEVICE_MODULE_FLAG_FP64 if (float64_supported) append!(eltypes, [Float64, ComplexF64]) @@ -14,17 +14,14 @@ end ######################## @testset "level 1" begin @testset for T in eltypes - A = oneArray(rand(T, m)) - B = A - alpha = 2 - B = B.* 2 - println("CPU Result") - @show B - - oneMKL.scal!(m, alpha, A) - println("GPU Result") - @show A - - @test Array(A) == Array(B) + A = rand(T, m) + gpuA = oneArray(A) + if T === Float32 + oneMKL.scal!(m, 5f0, gpuA) + else + oneMKL.scal!(m, 5.0, gpuA) + end + _A = Array(gpuA) + @test isapprox(A .* 5, _A) end end From 1307b8d52a9515d521944c13581b712fb24036b3 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 22:19:00 +0530 Subject: [PATCH 03/28] NITS - cleanup --- deps/onemkl.h | 1 + lib/mkl/wrappers.jl | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/deps/onemkl.h b/deps/onemkl.h index 56932d7c..b45e9eae 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -39,6 +39,7 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); +// Level-1: scal oneMKL void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, int64_t incx); void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 3419c057..4c11f597 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,8 +23,8 @@ for (fname, elty) in (:onemklCscal,:ComplexF32)) @eval begin function scal!(n::Integer, - alpha::Number, - x::StridedArray{$elty}) + alpha::Number, + x::StridedArray{$elty}) queue = global_queue(context(x), device(x)) $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) x From 86f9dc73d259b89d8233145df5d811a6727f3cb5 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 22:26:27 +0530 Subject: [PATCH 04/28] indentation NITS --- deps/onemkl.cpp | 111 +++++++++++++++++++----------------------------- deps/onemkl.h | 16 ++----- 2 files changed, 46 insertions(+), 81 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index ca7c9e98..ba9654ba 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -8,25 +8,25 @@ // https://spec.oneapi.io/versions/1.0-rev-1/elements/oneMKL/source/domains/blas/gemm.html oneapi::mkl::transpose convert(onemklTranspose val) { - switch (val) { - case ONEMKL_TRANSPOSE_NONTRANS: - return oneapi::mkl::transpose::nontrans; - case ONEMKL_TRANSPOSE_TRANS: - return oneapi::mkl::transpose::trans; - case ONEMLK_TRANSPOSE_CONJTRANS: - return oneapi::mkl::transpose::conjtrans; - } + switch (val) { + case ONEMKL_TRANSPOSE_NONTRANS: + return oneapi::mkl::transpose::nontrans; + case ONEMKL_TRANSPOSE_TRANS: + return oneapi::mkl::transpose::trans; + case ONEMLK_TRANSPOSE_CONJTRANS: + return oneapi::mkl::transpose::conjtrans; + } } extern "C" int onemklHgemm(syclQueue_t device_queue, onemklTranspose transA, onemklTranspose transB, int64_t m, int64_t n, - int64_t k, sycl::half alpha, const sycl::half *A, - int64_t lda, const sycl::half *B, int64_t ldb, - sycl::half beta, sycl::half *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), - convert(transB), m, n, k, alpha, A, lda, - B, ldb, beta, C, ldc); - return 0; + int64_t k, sycl::half alpha, const sycl::half *A, int64_t lda, + const sycl::half *B, int64_t ldb, sycl::half beta, sycl::half *C, + int64_t ldc) { + oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), + convert(transB), m, n, k, alpha, A, + lda, B, ldb, beta, C, ldc); + return 0; } extern "C" int onemklSgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -34,10 +34,10 @@ extern "C" int onemklSgemm(syclQueue_t device_queue, onemklTranspose transA, int64_t k, float alpha, const float *A, int64_t lda, const float *B, int64_t ldb, float beta, float *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), - convert(transB), m, n, k, alpha, A, lda, - B, ldb, beta, C, ldc); - return 0; + oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), + convert(transB), m, n, k, alpha, A, + lda, B, ldb, beta, C, ldc); + return 0; } extern "C" int onemklDgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -45,10 +45,10 @@ extern "C" int onemklDgemm(syclQueue_t device_queue, onemklTranspose transA, int64_t k, double alpha, const double *A, int64_t lda, const double *B, int64_t ldb, double beta, double *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), - convert(transB), m, n, k, alpha, A, lda, - B, ldb, beta, C, ldc); - return 0; + oneapi::mkl::blas::column_major::gemm(device_queue->val, convert(transA), + convert(transB), m, n, k, alpha, A, + lda, B, ldb, beta, C, ldc); + return 0; } extern "C" int onemklCgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -58,12 +58,12 @@ extern "C" int onemklCgemm(syclQueue_t device_queue, onemklTranspose transA, const float _Complex *B, int64_t ldb, float _Complex beta, float _Complex *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm( - device_queue->val, convert(transA), convert(transB), m, n, k, alpha, - reinterpret_cast *>(A), lda, - reinterpret_cast *>(B), ldb, beta, - reinterpret_cast *>(C), ldc); - return 0; + oneapi::mkl::blas::column_major::gemm( + device_queue->val, convert(transA), convert(transB), m, n, k, alpha, + reinterpret_cast *>(A), lda, + reinterpret_cast *>(B), ldb, beta, + reinterpret_cast *>(C), ldc); + return 0; } extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, @@ -73,54 +73,29 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc) { - oneapi::mkl::blas::column_major::gemm( - device_queue->val, convert(transA), convert(transB), m, n, k, alpha, - reinterpret_cast *>(A), lda, - reinterpret_cast *>(B), ldb, beta, - reinterpret_cast *>(C), ldc); - return 0; -} - -// Support Level-1: SCAL primitive -extern "C" void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, - double *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); + oneapi::mkl::blas::column_major::gemm( + device_queue->val, convert(transA), convert(transB), m, n, k, alpha, + reinterpret_cast *>(A), lda, + reinterpret_cast *>(B), ldb, beta, + reinterpret_cast *>(C), ldc); + return 0; } -extern "C" void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, - float *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); -} - -extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, - float alpha, float _Complex *x, - int64_t incx) { - oneapi::mkl::blas::column_major::scal( - device_queue->val, n, alpha, reinterpret_cast *>(x), - incx); -} - -extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, - double alpha, double _Complex *x, - int64_t incx) { - oneapi::mkl::blas::column_major::scal( - device_queue->val, n, alpha, reinterpret_cast *>(x), - incx); -} // other -// oneMKL keeps a cache of SYCL queues and tries to destroy them when unloading -// the library. that is incompatible with oneAPI.jl destroying queues before -// that, so expose a function to manually wipe the device cache when we're -// destroying queues. +// oneMKL keeps a cache of SYCL queues and tries to destroy them when unloading the library. +// that is incompatible with oneAPI.jl destroying queues before that, so expose a function +// to manually wipe the device cache when we're destroying queues. namespace oneapi { namespace mkl { namespace gpu { int clean_gpu_caches(); } -} // namespace mkl -} // namespace oneapi +} +} -extern "C" void onemklDestroy() { oneapi::mkl::gpu::clean_gpu_caches(); } +extern "C" void onemklDestroy() { + oneapi::mkl::gpu::clean_gpu_caches(); +} diff --git a/deps/onemkl.h b/deps/onemkl.h index b45e9eae..7e7e065b 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -10,9 +10,9 @@ extern "C" { #endif typedef enum { - ONEMKL_TRANSPOSE_NONTRANS, - ONEMKL_TRANSPOSE_TRANS, - ONEMLK_TRANSPOSE_CONJTRANS + ONEMKL_TRANSPOSE_NONTRANS, + ONEMKL_TRANSPOSE_TRANS, + ONEMLK_TRANSPOSE_CONJTRANS } onemklTranspose; // XXX: how to expose half in C? @@ -39,16 +39,6 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); -// Level-1: scal oneMKL -void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, - int64_t incx); -void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, - int64_t incx); -void onemklCscal(syclQueue_t device_queue, int64_t n, - float alpha, float _Complex *x, int64_t incx); -void onemklZscal(syclQueue_t device_queue, int64_t n, - double alpha, double _Complex *x, int64_t incx); - void onemklDestroy(); #ifdef __cplusplus } From 57495e992ca7abad1faedb4253d07ceeffeeb0ed Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 22:29:54 +0530 Subject: [PATCH 05/28] updated with scal - deps --- deps/onemkl.cpp | 27 +++++++++++++++++++++++++++ deps/onemkl.h | 10 ++++++++++ 2 files changed, 37 insertions(+) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index ba9654ba..9fbec1ad 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -81,6 +81,33 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, return 0; } +// Support Level-1: SCAL primitive +extern "C" void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, + double *x, int64_t incx) { + oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); +} + +extern "C" void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, + float *x, int64_t incx) { + oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); +} + +extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, + float alpha, float _Complex *x, + int64_t incx) { + oneapi::mkl::blas::column_major::scal( + device_queue->val, n, alpha, reinterpret_cast *>(x), + incx); +} + +extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, + double alpha, double _Complex *x, + int64_t incx) { + oneapi::mkl::blas::column_major::scal( + device_queue->val, n, alpha, reinterpret_cast *>(x), + incx); +} + // other diff --git a/deps/onemkl.h b/deps/onemkl.h index 7e7e065b..d5776cf6 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -39,6 +39,16 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, const double _Complex *B, int64_t ldb, double _Complex beta, double _Complex *C, int64_t ldc); +// Level-1: scal oneMKL +void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, + int64_t incx); +void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, + int64_t incx); +void onemklCscal(syclQueue_t device_queue, int64_t n, + float alpha, float _Complex *x, int64_t incx); +void onemklZscal(syclQueue_t device_queue, int64_t n, + double alpha, double _Complex *x, int64_t incx); + void onemklDestroy(); #ifdef __cplusplus } From 28f245f202b7c1e6305b0f714b1ff45d20519d60 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 22:35:24 +0530 Subject: [PATCH 06/28] indentation fixes --- deps/onemkl.h | 12 ++++-------- lib/mkl/wrappers.jl | 12 ++++++------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/deps/onemkl.h b/deps/onemkl.h index d5776cf6..fc6a3436 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -40,14 +40,10 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex *C, int64_t ldc); // Level-1: scal oneMKL -void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, - int64_t incx); -void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, - int64_t incx); -void onemklCscal(syclQueue_t device_queue, int64_t n, - float alpha, float _Complex *x, int64_t incx); -void onemklZscal(syclQueue_t device_queue, int64_t n, - double alpha, double _Complex *x, int64_t incx); +void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, int64_t incx); +void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, int64_t incx); +void onemklCscal(syclQueue_t device_queue, int64_t n, float alpha, float _Complex *x, int64_t incx); +void onemklZscal(syclQueue_t device_queue, int64_t n, double alpha, double _Complex *x, int64_t incx); void onemklDestroy(); #ifdef __cplusplus diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 4c11f597..a0cee887 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,12 +23,12 @@ for (fname, elty) in (:onemklCscal,:ComplexF32)) @eval begin function scal!(n::Integer, - alpha::Number, - x::StridedArray{$elty}) - queue = global_queue(context(x), device(x)) - $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) - x - end + alpha::Number, + x::StridedArray{$elty}) + queue = global_queue(context(x), device(x)) + $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) + x + end end end From 86591f512a0aa56b24b1e50d0fec4a6100df4638 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 22:37:36 +0530 Subject: [PATCH 07/28] NITS --- lib/mkl/wrappers.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index a0cee887..baa2dc64 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,12 +23,12 @@ for (fname, elty) in (:onemklCscal,:ComplexF32)) @eval begin function scal!(n::Integer, - alpha::Number, - x::StridedArray{$elty}) - queue = global_queue(context(x), device(x)) - $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) + alpha::Number, + x::StridedArray{$elty}) + queue = global_queue(context(x), device(x)) + $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) x - end + end end end From c49620fe56108b31b3fec3ce8ff15ec106d1031e Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 22:38:29 +0530 Subject: [PATCH 08/28] NITS --- lib/mkl/wrappers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index baa2dc64..59735259 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -27,7 +27,7 @@ for (fname, elty) in x::StridedArray{$elty}) queue = global_queue(context(x), device(x)) $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) - x + x end end end From 990300a65a7f30cfc5d0e712009642781029f335 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Mon, 31 Oct 2022 23:05:04 +0530 Subject: [PATCH 09/28] cleanup onemkl.jl --- test/onemkl.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 38e2851e..1ffca0b7 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -5,11 +5,6 @@ m = 20 n = 35 k = 13 -eltypes=[Float32, ComplexF32] -const float64_supported = oneL0.module_properties(device()).fp64flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 == oneL0.ZE_DEVICE_MODULE_FLAG_FP64 -if (float64_supported) - append!(eltypes, [Float64, ComplexF64]) -end ######################## @testset "level 1" begin From 1e164dbded3cbfb3c5c9bb20edd60938d362a9d3 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 1 Nov 2022 21:45:48 +0530 Subject: [PATCH 10/28] updated with rmul! and testf usage --- lib/mkl/linalg.jl | 3 +++ test/onemkl.jl | 27 ++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index d1d6ae6b..78ef59cf 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -49,6 +49,9 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end end +#LinearAlgebra.rmul!(x::oneStridedVecOrMat, k::Number) = +# oneMKL.scal!(length(x), k, x) + for NT in (Number, Real) # NOTE: alpha/beta also ::Real to avoid ambiguities with certain Base methods @eval begin diff --git a/test/onemkl.jl b/test/onemkl.jl index 1ffca0b7..cdf7efa3 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -1,3 +1,4 @@ +using oneAPI using oneAPI.oneMKL using LinearAlgebra @@ -5,18 +6,34 @@ m = 20 n = 35 k = 13 - ######################## @testset "level 1" begin - @testset for T in eltypes - A = rand(T, m) + @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) + alpha = rand(T,1) + A = rand(T,m) gpuA = oneArray(A) + gpuB = oneArray(A) +##### Failed Cases +# @test testf(rmul!, gpuA, Ref(alpha[1])) +# @test testf(oneMKL.scal!, m, alpha[1], gpuA) +# oneMKL.scal!(m, alpha[1], gpuA) +# _A = Array(gpuA) +# @test isapprox(A .* alpha[1], _A) + +#### Pass Cases + # TODO: This test passes if we disable our own implementation + # of rmul in lib/mkl/linalg.jl which means rmul! cpu implmentation might + # be taking over it ? + @test testf(rmul!, gpuB, Ref(alpha[1])) + if T === Float32 oneMKL.scal!(m, 5f0, gpuA) - else + else oneMKL.scal!(m, 5.0, gpuA) end + _A = Array(gpuA) - @test isapprox(A .* 5, _A) + @test isapprox(A .* 5, _A) + end end From 139de1c9c9a06beab97f8bc122127ba8cdfe1dbd Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 1 Nov 2022 21:49:23 +0530 Subject: [PATCH 11/28] NITS --- test/onemkl.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index cdf7efa3..1f6fe3c3 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -14,24 +14,23 @@ k = 13 gpuA = oneArray(A) gpuB = oneArray(A) ##### Failed Cases +# Following test fails if we use our own rmul! implementation from lib/mkl/linalg.jl # @test testf(rmul!, gpuA, Ref(alpha[1])) # @test testf(oneMKL.scal!, m, alpha[1], gpuA) -# oneMKL.scal!(m, alpha[1], gpuA) -# _A = Array(gpuA) -# @test isapprox(A .* alpha[1], _A) #### Pass Cases # TODO: This test passes if we disable our own implementation # of rmul in lib/mkl/linalg.jl which means rmul! cpu implmentation might # be taking over it ? @test testf(rmul!, gpuB, Ref(alpha[1])) - + + # This test works fine + # It manually checks for CPU/GPU comparisons for scal primitive if T === Float32 oneMKL.scal!(m, 5f0, gpuA) else oneMKL.scal!(m, 5.0, gpuA) end - _A = Array(gpuA) @test isapprox(A .* 5, _A) From 23943bb204711626bb8c1f59711d1d9f62b0da35 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 2 Nov 2022 10:41:57 +0530 Subject: [PATCH 12/28] testf used for cpu/gpu testing. 1. Input array is converted to non-gpu array 2. alpha is rand instead of based on T typewq --- lib/mkl/linalg.jl | 4 ++-- test/onemkl.jl | 26 ++------------------------ 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index 78ef59cf..bb9e60f1 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -49,8 +49,8 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end end -#LinearAlgebra.rmul!(x::oneStridedVecOrMat, k::Number) = -# oneMKL.scal!(length(x), k, x) +LinearAlgebra.rmul!(x::oneStridedVecOrMat, k::Number) = + oneMKL.scal!(length(x), k, x) for NT in (Number, Real) # NOTE: alpha/beta also ::Real to avoid ambiguities with certain Base methods diff --git a/test/onemkl.jl b/test/onemkl.jl index 1f6fe3c3..36fd72cd 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -9,30 +9,8 @@ k = 13 ######################## @testset "level 1" begin @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) - alpha = rand(T,1) + alpha = rand() A = rand(T,m) - gpuA = oneArray(A) - gpuB = oneArray(A) -##### Failed Cases -# Following test fails if we use our own rmul! implementation from lib/mkl/linalg.jl -# @test testf(rmul!, gpuA, Ref(alpha[1])) -# @test testf(oneMKL.scal!, m, alpha[1], gpuA) - -#### Pass Cases - # TODO: This test passes if we disable our own implementation - # of rmul in lib/mkl/linalg.jl which means rmul! cpu implmentation might - # be taking over it ? - @test testf(rmul!, gpuB, Ref(alpha[1])) - - # This test works fine - # It manually checks for CPU/GPU comparisons for scal primitive - if T === Float32 - oneMKL.scal!(m, 5f0, gpuA) - else - oneMKL.scal!(m, 5.0, gpuA) - end - _A = Array(gpuA) - @test isapprox(A .* 5, _A) - + @test testf(rmul!, A, Ref(alpha[1])) end end From b51b87017169314f59e2dada7f4dec93256dc5a0 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 2 Nov 2022 11:50:57 +0530 Subject: [PATCH 13/28] NITS - clenaup & included int specific calls to rmul! diverted to non-gpuarray flow --- lib/mkl/linalg.jl | 6 +++++- lib/mkl/oneMKL.jl | 3 ++- test/onemkl.jl | 10 ++++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index bb9e60f1..5ac16a5e 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -49,9 +49,13 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end end -LinearAlgebra.rmul!(x::oneStridedVecOrMat, k::Number) = +LinearAlgebra.rmul!(x::oneStridedVecOrMat{<:onemklFloat}, k::Number) = oneMKL.scal!(length(x), k, x) +# Work around ambiguity with GPUArrays wrapper +LinearAlgebra.rmul!(x::oneStridedVecOrMat{<:onemklFloat}, k::Real) = + invoke(rmul!, Tuple{typeof(x), Number}, x, k) + for NT in (Number, Real) # NOTE: alpha/beta also ::Real to avoid ambiguities with certain Base methods @eval begin diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl index d83f2141..7b0b24b2 100644 --- a/lib/mkl/oneMKL.jl +++ b/lib/mkl/oneMKL.jl @@ -12,7 +12,8 @@ using GPUArrays include("libonemkl.jl") -const onemklFloat = Union{Float64,Float32,Float16,ComplexF64,ComplexF32} +# Exclude Float16 for now, since many oneMKL functions - copy, scal, do not take Float16 +const onemklFloat = Union{Float64,Float32,ComplexF64,ComplexF32} include("wrappers.jl") include("linalg.jl") diff --git a/test/onemkl.jl b/test/onemkl.jl index 36fd72cd..ff47a8b5 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -8,9 +8,11 @@ k = 13 ######################## @testset "level 1" begin - @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) - alpha = rand() - A = rand(T,m) - @test testf(rmul!, A, Ref(alpha[1])) + @testset for T in eltypes + if T <:oneMKL.onemklFloat + alpha = rand() + A = rand(T,m) + @test testf(rmul!, A, alpha) + end end end From ab0abb2ce9702d1d2c1060f80b08bec9c0e1715f Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 11:04:27 +0530 Subject: [PATCH 14/28] wrapper alpha turns elttype and support all combinationswq --- lib/mkl/linalg.jl | 2 +- lib/mkl/wrappers.jl | 2 +- test/onemkl.jl | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/mkl/linalg.jl b/lib/mkl/linalg.jl index 5ac16a5e..029ca554 100644 --- a/lib/mkl/linalg.jl +++ b/lib/mkl/linalg.jl @@ -50,7 +50,7 @@ function gemm_dispatch!(C::oneStridedVecOrMat, A, B, alpha::Number=true, beta::N end LinearAlgebra.rmul!(x::oneStridedVecOrMat{<:onemklFloat}, k::Number) = - oneMKL.scal!(length(x), k, x) + oneMKL.scal!(length(x), convert(eltype(x),k), x) # Work around ambiguity with GPUArrays wrapper LinearAlgebra.rmul!(x::oneStridedVecOrMat{<:onemklFloat}, k::Real) = diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 59735259..6f1abffb 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,7 +23,7 @@ for (fname, elty) in (:onemklCscal,:ComplexF32)) @eval begin function scal!(n::Integer, - alpha::Number, + alpha::$elty, x::StridedArray{$elty}) queue = global_queue(context(x), device(x)) $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) diff --git a/test/onemkl.jl b/test/onemkl.jl index ff47a8b5..87cdfb7a 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -10,9 +10,7 @@ k = 13 @testset "level 1" begin @testset for T in eltypes if T <:oneMKL.onemklFloat - alpha = rand() - A = rand(T,m) - @test testf(rmul!, A, alpha) + @test testf(rmul!, rand(T,m), Ref(rand())) end end end From d5a58e0de7b75e681ecccddb5cfdeedf16b499e3 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 11:30:53 +0530 Subject: [PATCH 15/28] NITS --- test/onemkl.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 87cdfb7a..cfc56fd0 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -10,7 +10,7 @@ k = 13 @testset "level 1" begin @testset for T in eltypes if T <:oneMKL.onemklFloat - @test testf(rmul!, rand(T,m), Ref(rand())) + @test testf(rmul!, rand(T,m), rand()) end end end From c39e90c6757ddbfa03c541e8b907b37315b7cc49 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 12:11:28 +0530 Subject: [PATCH 16/28] support for Cs, Zd configs of scal function --- deps/onemkl.cpp | 17 ++++++++++++++++- deps/onemkl.h | 6 ++++-- lib/mkl/libonemkl.jl | 7 +++++++ lib/mkl/wrappers.jl | 3 ++- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index 9fbec1ad..ce007f3c 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -93,6 +93,14 @@ extern "C" void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, } extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, + float _Complex alpha, float _Complex *x, + int64_t incx) { + oneapi::mkl::blas::column_major::scal( + device_queue->val, n, std::complex(alpha), reinterpret_cast *>(x), + incx); +} + +extern "C" void onemklCsscal(syclQueue_t device_queue, int64_t n, float alpha, float _Complex *x, int64_t incx) { oneapi::mkl::blas::column_major::scal( @@ -101,6 +109,14 @@ extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, } extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, + double _Complex alpha, double _Complex *x, + int64_t incx) { + oneapi::mkl::blas::column_major::scal( + device_queue->val, n, std::complex(alpha), reinterpret_cast *>(x), + incx); +} + +extern "C" void onemklZdscal(syclQueue_t device_queue, int64_t n, double alpha, double _Complex *x, int64_t incx) { oneapi::mkl::blas::column_major::scal( @@ -108,7 +124,6 @@ extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, incx); } - // other // oneMKL keeps a cache of SYCL queues and tries to destroy them when unloading the library. diff --git a/deps/onemkl.h b/deps/onemkl.h index fc6a3436..7a307caa 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -42,8 +42,10 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, // Level-1: scal oneMKL void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, int64_t incx); void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, int64_t incx); -void onemklCscal(syclQueue_t device_queue, int64_t n, float alpha, float _Complex *x, int64_t incx); -void onemklZscal(syclQueue_t device_queue, int64_t n, double alpha, double _Complex *x, int64_t incx); +void onemklCscal(syclQueue_t device_queue, int64_t n, float _Complex alpha, float _Complex *x, int64_t incx); +void onemklCsscal(syclQueue_t device_queue, int64_t n, float alpha, float _Complex *x, int64_t incx); +void onemklZscal(syclQueue_t device_queue, int64_t n, double _Complex alpha, double _Complex *x, int64_t incx); +void onemklZdscal(syclQueue_t device_queue, int64_t n, double alpha, double _Complex *x, int64_t incx); void onemklDestroy(); #ifdef __cplusplus diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 044f23ba..16170de9 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -54,7 +54,14 @@ function onemklZscal(device_queue, n, alpha, x, incx) @ccall liboneapi_support.onemklZscal(device_queue::syclQueue_t, n::Int64, alpha::ComplexF64, x::ZePtr{ComplexF64}, incx::Int64)::Cvoid end +function onemklZdscal(device_queue, n, alpha, x, incx) + @ccall liboneapi_support.onemklZdscal(device_queue::syclQueue_t, n::Int64, alpha::Cdouble, x::ZePtr{ComplexF64}, incx::Int64)::Cvoid +end + function onemklCscal(device_queue, n, alpha, x, incx) @ccall liboneapi_support.onemklCscal(device_queue::syclQueue_t, n::Int64, alpha::ComplexF32, x::ZePtr{ComplexF32}, incx::Int64)::Cvoid end +function onemklCsscal(device_queue, n, alpha, x, incx) + @ccall liboneapi_support.onemklCsscal(device_queue::syclQueue_t, n::Int64, alpha::Cfloat, x::ZePtr{ComplexF32}, incx::Int64)::Cvoid +end diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 6f1abffb..0763b0b6 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,9 +23,10 @@ for (fname, elty) in (:onemklCscal,:ComplexF32)) @eval begin function scal!(n::Integer, - alpha::$elty, + alpha::Number, x::StridedArray{$elty}) queue = global_queue(context(x), device(x)) + alpha = $elty(alpha) $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) x end From 0a17298c50b8433b6a4ccabc59cc326095e12ec8 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Thu, 3 Nov 2022 14:39:21 +0530 Subject: [PATCH 17/28] updated with staticcast complex alpha --- deps/onemkl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index ce007f3c..6cd9b216 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -96,7 +96,7 @@ extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, float _Complex alpha, float _Complex *x, int64_t incx) { oneapi::mkl::blas::column_major::scal( - device_queue->val, n, std::complex(alpha), reinterpret_cast *>(x), + device_queue->val, n, static_cast >(alpha), reinterpret_cast *>(x), incx); } @@ -112,7 +112,7 @@ extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, double _Complex alpha, double _Complex *x, int64_t incx) { oneapi::mkl::blas::column_major::scal( - device_queue->val, n, std::complex(alpha), reinterpret_cast *>(x), + device_queue->val, n, static_cast >(alpha), reinterpret_cast *>(x), incx); } From e0aa24ef3ea402b1338b90e6902cf986426b42f0 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 11:26:40 +0530 Subject: [PATCH 18/28] added onestridedarray --- lib/mkl/wrappers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 0763b0b6..489af082 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -24,7 +24,7 @@ for (fname, elty) in @eval begin function scal!(n::Integer, alpha::Number, - x::StridedArray{$elty}) + x::oneStridedArray{$elty}) queue = global_queue(context(x), device(x)) alpha = $elty(alpha) $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) From b9a7d29dd3e2c027770b76fab35d154e35b11078 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Fri, 4 Nov 2022 18:35:45 +0530 Subject: [PATCH 19/28] enable tests of complex tye --- test/onemkl.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index cfc56fd0..46f35ac2 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -10,7 +10,9 @@ k = 13 @testset "level 1" begin @testset for T in eltypes if T <:oneMKL.onemklFloat - @test testf(rmul!, rand(T,m), rand()) + alpha = rand(T,1) + @test testf(rmul!, rand(T,m), alpha[1]) end + end end From 1c1b2069e2a7994f2420c920e5f1b92ebf239785 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 13:06:57 +0530 Subject: [PATCH 20/28] updated with Csscal and Zdscal test enabled --- deps/onemkl.cpp | 1 + lib/mkl/wrappers.jl | 14 ++++++++++++-- test/onemkl.jl | 22 ++++++++++++++++++++-- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index f1dfabcd..b9b9c56b 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -122,6 +122,7 @@ extern "C" void onemklZdscal(syclQueue_t device_queue, int64_t n, oneapi::mkl::blas::column_major::scal( device_queue->val, n, alpha, reinterpret_cast *>(x), incx); +} extern "C" void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *y, int64_t incy) { diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index e13ecf3d..2ad8a29a 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -23,16 +23,26 @@ for (fname, elty) in (:onemklCscal,:ComplexF32)) @eval begin function scal!(n::Integer, - alpha::Number, + alpha::$elty, x::oneStridedArray{$elty}) queue = global_queue(context(x), device(x)) - alpha = $elty(alpha) $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) x end end end +for (fname, elty, celty) in ((:onemklCsscal, :Float32, :ComplexF32), + (:onemklZdscal, :Float64, :ComplexF64)) + @eval begin + function scal!(n::Integer, + alpha::$elty, + x::oneStridedArray{$celty}) + queue = global_queue(context(x), device(x)) + $fname(sycl_queue(queue), n, alpha, x, stride(x,1)) + end + end +end # # BLAS # diff --git a/test/onemkl.jl b/test/onemkl.jl index 34391523..795feb2f 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -3,7 +3,7 @@ using oneAPI.oneMKL using LinearAlgebra -m = 20 +m = 20 n = 35 k = 13 @@ -15,8 +15,26 @@ k = 13 oneMKL.copy!(m,A,B) @test Array(A) == Array(B) - # Test scal primitive + # Test scal primitive [alpha/x: F32, F64, CF32, CF64] alpha = rand(T,1) @test testf(rmul!, rand(T,m), alpha[1]) + + # Test scal primitive [alpha - F32, F64, x - CF32, CF64] + A = rand(T,m) + gpuA = oneArray(A) + if T === ComplexF32 + alphaf32 = rand(Float32, 1) + B = A .* alphaf32[1] + oneMKL.scal!(m, alphaf32[1], gpuA) + @test Array(B) == Array(gpuA) + end + + if T === ComplexF64 + alphaf64 = rand(Float64, 1) + B = A .* alphaF64[1] + oneMKL.scal!(m, alphaf64[1], gpuA) + @test Array(B) == Array(gpuA) + end + end # level 1 testset end From d423806aa3e485beaa47b84f9586db3938c5b315 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 13:20:53 +0530 Subject: [PATCH 21/28] NITS --- deps/onemkl.cpp | 22 ++++++++++------------ deps/onemkl.h | 18 ++++++++++++------ lib/mkl/libonemkl.jl | 18 ++++++++++++------ 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/deps/onemkl.cpp b/deps/onemkl.cpp index b9b9c56b..e02e93a7 100644 --- a/deps/onemkl.cpp +++ b/deps/onemkl.cpp @@ -95,33 +95,31 @@ extern "C" void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, float _Complex alpha, float _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal( - device_queue->val, n, static_cast >(alpha), reinterpret_cast *>(x), - incx); + oneapi::mkl::blas::column_major::scal(device_queue->val, n, + static_cast >(alpha), + reinterpret_cast *>(x),incx); } extern "C" void onemklCsscal(syclQueue_t device_queue, int64_t n, float alpha, float _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal( - device_queue->val, n, alpha, reinterpret_cast *>(x), - incx); + oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, + reinterpret_cast *>(x),incx); } extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, double _Complex alpha, double _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal( - device_queue->val, n, static_cast >(alpha), reinterpret_cast *>(x), - incx); + oneapi::mkl::blas::column_major::scal(device_queue->val, n, + static_cast >(alpha), + reinterpret_cast *>(x),incx); } extern "C" void onemklZdscal(syclQueue_t device_queue, int64_t n, double alpha, double _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal( - device_queue->val, n, alpha, reinterpret_cast *>(x), - incx); + oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, + reinterpret_cast *>(x),incx); } extern "C" void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, diff --git a/deps/onemkl.h b/deps/onemkl.h index af9d2cfb..4fdca1e4 100644 --- a/deps/onemkl.h +++ b/deps/onemkl.h @@ -40,12 +40,18 @@ int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, double _Complex *C, int64_t ldc); // Level-1: scal oneMKL -void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, int64_t incx); -void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, int64_t incx); -void onemklCscal(syclQueue_t device_queue, int64_t n, float _Complex alpha, float _Complex *x, int64_t incx); -void onemklCsscal(syclQueue_t device_queue, int64_t n, float alpha, float _Complex *x, int64_t incx); -void onemklZscal(syclQueue_t device_queue, int64_t n, double _Complex alpha, double _Complex *x, int64_t incx); -void onemklZdscal(syclQueue_t device_queue, int64_t n, double alpha, double _Complex *x, int64_t incx); +void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, + double *x, int64_t incx); +void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, + float *x, int64_t incx); +void onemklCscal(syclQueue_t device_queue, int64_t n, float _Complex alpha, + float _Complex *x, int64_t incx); +void onemklCsscal(syclQueue_t device_queue, int64_t n, float alpha, + float _Complex *x, int64_t incx); +void onemklZscal(syclQueue_t device_queue, int64_t n, double _Complex alpha, + double _Complex *x, int64_t incx); +void onemklZdscal(syclQueue_t device_queue, int64_t n, double alpha, + double _Complex *x, int64_t incx); void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, int64_t incx, double *y, int64_t incy); diff --git a/lib/mkl/libonemkl.jl b/lib/mkl/libonemkl.jl index 755e32e6..1271c5c1 100644 --- a/lib/mkl/libonemkl.jl +++ b/lib/mkl/libonemkl.jl @@ -43,27 +43,33 @@ function onemklZgemm(device_queue, transA, transB, m, n, k, alpha, A, lda, B, ld end function onemklDscal(device_queue, n, alpha, x, incx) - @ccall liboneapi_support.onemklDscal(device_queue::syclQueue_t, n::Int64, alpha::Cdouble, x::ZePtr{Cdouble}, incx::Int64)::Cvoid + @ccall liboneapi_support.onemklDscal(device_queue::syclQueue_t, n::Int64, + alpha::Cdouble, x::ZePtr{Cdouble}, incx::Int64)::Cvoid end function onemklSscal(device_queue, n, alpha, x, incx) - @ccall liboneapi_support.onemklSscal(device_queue::syclQueue_t, n::Int64, alpha::Cfloat, x::ZePtr{Cfloat}, incx::Int64)::Cvoid + @ccall liboneapi_support.onemklSscal(device_queue::syclQueue_t, n::Int64, + alpha::Cfloat, x::ZePtr{Cfloat}, incx::Int64)::Cvoid end function onemklZscal(device_queue, n, alpha, x, incx) - @ccall liboneapi_support.onemklZscal(device_queue::syclQueue_t, n::Int64, alpha::ComplexF64, x::ZePtr{ComplexF64}, incx::Int64)::Cvoid + @ccall liboneapi_support.onemklZscal(device_queue::syclQueue_t, n::Int64, + alpha::ComplexF64, x::ZePtr{ComplexF64}, incx::Int64)::Cvoid end function onemklZdscal(device_queue, n, alpha, x, incx) - @ccall liboneapi_support.onemklZdscal(device_queue::syclQueue_t, n::Int64, alpha::Cdouble, x::ZePtr{ComplexF64}, incx::Int64)::Cvoid + @ccall liboneapi_support.onemklZdscal(device_queue::syclQueue_t, n::Int64, + alpha::Cdouble, x::ZePtr{ComplexF64}, incx::Int64)::Cvoid end function onemklCscal(device_queue, n, alpha, x, incx) - @ccall liboneapi_support.onemklCscal(device_queue::syclQueue_t, n::Int64, alpha::ComplexF32, x::ZePtr{ComplexF32}, incx::Int64)::Cvoid + @ccall liboneapi_support.onemklCscal(device_queue::syclQueue_t, n::Int64, + alpha::ComplexF32, x::ZePtr{ComplexF32}, incx::Int64)::Cvoid end function onemklCsscal(device_queue, n, alpha, x, incx) - @ccall liboneapi_support.onemklCsscal(device_queue::syclQueue_t, n::Int64, alpha::Cfloat, x::ZePtr{ComplexF32}, incx::Int64)::Cvoid + @ccall liboneapi_support.onemklCsscal(device_queue::syclQueue_t, n::Int64, + alpha::Cfloat, x::ZePtr{ComplexF32}, incx::Int64)::Cvoid end function onemklDcopy(device_queue, n, x, incx, y, incy) From 071f69e126e598fb8d300fcdf714f7f2210b7caa Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 13:36:54 +0530 Subject: [PATCH 22/28] NITS --- lib/mkl/wrappers.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/mkl/wrappers.jl b/lib/mkl/wrappers.jl index 2ad8a29a..a778b9da 100644 --- a/lib/mkl/wrappers.jl +++ b/lib/mkl/wrappers.jl @@ -17,12 +17,12 @@ end # level 1 ## scal for (fname, elty) in - ((:onemklDscal,:Float64), - (:onemklSscal,:Float32), - (:onemklZscal,:ComplexF64), - (:onemklCscal,:ComplexF32)) + ((:onemklDscal,:Float64), + (:onemklSscal,:Float32), + (:onemklZscal,:ComplexF64), + (:onemklCscal,:ComplexF32)) @eval begin - function scal!(n::Integer, + function scal!(n::Integer, alpha::$elty, x::oneStridedArray{$elty}) queue = global_queue(context(x), device(x)) From 7be9df6df3ffbe66f682650d9c68ab747e163dc3 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 13:54:52 +0530 Subject: [PATCH 23/28] NITS --- test/onemkl.jl | 51 +++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 795feb2f..8a41f28a 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -10,31 +10,36 @@ k = 13 ############################################################################################ @testset "level 1" begin @testset for T in intersect(eltypes, [Float32, Float64, ComplexF32, ComplexF64]) - A = oneArray(rand(T, m)) - B = oneArray{T}(undef, m) - oneMKL.copy!(m,A,B) - @test Array(A) == Array(B) + @testset "copy" begin + A = oneArray(rand(T, m)) + B = oneArray{T}(undef, m) + oneMKL.copy!(m,A,B) + @test Array(A) == Array(B) + end + + @testset "scal" begin + # Test scal primitive [alpha/x: F32, F64, CF32, CF64] + alpha = rand(T,1) + @test testf(rmul!, rand(T,m), alpha[1]) - # Test scal primitive [alpha/x: F32, F64, CF32, CF64] - alpha = rand(T,1) - @test testf(rmul!, rand(T,m), alpha[1]) + # Test scal primitive [alpha - F32, F64, x - CF32, CF64] + A = rand(T,m) + gpuA = oneArray(A) + if T === ComplexF32 + alphaf32 = rand(Float32, 1) + B = A .* alphaf32[1] + oneMKL.scal!(m, alphaf32[1], gpuA) + @test Array(B) == Array(gpuA) + end - # Test scal primitive [alpha - F32, F64, x - CF32, CF64] - A = rand(T,m) - gpuA = oneArray(A) - if T === ComplexF32 - alphaf32 = rand(Float32, 1) - B = A .* alphaf32[1] - oneMKL.scal!(m, alphaf32[1], gpuA) - @test Array(B) == Array(gpuA) - end - - if T === ComplexF64 - alphaf64 = rand(Float64, 1) - B = A .* alphaF64[1] - oneMKL.scal!(m, alphaf64[1], gpuA) - @test Array(B) == Array(gpuA) - end + if T === ComplexF64 + alphaf64 = rand(Float64, 1) + B = A .* alphaF64[1] + oneMKL.scal!(m, alphaf64[1], gpuA) + @test Array(B) == Array(gpuA) + end + + end end # level 1 testset end From 038252f515ea53377e239f9df4fe36cbab4835e7 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 13:58:27 +0530 Subject: [PATCH 24/28] NITS --- test/onemkl.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 8a41f28a..954785b3 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -19,25 +19,25 @@ k = 13 @testset "scal" begin # Test scal primitive [alpha/x: F32, F64, CF32, CF64] - alpha = rand(T,1) - @test testf(rmul!, rand(T,m), alpha[1]) + alpha = rand(T,1) + @test testf(rmul!, rand(T,m), alpha[1]) - # Test scal primitive [alpha - F32, F64, x - CF32, CF64] - A = rand(T,m) - gpuA = oneArray(A) - if T === ComplexF32 - alphaf32 = rand(Float32, 1) - B = A .* alphaf32[1] - oneMKL.scal!(m, alphaf32[1], gpuA) - @test Array(B) == Array(gpuA) - end + # Test scal primitive [alpha - F32, F64, x - CF32, CF64] + A = rand(T,m) + gpuA = oneArray(A) + if T === ComplexF32 + alphaf32 = rand(Float32, 1) + B = A .* alphaf32[1] + oneMKL.scal!(m, alphaf32[1], gpuA) + @test Array(B) == Array(gpuA) + end - if T === ComplexF64 - alphaf64 = rand(Float64, 1) - B = A .* alphaF64[1] - oneMKL.scal!(m, alphaf64[1], gpuA) - @test Array(B) == Array(gpuA) - end + if T === ComplexF64 + alphaf64 = rand(Float64, 1) + B = A .* alphaF64[1] + oneMKL.scal!(m, alphaf64[1], gpuA) + @test Array(B) == Array(gpuA) + end end From d07339ea3a62a51a40457d9f307e691c70ba8c17 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Tue, 8 Nov 2022 14:42:37 +0530 Subject: [PATCH 25/28] Cleanup of tests --- test/onemkl.jl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index 954785b3..affb50e9 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -27,19 +27,15 @@ k = 13 gpuA = oneArray(A) if T === ComplexF32 alphaf32 = rand(Float32, 1) - B = A .* alphaf32[1] oneMKL.scal!(m, alphaf32[1], gpuA) - @test Array(B) == Array(gpuA) + @test isapprox(A .* alphaf32[1], Array(gpuA)) end if T === ComplexF64 alphaf64 = rand(Float64, 1) - B = A .* alphaF64[1] oneMKL.scal!(m, alphaf64[1], gpuA) - @test Array(B) == Array(gpuA) - end - + @test isapprox(A .* alphaf64[1], Array(gpuA)) + end end - end # level 1 testset end From cbbd3a4a8fff3ce2e0e7c774b5c68f00d2521ca2 Mon Sep 17 00:00:00 2001 From: Kali Uday Date: Wed, 9 Nov 2022 06:44:29 +0530 Subject: [PATCH 26/28] Instead of isapprox use compare op --- test/onemkl.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/onemkl.jl b/test/onemkl.jl index affb50e9..a158b811 100644 --- a/test/onemkl.jl +++ b/test/onemkl.jl @@ -28,13 +28,13 @@ k = 13 if T === ComplexF32 alphaf32 = rand(Float32, 1) oneMKL.scal!(m, alphaf32[1], gpuA) - @test isapprox(A .* alphaf32[1], Array(gpuA)) + @test Array(A .* alphaf32[1]) ≈ Array(gpuA) end if T === ComplexF64 alphaf64 = rand(Float64, 1) oneMKL.scal!(m, alphaf64[1], gpuA) - @test isapprox(A .* alphaf64[1], Array(gpuA)) + @test Array(A .* alphaf64[1]) ≈ Array(gpuA) end end end # level 1 testset From af68c99c799392e9d9db1b8cbaef97a7cd991bf6 Mon Sep 17 00:00:00 2001 From: Kali Uday Balleda Date: Fri, 11 Nov 2022 00:18:18 +0530 Subject: [PATCH 27/28] Bug fix: disable f16 check as it is not supported (CI crash) --- deps/src/onemkl.cpp | 25 +++++++++++++++++-------- lib/mkl/oneMKL.jl | 2 +- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/deps/src/onemkl.cpp b/deps/src/onemkl.cpp index e02e93a7..864f2127 100644 --- a/deps/src/onemkl.cpp +++ b/deps/src/onemkl.cpp @@ -84,42 +84,51 @@ extern "C" int onemklZgemm(syclQueue_t device_queue, onemklTranspose transA, // Support Level-1: SCAL primitive extern "C" void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); + auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, + x, incx); + status.wait(); + } extern "C" void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); + auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, + x, incx); + status.wait(); } extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, float _Complex alpha, float _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, - static_cast >(alpha), + auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, + static_cast >(alpha), reinterpret_cast *>(x),incx); + status.wait(); } extern "C" void onemklCsscal(syclQueue_t device_queue, int64_t n, float alpha, float _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, + auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, reinterpret_cast *>(x),incx); + status.wait(); } extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, double _Complex alpha, double _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, - static_cast >(alpha), + auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, + static_cast >(alpha), reinterpret_cast *>(x),incx); + status.wait(); } extern "C" void onemklZdscal(syclQueue_t device_queue, int64_t n, double alpha, double _Complex *x, int64_t incx) { - oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, + auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, reinterpret_cast *>(x),incx); + status.wait(); } extern "C" void onemklDcopy(syclQueue_t device_queue, int64_t n, const double *x, diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl index b7cc45c9..7b0b24b2 100644 --- a/lib/mkl/oneMKL.jl +++ b/lib/mkl/oneMKL.jl @@ -13,7 +13,7 @@ using GPUArrays include("libonemkl.jl") # Exclude Float16 for now, since many oneMKL functions - copy, scal, do not take Float16 -const onemklFloat = Union{Float64,Float32,Float16,ComplexF64,ComplexF32} +const onemklFloat = Union{Float64,Float32,ComplexF64,ComplexF32} include("wrappers.jl") include("linalg.jl") From 578c3006188d6e1fa199a6180e0b439f822745bd Mon Sep 17 00:00:00 2001 From: kballeda Date: Tue, 22 Nov 2022 11:57:50 +0530 Subject: [PATCH 28/28] use force flush instead of wait() --- deps/src/onemkl.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deps/src/onemkl.cpp b/deps/src/onemkl.cpp index 8e88dd5b..a26ed8f5 100644 --- a/deps/src/onemkl.cpp +++ b/deps/src/onemkl.cpp @@ -92,7 +92,7 @@ extern "C" void onemklDscal(syclQueue_t device_queue, int64_t n, double alpha, double *x, int64_t incx) { auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); - status.wait(); + __FORCE_MKL_FLUSH__(status); } @@ -100,7 +100,7 @@ extern "C" void onemklSscal(syclQueue_t device_queue, int64_t n, float alpha, float *x, int64_t incx) { auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, x, incx); - status.wait(); + __FORCE_MKL_FLUSH__(status); } extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, @@ -109,7 +109,7 @@ extern "C" void onemklCscal(syclQueue_t device_queue, int64_t n, auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, static_cast >(alpha), reinterpret_cast *>(x),incx); - status.wait(); + __FORCE_MKL_FLUSH__(status); } extern "C" void onemklCsscal(syclQueue_t device_queue, int64_t n, @@ -117,7 +117,7 @@ extern "C" void onemklCsscal(syclQueue_t device_queue, int64_t n, int64_t incx) { auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, reinterpret_cast *>(x),incx); - status.wait(); + __FORCE_MKL_FLUSH__(status); } extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, @@ -126,7 +126,7 @@ extern "C" void onemklZscal(syclQueue_t device_queue, int64_t n, auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, static_cast >(alpha), reinterpret_cast *>(x),incx); - status.wait(); + __FORCE_MKL_FLUSH__(status); } extern "C" void onemklZdscal(syclQueue_t device_queue, int64_t n, @@ -134,7 +134,7 @@ extern "C" void onemklZdscal(syclQueue_t device_queue, int64_t n, int64_t incx) { auto status = oneapi::mkl::blas::column_major::scal(device_queue->val, n, alpha, reinterpret_cast *>(x),incx); - status.wait(); + __FORCE_MKL_FLUSH__(status); } extern "C" void onemklDnrm2(syclQueue_t device_queue, int64_t n, const double *x,