From 25bacad8a2a6c269641c824d528e2a4e2a681b33 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 20 May 2026 15:03:45 +0200 Subject: [PATCH 01/20] offload frontend draft impl --- src/base/frontend.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 src/base/frontend.rs diff --git a/src/base/frontend.rs b/src/base/frontend.rs new file mode 100644 index 0000000..17f1bfe --- /dev/null +++ b/src/base/frontend.rs @@ -0,0 +1,87 @@ +#![allow(internal_features)] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload)] +#![cfg_attr(target_os = "linux", feature(core_intrinsics))] +#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))] +#![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] +#![no_std] + +#[cfg(target_os = "linux")] +extern crate libc; + +use core::offload::offload_kernel; + +#[cfg(target_arch = "nvptx64")] +use core::arch::nvptx::{ + _block_dim_x as block_dim_x, _block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x, +}; + +#[panic_handler] +fn panic(_: &core::panic::PanicInfo) -> ! 
{ + loop {} +} + +// library +struct Linear1D; + +pub trait PartitioningStrategy { + fn get_mut<'a, T>(data: &'a mut [T]) -> Option<&'a mut T>; +} + +impl PartitioningStrategy for Linear1D { + fn get_mut<'a, T>(data: &'a mut [T]) -> Option<&'a mut T> { + #[cfg(target_arch = "nvptx64")] + let i = unsafe { (block_idx_x() * block_dim_x() + thread_idx_x()) as usize }; + #[cfg(target_os = "linux")] + let i = 0; + if i < data.len() { + Some(&mut data[i]) + } else { + None + } + } +} + +struct Region<'a, T, S> { + data: &'a mut [T], + _marker: core::marker::PhantomData, +} + +impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { + pub fn new(data: &'a mut [T]) -> Self { + Self { + data, + _marker: core::marker::PhantomData, + } + } + + pub fn get(&self, index: usize) -> Option<&T> { + self.data.get(index) + } + + pub fn get_mut(&mut self) -> Option<&mut T> { + S::get_mut(self.data) + } +} + +// source code +#[offload_kernel] +fn foo(x: &mut Region) { + if let Some(e) = x.get_mut() { + *e = 42.0 as f64; + } +} + +#[cfg(target_os = "linux")] +#[unsafe(no_mangle)] +fn main() { + let mut x = [0.0f64; 256]; + let mut reg = Region::<_, Linear1D>::new(&mut x); + core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [256, 1, 1], (&mut reg,)); + for i in 0..x.len() { + assert_eq!(x[i], 42.0 as f64); + } +} From 98cbdfb5f4fb04feff294974e7e6ed69f8065355 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Sat, 23 May 2026 13:14:39 +0200 Subject: [PATCH 02/20] add more indexing patterns and modify design --- src/base/frontend.rs | 260 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 229 insertions(+), 31 deletions(-) diff --git a/src/base/frontend.rs b/src/base/frontend.rs index 17f1bfe..ef6bba8 100644 --- a/src/base/frontend.rs +++ b/src/base/frontend.rs @@ -14,74 +14,272 @@ extern crate libc; use core::offload::offload_kernel; -#[cfg(target_arch = "nvptx64")] -use core::arch::nvptx::{ - _block_dim_x as block_dim_x, _block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x, -}; 
- #[panic_handler] fn panic(_: &core::panic::PanicInfo) -> ! { loop {} } -// library -struct Linear1D; +/* +* library +*/ + +// index helpers for mental sanity xd +#[derive(Clone, Copy)] +pub struct Dim3 { + pub x: usize, + pub y: usize, + pub z: usize, +} + +fn global_thread_dim() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: (_block_idx_x() * _block_dim_x() + _thread_idx_x()) as usize, + y: (_block_idx_y() * _block_dim_y() + _thread_idx_y()) as usize, + z: (_block_idx_z() * _block_dim_z() + _thread_idx_z()) as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} pub trait PartitioningStrategy { - fn get_mut<'a, T>(data: &'a mut [T]) -> Option<&'a mut T>; + type Shape: Copy; + type View<'a, T: 'a>; + type ViewMut<'a, T: 'a>; + + unsafe fn get<'a, T>( + ptr: *const T, + len: usize, + shape: Self::Shape, + ) -> Option>; + unsafe fn get_mut<'a, T>( + ptr: *mut T, + len: usize, + shape: Self::Shape, + ) -> Option>; } +pub struct Region<'a, T, S: PartitioningStrategy> { + ptr: *mut T, + len: usize, + pub shape: S::Shape, + _marker: core::marker::PhantomData<&'a mut [T]>, +} + +impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { + pub fn new(data: &'a mut [T], shape: S::Shape) -> Self { + Self { + ptr: data.as_mut_ptr(), + len: data.len(), + shape, + _marker: core::marker::PhantomData, + } + } + + pub fn get(&self) -> Option> { + unsafe { S::get(self.ptr as *const T, self.len, self.shape) } + } + + pub fn get_mut(&mut self) -> Option> { + unsafe { S::get_mut(self.ptr, self.len, self.shape) } + } +} + +// linear1d +pub struct Linear1D; impl PartitioningStrategy for Linear1D { - fn get_mut<'a, T>(data: &'a mut [T]) -> Option<&'a mut T> { - #[cfg(target_arch = "nvptx64")] - let i = unsafe { (block_idx_x() * block_dim_x() + thread_idx_x()) as usize }; - #[cfg(target_os = "linux")] - let i = 0; - if i < data.len() { - Some(&mut data[i]) + type Shape = (); + type View<'a, T: 'a> = &'a T; + type 
ViewMut<'a, T: 'a> = &'a mut T; + + unsafe fn get<'a, T>(ptr: *const T, len: usize, _: Self::Shape) -> Option> { + let tid = global_thread_dim().x; + if tid < len { + Some(unsafe { &*ptr.add(tid) }) + } else { + None + } + } + unsafe fn get_mut<'a, T>( + ptr: *mut T, + len: usize, + _: Self::Shape, + ) -> Option> { + let tid = global_thread_dim().x; + if tid < len { + Some(unsafe { &mut *ptr.add(tid) }) } else { None } } } -struct Region<'a, T, S> { - data: &'a mut [T], - _marker: core::marker::PhantomData, +// stencil2d +pub struct StencilViewMut<'a, T> { + base_ptr: *mut T, + center_idx: usize, + cols: usize, + _marker: core::marker::PhantomData<&'a mut T>, } +impl<'a, T> StencilViewMut<'a, T> { + pub fn set_center(&mut self, val: T) { + unsafe { + *self.base_ptr.add(self.center_idx) = val; + } + } -impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { - pub fn new(data: &'a mut [T]) -> Self { - Self { - data, - _marker: core::marker::PhantomData, + pub fn get_neighbour(&self, ox: isize, oy: isize) -> &T { + unsafe { + &*self + .base_ptr + .offset((self.center_idx as isize) + (oy * self.cols as isize) + ox) } } +} + +pub struct Stencil2D; +impl PartitioningStrategy for Stencil2D { + type Shape = (usize, usize); + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = StencilViewMut<'a, T>; - pub fn get(&self, index: usize) -> Option<&T> { - self.data.get(index) + unsafe fn get<'a, T>(_: *const T, _: usize, _: Self::Shape) -> Option> { + unimplemented!() + } + unsafe fn get_mut<'a, T>( + ptr: *mut T, + len: usize, + shape: Self::Shape, + ) -> Option> { + let tid = global_thread_dim(); + let x = tid.x + R; + let y = tid.y + R; + if x < shape.0 - R && y < shape.1 - R { + let center_idx = y * shape.0 + x; + if center_idx < len { + Some(StencilViewMut { + base_ptr: ptr, + center_idx, + cols: shape.0, + _marker: core::marker::PhantomData, + }) + } else { + None + } + } else { + None + } } +} - pub fn get_mut(&mut self) -> Option<&mut T> { - S::get_mut(self.data) 
+// stride +pub struct StrideViewMut<'a, T> { + block_ptr: *mut T, + stride: usize, + width: usize, + height: usize, + _marker: core::marker::PhantomData<&'a mut T>, +} +impl<'a, T> StrideViewMut<'a, T> { + pub fn set(&mut self, x: usize, y: usize, val: T) { + if x < self.width && y < self.height { + unsafe { + *self.block_ptr.add(y * self.stride + x) = val; + } + } + } +} + +pub struct Stride2D; +impl PartitioningStrategy + for Stride2D +{ + type Shape = (usize, usize); + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = StrideViewMut<'a, T>; + + unsafe fn get<'a, T>(_: *const T, _: usize, _: Self::Shape) -> Option> { + unimplemented!() + } + unsafe fn get_mut<'a, T>( + ptr: *mut T, + _: usize, + shape: Self::Shape, + ) -> Option> { + let tid = global_thread_dim(); + let start_x = tid.x * SX; + let start_y = tid.y * SY; + if start_x + W <= shape.0 && start_y + H <= shape.1 { + Some(StrideViewMut { + block_ptr: unsafe { ptr.add(start_y * shape.0 + start_x) }, + stride: shape.0, + width: W, + height: H, + _marker: core::marker::PhantomData, + }) + } else { + None + } } } // source code #[offload_kernel] -fn foo(x: &mut Region) { +fn linear1d(x: &mut Region) { if let Some(e) = x.get_mut() { - *e = 42.0 as f64; + *e = 42.0; + } +} + +#[offload_kernel] +fn stencil2d(grid: &mut Region>) { + if let Some(mut view) = grid.get_mut() { + let mid = *view.get_neighbour(0, 0); + let left = *view.get_neighbour(-1, 0); + let right = *view.get_neighbour(1, 0); + view.set_center((left + mid + right) / 3.0); + } +} + +#[offload_kernel] +fn stride2d(grid: &mut Region>) { + if let Some(mut view) = grid.get_mut() { + view.set(0, 0, 42.0); + view.set(1, 1, 42.0); } } #[cfg(target_os = "linux")] #[unsafe(no_mangle)] fn main() { + // linear1d let mut x = [0.0f64; 256]; - let mut reg = Region::<_, Linear1D>::new(&mut x); - core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [256, 1, 1], (&mut reg,)); + let mut reg = Region::<_, Linear1D>::new(&mut x, ()); + 
core::intrinsics::offload::<_, _, ()>(linear1d, [1, 1, 1], [256, 1, 1], (&mut reg,)); for i in 0..x.len() { assert_eq!(x[i], 42.0 as f64); } + + // stencil2d + let mut grid = [ + 1.0, 1.0, 1.0, 1.0, // + 1.0, 4.0, 1.0, 1.0, // cargo fmt don't merge this lines + 1.0, 1.0, 1.0, 1.0, // + 1.0, 1.0, 1.0, 1.0, + ]; + let mut reg_stencil = Region::<_, Stencil2D<1>>::new(&mut grid, (4, 4)); + core::intrinsics::offload::<_, _, ()>(stencil2d, [1, 1, 1], [2, 2, 1], (&mut reg_stencil,)); + // thread (0, 0, 0) will have center on (x, y) = 1 (index = 5), so (1 + 4 + 1) / 3 = 20 + assert_eq!(grid[5], 2.0); + + // stride2d + let mut blocks = [0.0; 64]; + let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4>>::new(&mut blocks, (8, 8)); + core::intrinsics::offload::<_, _, ()>(stride2d, [1, 1, 1], [2, 2, 1], (&mut reg_stride,)); + // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements + assert_eq!(blocks[0], 42.0); + assert_eq!(blocks[9], 42.0); } From 154244d231ab06872569d8214a63eb2d96408836 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Tue, 26 May 2026 12:28:41 +0200 Subject: [PATCH 03/20] add offload! macro --- src/base/frontend.rs | 69 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/src/base/frontend.rs b/src/base/frontend.rs index ef6bba8..01cc236 100644 --- a/src/base/frontend.rs +++ b/src/base/frontend.rs @@ -19,6 +19,54 @@ fn panic(_: &core::panic::PanicInfo) -> ! { loop {} } +/* +* macro +*/ +#[macro_export] +macro_rules! offload { + ( $($field:ident = $val:expr),* $(,)? 
) => { + $crate::offload!(@munch + [ $($field = $val),* ]; + kernel = NONE; + grid_dim = ([1, 1, 1]); + block_dim = ([1, 1, 1]); + args = NONE + ); + }; + + (@munch [kernel = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = (SOME $val); grid_dim = $g; block_dim = $b; args = $a); + }; + (@munch [grid_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = ($val); block_dim = $b; args = $a); + }; + (@munch [block_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = ($val); args = $a); + }; + (@munch [args = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; args = (SOME $val)); + }; + + (@munch [$invalid:ident = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + compile_error!(concat!("unkown field ", stringify!($invalid))); + }; + + (@munch []; kernel = NONE; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + compile_error!("missing `kernel`"); + }; + (@munch []; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = NONE) => { + compile_error!("missing `args`"); + }; + (@munch []; kernel = (SOME $kernel:expr); grid_dim = ($grid_dim:expr); block_dim = ($block_dim:expr); args = (SOME $args:expr)) => { + core::intrinsics::offload::<_, _, ()>( + $kernel, + $grid_dim, + $block_dim, + $args, + ) + }; +} + /* * library */ @@ -258,7 +306,12 @@ fn main() { // linear1d let mut x = [0.0f64; 256]; let 
mut reg = Region::<_, Linear1D>::new(&mut x, ()); - core::intrinsics::offload::<_, _, ()>(linear1d, [1, 1, 1], [256, 1, 1], (&mut reg,)); + // core::intrinsics::offload::<_, _, ()>(linear1d, [1, 1, 1], [256, 1, 1], (&mut reg,)); + offload! { + kernel = linear1d, + grid_dim = [256, 1, 1], + args = (&mut reg,), + }; for i in 0..x.len() { assert_eq!(x[i], 42.0 as f64); } @@ -271,14 +324,24 @@ fn main() { 1.0, 1.0, 1.0, 1.0, ]; let mut reg_stencil = Region::<_, Stencil2D<1>>::new(&mut grid, (4, 4)); - core::intrinsics::offload::<_, _, ()>(stencil2d, [1, 1, 1], [2, 2, 1], (&mut reg_stencil,)); + // core::intrinsics::offload::<_, _, ()>(stencil2d, [1, 1, 1], [2, 2, 1], (&mut reg_stencil,)); + offload! { + kernel = stencil2d, + block_dim = [2, 2, 1], + args = (&mut reg_stencil,), + }; // thread (0, 0, 0) will have center on (x, y) = 1 (index = 5), so (1 + 4 + 1) / 3 = 20 assert_eq!(grid[5], 2.0); // stride2d let mut blocks = [0.0; 64]; let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4>>::new(&mut blocks, (8, 8)); - core::intrinsics::offload::<_, _, ()>(stride2d, [1, 1, 1], [2, 2, 1], (&mut reg_stride,)); + // core::intrinsics::offload::<_, _, ()>(stride2d, [1, 1, 1], [2, 2, 1], (&mut reg_stride,)); + offload! 
{ + kernel = stride2d, + block_dim = [2, 2, 1], + args = (&mut reg_stride,), + }; // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements assert_eq!(blocks[0], 42.0); assert_eq!(blocks[9], 42.0); From a70c194637cf743ac6fc728e8f60f9dc669ce643 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Tue, 26 May 2026 18:01:56 +0200 Subject: [PATCH 04/20] add offload frontend submodule --- .gitmodules | 4 ++++ crates/rustc_offload_frontend | 1 + 2 files changed, 5 insertions(+) create mode 100644 .gitmodules create mode 160000 crates/rustc_offload_frontend diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..d76dec3 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "crates/rustc_offload_frontend"] + path = crates/rustc_offload_frontend + url = https://github.com/sa4dus/rustc_offload_frontend + branch = main diff --git a/crates/rustc_offload_frontend b/crates/rustc_offload_frontend new file mode 160000 index 0000000..ec8119e --- /dev/null +++ b/crates/rustc_offload_frontend @@ -0,0 +1 @@ +Subproject commit ec8119ef3f8c1e984522f601558f987a25f105dd From 47bf051b1a08e5c219ecc9d4bfa20aeb1ea4705e Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Tue, 26 May 2026 18:07:07 +0200 Subject: [PATCH 05/20] use rustc_offload_frontend crate --- Cargo.lock | 8 ++ Cargo.toml | 1 + src/base/frontend.rs | 278 ++----------------------------------------- 3 files changed, 21 insertions(+), 266 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f5194d..a589b02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,14 @@ checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "rust_perf" version = "0.1.0" +dependencies = [ + "libc", + "rustc_offload_frontend", +] + +[[package]] +name = "rustc_offload_frontend" +version = "0.1.0" dependencies = [ "libc", ] diff --git a/Cargo.toml b/Cargo.toml index c1032a5..6074354 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ crate-type = ["cdylib"] [dependencies] libc = { 
version = "0.2.175", default-features = false } +rustc_offload_frontend = { path = "crates/rustc_offload_frontend" } [features] default = ["all", "f64"] diff --git a/src/base/frontend.rs b/src/base/frontend.rs index 01cc236..59861c1 100644 --- a/src/base/frontend.rs +++ b/src/base/frontend.rs @@ -3,278 +3,21 @@ #![allow(improper_ctypes)] #![allow(improper_gpu_kernel_arg)] #![allow(improper_ctypes_definitions)] + #![feature(gpu_offload)] + #![cfg_attr(target_os = "linux", feature(core_intrinsics))] -#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))] #![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] -#![no_std] + +#![cfg_attr(target_arch = "nvptx64", no_std)] +#![cfg_attr(target_arch = "nvptx64", no_main)] #[cfg(target_os = "linux")] extern crate libc; -use core::offload::offload_kernel; - -#[panic_handler] -fn panic(_: &core::panic::PanicInfo) -> ! { - loop {} -} - -/* -* macro -*/ -#[macro_export] -macro_rules! offload { - ( $($field:ident = $val:expr),* $(,)? ) => { - $crate::offload!(@munch - [ $($field = $val),* ]; - kernel = NONE; - grid_dim = ([1, 1, 1]); - block_dim = ([1, 1, 1]); - args = NONE - ); - }; - - (@munch [kernel = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = (SOME $val); grid_dim = $g; block_dim = $b; args = $a); - }; - (@munch [grid_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = ($val); block_dim = $b; args = $a); - }; - (@munch [block_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = ($val); args = $a); - }; - (@munch [args = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; 
kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; args = (SOME $val)); - }; - - (@munch [$invalid:ident = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - compile_error!(concat!("unkown field ", stringify!($invalid))); - }; - - (@munch []; kernel = NONE; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - compile_error!("missing `kernel`"); - }; - (@munch []; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = NONE) => { - compile_error!("missing `args`"); - }; - (@munch []; kernel = (SOME $kernel:expr); grid_dim = ($grid_dim:expr); block_dim = ($block_dim:expr); args = (SOME $args:expr)) => { - core::intrinsics::offload::<_, _, ()>( - $kernel, - $grid_dim, - $block_dim, - $args, - ) - }; -} - -/* -* library -*/ - -// index helpers for mental sanity xd -#[derive(Clone, Copy)] -pub struct Dim3 { - pub x: usize, - pub y: usize, - pub z: usize, -} - -fn global_thread_dim() -> Dim3 { - #[cfg(target_arch = "nvptx64")] - unsafe { - use core::arch::nvptx::*; - Dim3 { - x: (_block_idx_x() * _block_dim_x() + _thread_idx_x()) as usize, - y: (_block_idx_y() * _block_dim_y() + _thread_idx_y()) as usize, - z: (_block_idx_z() * _block_dim_z() + _thread_idx_z()) as usize, - } - } - #[cfg(target_os = "linux")] - Dim3 { x: 0, y: 0, z: 0 } -} - -pub trait PartitioningStrategy { - type Shape: Copy; - type View<'a, T: 'a>; - type ViewMut<'a, T: 'a>; - - unsafe fn get<'a, T>( - ptr: *const T, - len: usize, - shape: Self::Shape, - ) -> Option>; - unsafe fn get_mut<'a, T>( - ptr: *mut T, - len: usize, - shape: Self::Shape, - ) -> Option>; -} - -pub struct Region<'a, T, S: PartitioningStrategy> { - ptr: *mut T, - len: usize, - pub shape: S::Shape, - _marker: core::marker::PhantomData<&'a mut [T]>, -} - -impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { - pub fn new(data: &'a mut 
[T], shape: S::Shape) -> Self { - Self { - ptr: data.as_mut_ptr(), - len: data.len(), - shape, - _marker: core::marker::PhantomData, - } - } - - pub fn get(&self) -> Option> { - unsafe { S::get(self.ptr as *const T, self.len, self.shape) } - } - - pub fn get_mut(&mut self) -> Option> { - unsafe { S::get_mut(self.ptr, self.len, self.shape) } - } -} - -// linear1d -pub struct Linear1D; -impl PartitioningStrategy for Linear1D { - type Shape = (); - type View<'a, T: 'a> = &'a T; - type ViewMut<'a, T: 'a> = &'a mut T; - - unsafe fn get<'a, T>(ptr: *const T, len: usize, _: Self::Shape) -> Option> { - let tid = global_thread_dim().x; - if tid < len { - Some(unsafe { &*ptr.add(tid) }) - } else { - None - } - } - unsafe fn get_mut<'a, T>( - ptr: *mut T, - len: usize, - _: Self::Shape, - ) -> Option> { - let tid = global_thread_dim().x; - if tid < len { - Some(unsafe { &mut *ptr.add(tid) }) - } else { - None - } - } -} - -// stencil2d -pub struct StencilViewMut<'a, T> { - base_ptr: *mut T, - center_idx: usize, - cols: usize, - _marker: core::marker::PhantomData<&'a mut T>, -} -impl<'a, T> StencilViewMut<'a, T> { - pub fn set_center(&mut self, val: T) { - unsafe { - *self.base_ptr.add(self.center_idx) = val; - } - } - - pub fn get_neighbour(&self, ox: isize, oy: isize) -> &T { - unsafe { - &*self - .base_ptr - .offset((self.center_idx as isize) + (oy * self.cols as isize) + ox) - } - } -} - -pub struct Stencil2D; -impl PartitioningStrategy for Stencil2D { - type Shape = (usize, usize); - type View<'a, T: 'a> = &'a T; - type ViewMut<'a, T: 'a> = StencilViewMut<'a, T>; - - unsafe fn get<'a, T>(_: *const T, _: usize, _: Self::Shape) -> Option> { - unimplemented!() - } - unsafe fn get_mut<'a, T>( - ptr: *mut T, - len: usize, - shape: Self::Shape, - ) -> Option> { - let tid = global_thread_dim(); - let x = tid.x + R; - let y = tid.y + R; - if x < shape.0 - R && y < shape.1 - R { - let center_idx = y * shape.0 + x; - if center_idx < len { - Some(StencilViewMut { - base_ptr: ptr, - 
center_idx, - cols: shape.0, - _marker: core::marker::PhantomData, - }) - } else { - None - } - } else { - None - } - } -} - -// stride -pub struct StrideViewMut<'a, T> { - block_ptr: *mut T, - stride: usize, - width: usize, - height: usize, - _marker: core::marker::PhantomData<&'a mut T>, -} -impl<'a, T> StrideViewMut<'a, T> { - pub fn set(&mut self, x: usize, y: usize, val: T) { - if x < self.width && y < self.height { - unsafe { - *self.block_ptr.add(y * self.stride + x) = val; - } - } - } -} - -pub struct Stride2D; -impl PartitioningStrategy - for Stride2D -{ - type Shape = (usize, usize); - type View<'a, T: 'a> = &'a T; - type ViewMut<'a, T: 'a> = StrideViewMut<'a, T>; - - unsafe fn get<'a, T>(_: *const T, _: usize, _: Self::Shape) -> Option> { - unimplemented!() - } - unsafe fn get_mut<'a, T>( - ptr: *mut T, - _: usize, - shape: Self::Shape, - ) -> Option> { - let tid = global_thread_dim(); - let start_x = tid.x * SX; - let start_y = tid.y * SY; - if start_x + W <= shape.0 && start_y + H <= shape.1 { - Some(StrideViewMut { - block_ptr: unsafe { ptr.add(start_y * shape.0 + start_x) }, - stride: shape.0, - width: W, - height: H, - _marker: core::marker::PhantomData, - }) - } else { - None - } - } -} +use rustc_offload_frontend::{offload_kernel}; +use rustc_offload_frontend::partition::{Region, Linear1D, Stencil2D, Stride2D}; -// source code #[offload_kernel] fn linear1d(x: &mut Region) { if let Some(e) = x.get_mut() { @@ -301,8 +44,9 @@ fn stride2d(grid: &mut Region>) { } #[cfg(target_os = "linux")] -#[unsafe(no_mangle)] fn main() { + use rustc_offload_frontend::offload; + // linear1d let mut x = [0.0f64; 256]; let mut reg = Region::<_, Linear1D>::new(&mut x, ()); @@ -330,7 +74,7 @@ fn main() { block_dim = [2, 2, 1], args = (&mut reg_stencil,), }; - // thread (0, 0, 0) will have center on (x, y) = 1 (index = 5), so (1 + 4 + 1) / 3 = 20 + // thread (0, 0, 0) will have center on (x, y) = 1 (index = 5), so (1 + 4 + 1) / 3 = 2 assert_eq!(grid[5], 2.0); // stride2d 
@@ -345,4 +89,6 @@ fn main() { // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements assert_eq!(blocks[0], 42.0); assert_eq!(blocks[9], 42.0); + + println!("all checks passed!"); } From cc763c02712cbc8f762e339ed7ac646d9a3944f3 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 27 May 2026 23:53:00 +0200 Subject: [PATCH 06/20] fix stencil add linear2d and blur example --- crates/rustc_offload_frontend | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/rustc_offload_frontend b/crates/rustc_offload_frontend index ec8119e..32bc723 160000 --- a/crates/rustc_offload_frontend +++ b/crates/rustc_offload_frontend @@ -1 +1 @@ -Subproject commit ec8119ef3f8c1e984522f601558f987a25f105dd +Subproject commit 32bc723028de930a395c218bdf2065e5f5f8373c From 2361ad74f9bd20ce558ceec3907463e2ed0b7d67 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Thu, 28 May 2026 00:04:37 +0200 Subject: [PATCH 07/20] show rustc_offload_frontend submodule file tree on changes --- crates/rustc_offload_frontend | 1 - crates/rustc_offload_frontend/.gitignore | 1 + crates/rustc_offload_frontend/Cargo.lock | 16 ++ crates/rustc_offload_frontend/Cargo.toml | 29 +++ crates/rustc_offload_frontend/README.md | 1 + crates/rustc_offload_frontend/src/gpu.rs | 20 ++ crates/rustc_offload_frontend/src/lib.rs | 64 ++++++ crates/rustc_offload_frontend/src/main.rs | 99 ++++++++ .../rustc_offload_frontend/src/partition.rs | 212 ++++++++++++++++++ 9 files changed, 442 insertions(+), 1 deletion(-) delete mode 160000 crates/rustc_offload_frontend create mode 100644 crates/rustc_offload_frontend/.gitignore create mode 100644 crates/rustc_offload_frontend/Cargo.lock create mode 100644 crates/rustc_offload_frontend/Cargo.toml create mode 100644 crates/rustc_offload_frontend/README.md create mode 100644 crates/rustc_offload_frontend/src/gpu.rs create mode 100644 crates/rustc_offload_frontend/src/lib.rs create mode 100644 crates/rustc_offload_frontend/src/main.rs create mode 100644 
crates/rustc_offload_frontend/src/partition.rs diff --git a/crates/rustc_offload_frontend b/crates/rustc_offload_frontend deleted file mode 160000 index 32bc723..0000000 --- a/crates/rustc_offload_frontend +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 32bc723028de930a395c218bdf2065e5f5f8373c diff --git a/crates/rustc_offload_frontend/.gitignore b/crates/rustc_offload_frontend/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/crates/rustc_offload_frontend/.gitignore @@ -0,0 +1 @@ +/target diff --git a/crates/rustc_offload_frontend/Cargo.lock b/crates/rustc_offload_frontend/Cargo.lock new file mode 100644 index 0000000..c4532ca --- /dev/null +++ b/crates/rustc_offload_frontend/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "rustc_offload_frontend" +version = "0.1.0" +dependencies = [ + "libc", +] diff --git a/crates/rustc_offload_frontend/Cargo.toml b/crates/rustc_offload_frontend/Cargo.toml new file mode 100644 index 0000000..e1dd08f --- /dev/null +++ b/crates/rustc_offload_frontend/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "rustc_offload_frontend" +version = "0.1.0" +edition = "2024" +license = "MIT" +description = "rustc offload feature frontend draft" +repository = "https://github.com/sa4dus/rustc_offload_frontend" +readme = "README.md" + +[lib] +crate-type = ["rlib", "cdylib"] +path = "src/lib.rs" + +[[bin]] +name = "main" +path = "src/main.rs" + +[dependencies] +libc = { version = "0.2.175", default-features = false } + +[features] + +[profile.release] +lto = "fat" +panic = "abort" + +[profile.dev] +lto = "fat" +panic = "abort" diff --git a/crates/rustc_offload_frontend/README.md b/crates/rustc_offload_frontend/README.md new file 
mode 100644 index 0000000..d518681 --- /dev/null +++ b/crates/rustc_offload_frontend/README.md @@ -0,0 +1 @@ +# rustc_offload_frontend diff --git a/crates/rustc_offload_frontend/src/gpu.rs b/crates/rustc_offload_frontend/src/gpu.rs new file mode 100644 index 0000000..5bc5223 --- /dev/null +++ b/crates/rustc_offload_frontend/src/gpu.rs @@ -0,0 +1,20 @@ +#[derive(Clone, Copy)] +pub struct Dim3 { + pub x: usize, + pub y: usize, + pub z: usize, +} + +pub(crate) fn global_thread_dim() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: (_block_idx_x() * _block_dim_x() + _thread_idx_x()) as usize, + y: (_block_idx_y() * _block_dim_y() + _thread_idx_y()) as usize, + z: (_block_idx_z() * _block_dim_z() + _thread_idx_z()) as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} diff --git a/crates/rustc_offload_frontend/src/lib.rs b/crates/rustc_offload_frontend/src/lib.rs new file mode 100644 index 0000000..1c116dd --- /dev/null +++ b/crates/rustc_offload_frontend/src/lib.rs @@ -0,0 +1,64 @@ +#![allow(internal_features)] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload)] +#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))] +#![cfg_attr(target_arch = "nvptx64", no_std)] + +pub use core::offload::offload_kernel; + +pub mod gpu; +pub mod partition; + +#[macro_export] +macro_rules! offload { + ( $($field:ident = $val:expr),* $(,)? 
) => { + $crate::offload!(@munch + [ $($field = $val),* ]; + kernel = NONE; + grid_dim = ([1, 1, 1]); + block_dim = ([1, 1, 1]); + args = NONE + ); + }; + + (@munch [kernel = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = (SOME $val); grid_dim = $g; block_dim = $b; args = $a); + }; + (@munch [grid_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = ($val); block_dim = $b; args = $a); + }; + (@munch [block_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = ($val); args = $a); + }; + (@munch [args = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; args = (SOME $val)); + }; + + (@munch [$invalid:ident = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + compile_error!(concat!("unkown field ", stringify!($invalid))); + }; + + (@munch []; kernel = NONE; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + compile_error!("missing `kernel`"); + }; + (@munch []; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = NONE) => { + compile_error!("missing `args`"); + }; + (@munch []; kernel = (SOME $kernel:expr); grid_dim = ($grid_dim:expr); block_dim = ($block_dim:expr); args = (SOME $args:expr)) => { + core::intrinsics::offload::<_, _, ()>( + $kernel, + $grid_dim, + $block_dim, + $args, + ) + }; +} + +#[cfg(target_arch = "nvptx64")] +#[panic_handler] +fn panic(_: &core::panic::PanicInfo) -> ! 
{ + loop {} +} diff --git a/crates/rustc_offload_frontend/src/main.rs b/crates/rustc_offload_frontend/src/main.rs new file mode 100644 index 0000000..a2535cc --- /dev/null +++ b/crates/rustc_offload_frontend/src/main.rs @@ -0,0 +1,99 @@ +#![allow(internal_features)] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload)] +#![cfg_attr(target_os = "linux", feature(core_intrinsics))] +#![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] +#![cfg_attr(target_arch = "nvptx64", no_std)] +#![cfg_attr(target_arch = "nvptx64", no_main)] + +#[cfg(target_os = "linux")] +extern crate libc; + +use rustc_offload_frontend::offload_kernel; +use rustc_offload_frontend::partition::{Linear1D, Linear2D, Region, Stencil2D, Stride2D}; + +#[offload_kernel] +fn linear1d(x: &mut Region) { + if let Some(e) = x.get_mut() { + *e = 42.0; + } +} + +#[offload_kernel] +fn stride2d(grid: &mut Region>) { + if let Some(mut view) = grid.get_mut() { + view.set(0, 0, 42.0); + view.set(1, 1, 42.0); + } +} + +#[offload_kernel] +fn conv_blur2d(input: &Region>, output: &mut Region) { + if let (Some(in_view), Some(out_cell)) = (input.get(), output.get_mut()) { + let mut sum = 0.0; + + for dy in -1..=1 { + for dx in -1..=1 { + sum += in_view.get_neighbour(dx, dy); + } + } + + *out_cell = sum / 9.0; + } +} + +#[cfg(target_os = "linux")] +fn main() { + use rustc_offload_frontend::offload; + + // linear1d + let mut x = [0.0f64; 256]; + let mut reg = Region::<_, Linear1D>::new(&mut x, ()); + // core::intrinsics::offload::<_, _, ()>(linear1d, [1, 1, 1], [256, 1, 1], (&mut reg,)); + offload! 
{ + kernel = linear1d, + grid_dim = [256, 1, 1], + args = (&mut reg,), + }; + for i in 0..x.len() { + assert_eq!(x[i], 42.0 as f64); + } + + // stride2d + let mut blocks = [0.0; 64]; + let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4>>::new(&mut blocks, (8, 8)); + // core::intrinsics::offload::<_, _, ()>(stride2d, [1, 1, 1], [2, 2, 1], (&mut reg_stride,)); + offload! { + kernel = stride2d, + block_dim = [2, 2, 1], + args = (&mut reg_stride,), + }; + // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements + assert_eq!(blocks[0], 42.0); + assert_eq!(blocks[9], 42.0); + + // conv_blur2d + let mut input_data = [ + 0.0, 0.0, 0.0, 0.0, // + 0.0, 9.0, 9.0, 0.0, // + 0.0, 9.0, 9.0, 0.0, // + 0.0, 0.0, 0.0, 0.0, // + ]; + let mut output_data = [0.0f64; 16]; + + let reg_input = Region::<_, Stencil2D<1>>::new(&mut input_data, (4, 4)); + let mut reg_output = Region::<_, Linear2D>::new(&mut output_data, (4, 4)); + + offload! { + kernel = conv_blur2d, + block_dim = [4, 4, 1], + args = (&reg_input, &mut reg_output,), + }; + + println!("{:#?}", output_data); + + println!("all checks passed!"); +} diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs new file mode 100644 index 0000000..55deee2 --- /dev/null +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -0,0 +1,212 @@ +use crate::gpu::global_thread_dim; +use core::prelude::v1::*; + +pub unsafe trait PartitioningStrategy { + type Shape: Copy; + type View<'a, T: 'a>; + type ViewMut<'a, T: 'a>; + + unsafe fn get<'a, T>( + ptr: *const T, + len: usize, + shape: Self::Shape, + ) -> Option>; + unsafe fn get_mut<'a, T>( + ptr: *mut T, + len: usize, + shape: Self::Shape, + ) -> Option>; +} + +pub struct Region<'a, T, S: PartitioningStrategy> { + ptr: *mut T, + len: usize, + pub shape: S::Shape, + _marker: core::marker::PhantomData<&'a mut [T]>, +} + +impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { + pub fn new(data: &'a mut [T], shape: S::Shape) -> 
Self { + Self { + ptr: data.as_mut_ptr(), + len: data.len(), + shape, + _marker: core::marker::PhantomData, + } + } + + pub fn get(&self) -> Option> { + unsafe { S::get(self.ptr as *const T, self.len, self.shape) } + } + + pub fn get_mut(&mut self) -> Option> { + unsafe { S::get_mut(self.ptr, self.len, self.shape) } + } +} + +// linear1d +pub struct Linear1D; +unsafe impl PartitioningStrategy for Linear1D { + type Shape = (); + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + unsafe fn get<'a, T>(ptr: *const T, len: usize, _: Self::Shape) -> Option> { + let tid = global_thread_dim().x; + if tid < len { + Some(unsafe { &*ptr.add(tid) }) + } else { + None + } + } + unsafe fn get_mut<'a, T>( + ptr: *mut T, + len: usize, + _: Self::Shape, + ) -> Option> { + let tid = global_thread_dim().x; + if tid < len { + Some(unsafe { &mut *ptr.add(tid) }) + } else { + None + } + } +} + +// linear2d +pub struct Linear2D; +unsafe impl PartitioningStrategy for Linear2D { + type Shape = (usize, usize); + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + unsafe fn get<'a, T>( + ptr: *const T, + len: usize, + shape: Self::Shape, + ) -> Option> { + let tid = global_thread_dim(); + let idx = tid.y * shape.0 + tid.x; + if idx < len { + Some(unsafe { &*ptr.add(idx) }) + } else { + None + } + } + unsafe fn get_mut<'a, T>( + ptr: *mut T, + len: usize, + shape: Self::Shape, + ) -> Option> { + let tid = global_thread_dim(); + let idx = tid.y * shape.0 + tid.x; + if idx < len { + Some(unsafe { &mut *ptr.add(idx) }) + } else { + None + } + } +} + +// stencil2d +pub struct Stencil2D; + +pub struct StencilView<'a, T> { + base_ptr: *const T, + center_idx: usize, + cols: usize, + _marker: core::marker::PhantomData<&'a T>, +} + +impl<'a, T> StencilView<'a, T> { + pub fn get_neighbour(&self, ox: isize, oy: isize) -> &T { + unsafe { + &*self + .base_ptr + .offset((self.center_idx as isize) + (oy * self.cols as isize) + ox) + } + } +} + +unsafe impl 
PartitioningStrategy for Stencil2D { + type Shape = (usize, usize); + type View<'a, T: 'a> = StencilView<'a, T>; + type ViewMut<'a, T: 'a> = core::marker::PhantomData<&'a mut T>; + + unsafe fn get<'a, T>( + ptr: *const T, + len: usize, + shape: Self::Shape, + ) -> Option> { + let (cols, _rows) = shape; + let tid = global_thread_dim(); + + let center_idx = tid.y * cols + tid.x; + + if center_idx < len { + Some(StencilView { + base_ptr: ptr, + center_idx, + cols, + _marker: core::marker::PhantomData, + }) + } else { + None + } + } + + unsafe fn get_mut<'a, T>(_: *mut T, _: usize, _: Self::Shape) -> Option> { + None + } +} + +// stride +pub struct StrideViewMut<'a, T> { + block_ptr: *mut T, + stride: usize, + width: usize, + height: usize, + _marker: core::marker::PhantomData<&'a mut T>, +} +impl<'a, T> StrideViewMut<'a, T> { + pub fn set(&mut self, x: usize, y: usize, val: T) { + if x < self.width && y < self.height { + unsafe { + *self.block_ptr.add(y * self.stride + x) = val; + } + } + } +} + +pub struct Stride2D; +unsafe impl PartitioningStrategy + for Stride2D +{ + type Shape = (usize, usize); + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = StrideViewMut<'a, T>; + + unsafe fn get<'a, T>(_: *const T, _: usize, _: Self::Shape) -> Option> { + unimplemented!() + } + unsafe fn get_mut<'a, T>( + ptr: *mut T, + _: usize, + shape: Self::Shape, + ) -> Option> { + let tid = global_thread_dim(); + let start_x = tid.x * SX; + let start_y = tid.y * SY; + if start_x + W <= shape.0 && start_y + H <= shape.1 { + Some(StrideViewMut { + block_ptr: unsafe { ptr.add(start_y * shape.0 + start_x) }, + stride: shape.0, + width: W, + height: H, + _marker: core::marker::PhantomData, + }) + } else { + None + } + } +} From 0d27058eb3110a0bc28e4cc5dcab6fc52b228f69 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Thu, 28 May 2026 11:57:03 +0200 Subject: [PATCH 08/20] strange bug, we should look at this at some point --- crates/rustc_offload_frontend/src/main.rs | 44 
++++++++++++++++--- .../rustc_offload_frontend/src/partition.rs | 16 ++++--- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/crates/rustc_offload_frontend/src/main.rs b/crates/rustc_offload_frontend/src/main.rs index a2535cc..994de3c 100644 --- a/crates/rustc_offload_frontend/src/main.rs +++ b/crates/rustc_offload_frontend/src/main.rs @@ -45,6 +45,13 @@ fn conv_blur2d(input: &Region>, output: &mut Region, y: &mut Region) { + if let (Some(val_x), Some(val_y)) = (x.get(), y.get_mut()) { + *val_y = alpha * (*val_x) + (*val_y); + } +} + #[cfg(target_os = "linux")] fn main() { use rustc_offload_frontend::offload; @@ -58,6 +65,8 @@ fn main() { grid_dim = [256, 1, 1], args = (&mut reg,), }; + println!("GPU bits: {:064b} value: {:?}", x[0].to_bits(), x[0]); + println!("CPU bits: {:064b} value: {:?}", 42.0f64.to_bits(), 42.0); for i in 0..x.len() { assert_eq!(x[i], 42.0 as f64); } @@ -76,24 +85,47 @@ fn main() { assert_eq!(blocks[9], 42.0); // conv_blur2d - let mut input_data = [ + let mut input = [ 0.0, 0.0, 0.0, 0.0, // 0.0, 9.0, 9.0, 0.0, // 0.0, 9.0, 9.0, 0.0, // 0.0, 0.0, 0.0, 0.0, // ]; - let mut output_data = [0.0f64; 16]; - - let reg_input = Region::<_, Stencil2D<1>>::new(&mut input_data, (4, 4)); - let mut reg_output = Region::<_, Linear2D>::new(&mut output_data, (4, 4)); + let mut output = [0.0f64; 16]; + let reg_input = Region::<_, Stencil2D<1>>::new(&mut input, (4, 4)); + let mut reg_output = Region::<_, Linear2D>::new(&mut output, (4, 4)); offload! 
{ kernel = conv_blur2d, block_dim = [4, 4, 1], args = (&reg_input, &mut reg_output,), }; - println!("{:#?}", output_data); + let expected = [ + 1.0, 2.0, 2.0, 1.0, // + 2.0, 4.0, 4.0, 2.0, // + 2.0, 4.0, 4.0, 2.0, // + 1.0, 2.0, 2.0, 1.0, // + ]; + assert_eq!(output, expected); + + // saxpy + const N: usize = 512; + let alpha: f32 = 2.5; + let x: [f32; N] = [2.0; N]; + let mut y: [f32; N] = [1.0; N]; + + let reg_x = Region::<_, Linear1D>::new(&x, ()); + let mut reg_y = Region::<_, Linear1D>::new(&mut y, ()); + + offload! { + kernel = saxpy_kernel, + grid_dim = [N as u32, 1, 1], args = (alpha, &reg_x, &mut reg_y,), + }; + + for i in 0..N { + assert_eq!(y[i], 6.0f32); + } println!("all checks passed!"); } diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs index 55deee2..6dfa20a 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -18,18 +18,22 @@ pub unsafe trait PartitioningStrategy { ) -> Option>; } -pub struct Region<'a, T, S: PartitioningStrategy> { +pub struct Region<'a, T, S: PartitioningStrategy, B: ?Sized = [T]> { ptr: *mut T, len: usize, pub shape: S::Shape, - _marker: core::marker::PhantomData<&'a mut [T]>, + _marker: core::marker::PhantomData<&'a B>, } -impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { - pub fn new(data: &'a mut [T], shape: S::Shape) -> Self +impl<'a, T, S: PartitioningStrategy, B: ?Sized> Region<'a, T, S, B> { + pub fn new(data: &'a B, shape: S::Shape) -> Self + where + B: AsRef<[T]>, + { + let data_ref = data.as_ref(); Self { - ptr: data.as_mut_ptr(), - len: data.len(), + ptr: data_ref.as_ptr() as *mut T, + len: data_ref.len(), shape, _marker: core::marker::PhantomData, } From 1b80abaf43f42d8d8eef0a9557ec2381eaae3e83 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Thu, 28 May 2026 12:44:51 +0200 Subject: [PATCH 09/20] allow mut and inmut slices and arrays --- crates/rustc_offload_frontend/src/main.rs | 9 ++- 
.../rustc_offload_frontend/src/partition.rs | 63 ++++++++++++++++--- 2 files changed, 59 insertions(+), 13 deletions(-) diff --git a/crates/rustc_offload_frontend/src/main.rs b/crates/rustc_offload_frontend/src/main.rs index 994de3c..260cc70 100644 --- a/crates/rustc_offload_frontend/src/main.rs +++ b/crates/rustc_offload_frontend/src/main.rs @@ -65,8 +65,6 @@ fn main() { grid_dim = [256, 1, 1], args = (&mut reg,), }; - println!("GPU bits: {:064b} value: {:?}", x[0].to_bits(), x[0]); - println!("CPU bits: {:064b} value: {:?}", 42.0f64.to_bits(), 42.0); for i in 0..x.len() { assert_eq!(x[i], 42.0 as f64); } @@ -85,7 +83,7 @@ fn main() { assert_eq!(blocks[9], 42.0); // conv_blur2d - let mut input = [ + let input = [ 0.0, 0.0, 0.0, 0.0, // 0.0, 9.0, 9.0, 0.0, // 0.0, 9.0, 9.0, 0.0, // @@ -93,7 +91,7 @@ fn main() { ]; let mut output = [0.0f64; 16]; - let reg_input = Region::<_, Stencil2D<1>>::new(&mut input, (4, 4)); + let reg_input = Region::<_, Stencil2D<1>>::new(&input, (4, 4)); let mut reg_output = Region::<_, Linear2D>::new(&mut output, (4, 4)); offload! { kernel = conv_blur2d, @@ -120,7 +118,8 @@ fn main() { offload! 
{ kernel = saxpy_kernel, - grid_dim = [N as u32, 1, 1], args = (alpha, &reg_x, &mut reg_y,), + grid_dim = [N as u32, 1, 1], + args = (alpha, &reg_x, &mut reg_y,), }; for i in 0..N { diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs index 6dfa20a..6cb10b9 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -1,4 +1,5 @@ use crate::gpu::global_thread_dim; +use core::convert::From; use core::prelude::v1::*; pub unsafe trait PartitioningStrategy { @@ -18,22 +19,68 @@ pub unsafe trait PartitioningStrategy { ) -> Option>; } -pub struct Region<'a, T, S: PartitioningStrategy, B: ?Sized = [T]> { +pub struct Region<'a, T, S: PartitioningStrategy> { ptr: *mut T, len: usize, pub shape: S::Shape, - _marker: core::marker::PhantomData<&'a B>, + _marker: core::marker::PhantomData<&'a mut [T]>, } -impl<'a, T, S: PartitioningStrategy, B: ?Sized> Region<'a, T, S, B> { - pub fn new(data: &'a B, shape: S::Shape) -> Self +pub struct RawRegion<'a, T> { + pub ptr: *mut T, + pub len: usize, + _marker: core::marker::PhantomData<&'a mut [T]>, +} + +impl<'a, T> From<&'a mut [T]> for RawRegion<'a, T> { + fn from(data: &'a mut [T]) -> Self { + Self { + ptr: data.as_mut_ptr(), + len: data.len(), + _marker: core::marker::PhantomData, + } + } +} + +impl<'a, T> From<&'a [T]> for RawRegion<'a, T> { + fn from(data: &'a [T]) -> Self { + Self { + ptr: data.as_ptr() as *mut T, + len: data.len(), + _marker: core::marker::PhantomData, + } + } +} + +impl<'a, T, const N: usize> From<&'a mut [T; N]> for RawRegion<'a, T> { + fn from(data: &'a mut [T; N]) -> Self { + Self { + ptr: data.as_mut_ptr(), + len: N, + _marker: core::marker::PhantomData, + } + } +} + +impl<'a, T, const N: usize> From<&'a [T; N]> for RawRegion<'a, T> { + fn from(data: &'a [T; N]) -> Self { + Self { + ptr: data.as_ptr() as *mut T, + len: N, + _marker: core::marker::PhantomData, + } + } +} + +impl<'a, T, S: 
PartitioningStrategy> Region<'a, T, S> { + pub fn new(data: D, shape: S::Shape) -> Self where - B: AsRef<[T]>, + D: Into>, { - let data_ref = data.as_ref(); + let raw = data.into(); Self { - ptr: data_ref.as_ptr() as *mut T, - len: data_ref.len(), + ptr: raw.ptr, + len: raw.len, shape, _marker: core::marker::PhantomData, } From 1998105bc74fd1f2c68232534d00aed007f08ea6 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Sun, 31 May 2026 21:57:27 +0200 Subject: [PATCH 10/20] fix ub and update to rl/r main --- crates/rustc_offload_frontend/src/lib.rs | 31 +++--- crates/rustc_offload_frontend/src/main.rs | 4 +- .../rustc_offload_frontend/src/partition.rs | 21 +++-- src/base/frontend.rs | 94 ------------------- 4 files changed, 36 insertions(+), 114 deletions(-) delete mode 100644 src/base/frontend.rs diff --git a/crates/rustc_offload_frontend/src/lib.rs b/crates/rustc_offload_frontend/src/lib.rs index 1c116dd..fbd03ba 100644 --- a/crates/rustc_offload_frontend/src/lib.rs +++ b/crates/rustc_offload_frontend/src/lib.rs @@ -20,38 +20,43 @@ macro_rules! 
offload { kernel = NONE; grid_dim = ([1, 1, 1]); block_dim = ([1, 1, 1]); + dyn_cache = (0); args = NONE ); }; - (@munch [kernel = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = (SOME $val); grid_dim = $g; block_dim = $b; args = $a); + (@munch [kernel = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = (SOME $val); grid_dim = $g; block_dim = $b; dyn_cache = $d; args = $a); }; - (@munch [grid_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = ($val); block_dim = $b; args = $a); + (@munch [grid_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = ($val); block_dim = $b; dyn_cache = $d; args = $a); }; - (@munch [block_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = ($val); args = $a); + (@munch [block_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = ($val); dyn_cache = $d; args = $a); }; - (@munch [args = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; args = (SOME 
$val)); + (@munch [dyn_cache = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; dyn_cache = ($val); args = $a); + }; + (@munch [args = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; dyn_cache = $d; args = (SOME $val)); }; - (@munch [$invalid:ident = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { - compile_error!(concat!("unkown field ", stringify!($invalid))); + (@munch [$invalid:ident = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + compile_error!(concat!("unknown field ", stringify!($invalid))); }; - (@munch []; kernel = NONE; grid_dim = $g:tt; block_dim = $b:tt; args = $a:tt) => { + (@munch []; kernel = NONE; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { compile_error!("missing `kernel`"); }; - (@munch []; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; args = NONE) => { + (@munch []; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = NONE) => { compile_error!("missing `args`"); }; - (@munch []; kernel = (SOME $kernel:expr); grid_dim = ($grid_dim:expr); block_dim = ($block_dim:expr); args = (SOME $args:expr)) => { + (@munch []; kernel = (SOME $kernel:expr); grid_dim = ($grid_dim:expr); block_dim = ($block_dim:expr); dyn_cache = ($dyn_cache:expr); args = (SOME $args:expr)) => { core::intrinsics::offload::<_, _, ()>( $kernel, $grid_dim, $block_dim, + $dyn_cache, $args, ) }; diff --git a/crates/rustc_offload_frontend/src/main.rs b/crates/rustc_offload_frontend/src/main.rs index 
260cc70..3c202cd 100644 --- a/crates/rustc_offload_frontend/src/main.rs +++ b/crates/rustc_offload_frontend/src/main.rs @@ -37,7 +37,9 @@ fn conv_blur2d(input: &Region>, output: &mut Region { base_ptr: *const T, center_idx: usize, cols: usize, + rows: usize, _marker: core::marker::PhantomData<&'a T>, } impl<'a, T> StencilView<'a, T> { - pub fn get_neighbour(&self, ox: isize, oy: isize) -> &T { - unsafe { - &*self - .base_ptr - .offset((self.center_idx as isize) + (oy * self.cols as isize) + ox) + pub fn get_neighbour(&self, ox: isize, oy: isize) -> Option<&T> { + let cx = (self.center_idx % self.cols) as isize; + let cy = (self.center_idx / self.cols) as isize; + + let nx = cx + ox; + let ny = cy + oy; + + if nx >= 0 && nx < self.cols as isize && ny >= 0 && ny < self.rows as isize { + let offset = ny * (self.cols as isize) + nx; + Some(unsafe { &*self.base_ptr.offset(offset) }) + } else { + None } } } @@ -189,7 +197,7 @@ unsafe impl PartitioningStrategy for Stencil2D { len: usize, shape: Self::Shape, ) -> Option> { - let (cols, _rows) = shape; + let (cols, rows) = shape; let tid = global_thread_dim(); let center_idx = tid.y * cols + tid.x; @@ -199,6 +207,7 @@ unsafe impl PartitioningStrategy for Stencil2D { base_ptr: ptr, center_idx, cols, + rows, _marker: core::marker::PhantomData, }) } else { diff --git a/src/base/frontend.rs b/src/base/frontend.rs deleted file mode 100644 index 59861c1..0000000 --- a/src/base/frontend.rs +++ /dev/null @@ -1,94 +0,0 @@ -#![allow(internal_features)] -#![allow(linker_messages)] -#![allow(improper_ctypes)] -#![allow(improper_gpu_kernel_arg)] -#![allow(improper_ctypes_definitions)] - -#![feature(gpu_offload)] - -#![cfg_attr(target_os = "linux", feature(core_intrinsics))] -#![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] - -#![cfg_attr(target_arch = "nvptx64", no_std)] -#![cfg_attr(target_arch = "nvptx64", no_main)] - -#[cfg(target_os = "linux")] -extern crate libc; - -use rustc_offload_frontend::{offload_kernel}; -use 
rustc_offload_frontend::partition::{Region, Linear1D, Stencil2D, Stride2D}; - -#[offload_kernel] -fn linear1d(x: &mut Region) { - if let Some(e) = x.get_mut() { - *e = 42.0; - } -} - -#[offload_kernel] -fn stencil2d(grid: &mut Region>) { - if let Some(mut view) = grid.get_mut() { - let mid = *view.get_neighbour(0, 0); - let left = *view.get_neighbour(-1, 0); - let right = *view.get_neighbour(1, 0); - view.set_center((left + mid + right) / 3.0); - } -} - -#[offload_kernel] -fn stride2d(grid: &mut Region>) { - if let Some(mut view) = grid.get_mut() { - view.set(0, 0, 42.0); - view.set(1, 1, 42.0); - } -} - -#[cfg(target_os = "linux")] -fn main() { - use rustc_offload_frontend::offload; - - // linear1d - let mut x = [0.0f64; 256]; - let mut reg = Region::<_, Linear1D>::new(&mut x, ()); - // core::intrinsics::offload::<_, _, ()>(linear1d, [1, 1, 1], [256, 1, 1], (&mut reg,)); - offload! { - kernel = linear1d, - grid_dim = [256, 1, 1], - args = (&mut reg,), - }; - for i in 0..x.len() { - assert_eq!(x[i], 42.0 as f64); - } - - // stencil2d - let mut grid = [ - 1.0, 1.0, 1.0, 1.0, // - 1.0, 4.0, 1.0, 1.0, // cargo fmt don't merge this lines - 1.0, 1.0, 1.0, 1.0, // - 1.0, 1.0, 1.0, 1.0, - ]; - let mut reg_stencil = Region::<_, Stencil2D<1>>::new(&mut grid, (4, 4)); - // core::intrinsics::offload::<_, _, ()>(stencil2d, [1, 1, 1], [2, 2, 1], (&mut reg_stencil,)); - offload! { - kernel = stencil2d, - block_dim = [2, 2, 1], - args = (&mut reg_stencil,), - }; - // thread (0, 0, 0) will have center on (x, y) = 1 (index = 5), so (1 + 4 + 1) / 3 = 2 - assert_eq!(grid[5], 2.0); - - // stride2d - let mut blocks = [0.0; 64]; - let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4>>::new(&mut blocks, (8, 8)); - // core::intrinsics::offload::<_, _, ()>(stride2d, [1, 1, 1], [2, 2, 1], (&mut reg_stride,)); - offload! 
{ - kernel = stride2d, - block_dim = [2, 2, 1], - args = (&mut reg_stride,), - }; - // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements - assert_eq!(blocks[0], 42.0); - assert_eq!(blocks[9], 42.0); - - println!("all checks passed!"); -} From da2456bddd9cf822511278b792211e77484f81bb Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Tue, 2 Jun 2026 13:37:21 +0200 Subject: [PATCH 11/20] remove from non-mut args --- crates/rustc_offload_frontend/src/main.rs | 36 +-- .../rustc_offload_frontend/src/partition.rs | 207 +++++------------- 2 files changed, 71 insertions(+), 172 deletions(-) diff --git a/crates/rustc_offload_frontend/src/main.rs b/crates/rustc_offload_frontend/src/main.rs index 3c202cd..f365b96 100644 --- a/crates/rustc_offload_frontend/src/main.rs +++ b/crates/rustc_offload_frontend/src/main.rs @@ -13,7 +13,10 @@ extern crate libc; use rustc_offload_frontend::offload_kernel; -use rustc_offload_frontend::partition::{Linear1D, Linear2D, Region, Stencil2D, Stride2D}; +use rustc_offload_frontend::partition::{Linear1D, Linear2D, Region, Stride2D}; + +#[cfg(target_arch = "nvptx64")] +use rustc_offload_frontend::partition::PartitioningStrategy; #[offload_kernel] fn linear1d(x: &mut Region) { @@ -23,7 +26,7 @@ fn linear1d(x: &mut Region) { } #[offload_kernel] -fn stride2d(grid: &mut Region>) { +fn stride2d(grid: &mut Region>) { if let Some(mut view) = grid.get_mut() { view.set(0, 0, 42.0); view.set(1, 1, 42.0); @@ -31,13 +34,14 @@ fn stride2d(grid: &mut Region>) { } #[offload_kernel] -fn conv_blur2d(input: &Region>, output: &mut Region) { - if let (Some(in_view), Some(out_cell)) = (input.get(), output.get_mut()) { +fn conv_blur2d(input: &[f64], output: &mut Region>) { + if let Some(out_cell) = output.get_mut() { let mut sum = 0.0; for dy in -1..=1 { for dx in -1..=1 { - if let Some(v) = in_view.get_neighbour(dx, dy) { + let idx = (Linear2D::<4>::index() as isize + dy * 4 as isize + dx) as usize; + if let Some(v) = input.get(idx) { sum += v; } } @@ 
-48,8 +52,8 @@ fn conv_blur2d(input: &Region>, output: &mut Region, y: &mut Region) { - if let (Some(val_x), Some(val_y)) = (x.get(), y.get_mut()) { +fn saxpy_kernel(alpha: f32, x: &[f32], y: &mut Region) { + if let (Some(val_x), Some(val_y)) = (x.get(Linear1D::index()), y.get_mut()) { *val_y = alpha * (*val_x) + (*val_y); } } @@ -60,7 +64,7 @@ fn main() { // linear1d let mut x = [0.0f64; 256]; - let mut reg = Region::<_, Linear1D>::new(&mut x, ()); + let mut reg = Region::<_, Linear1D>::new(&mut x); // core::intrinsics::offload::<_, _, ()>(linear1d, [1, 1, 1], [256, 1, 1], (&mut reg,)); offload! { kernel = linear1d, @@ -70,10 +74,11 @@ fn main() { for i in 0..x.len() { assert_eq!(x[i], 42.0 as f64); } + println!("::passed:: linear1d"); // stride2d let mut blocks = [0.0; 64]; - let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4>>::new(&mut blocks, (8, 8)); + let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4, 8>>::new(&mut blocks); // core::intrinsics::offload::<_, _, ()>(stride2d, [1, 1, 1], [2, 2, 1], (&mut reg_stride,)); offload! { kernel = stride2d, @@ -83,6 +88,7 @@ fn main() { // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements assert_eq!(blocks[0], 42.0); assert_eq!(blocks[9], 42.0); + println!("::passed:: stride2d"); // conv_blur2d let input = [ @@ -93,12 +99,11 @@ fn main() { ]; let mut output = [0.0f64; 16]; - let reg_input = Region::<_, Stencil2D<1>>::new(&input, (4, 4)); - let mut reg_output = Region::<_, Linear2D>::new(&mut output, (4, 4)); + let mut reg_output = Region::<_, Linear2D<4>>::new(&mut output); offload! 
{ kernel = conv_blur2d, block_dim = [4, 4, 1], - args = (&reg_input, &mut reg_output,), + args = (&input as &[f64], &mut reg_output,), }; let expected = [ @@ -108,6 +113,7 @@ fn main() { 1.0, 2.0, 2.0, 1.0, // ]; assert_eq!(output, expected); + println!("::passed:: conv_blur2d"); // saxpy const N: usize = 512; @@ -115,18 +121,18 @@ fn main() { let x: [f32; N] = [2.0; N]; let mut y: [f32; N] = [1.0; N]; - let reg_x = Region::<_, Linear1D>::new(&x, ()); - let mut reg_y = Region::<_, Linear1D>::new(&mut y, ()); + let mut reg_y = Region::<_, Linear1D>::new(&mut y); offload! { kernel = saxpy_kernel, grid_dim = [N as u32, 1, 1], - args = (alpha, &reg_x, &mut reg_y,), + args = (alpha, &x as &[f32], &mut reg_y,), }; for i in 0..N { assert_eq!(y[i], 6.0f32); } + println!("::passed:: saxpy"); println!("all checks passed!"); } diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs index 90966bf..303c84b 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -3,27 +3,18 @@ use core::convert::From; use core::prelude::v1::*; pub unsafe trait PartitioningStrategy { - type Shape: Copy; type View<'a, T: 'a>; type ViewMut<'a, T: 'a>; - unsafe fn get<'a, T>( - ptr: *const T, - len: usize, - shape: Self::Shape, - ) -> Option>; - unsafe fn get_mut<'a, T>( - ptr: *mut T, - len: usize, - shape: Self::Shape, - ) -> Option>; + fn index() -> usize; + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option>; + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option>; } pub struct Region<'a, T, S: PartitioningStrategy> { ptr: *mut T, len: usize, - pub shape: S::Shape, - _marker: core::marker::PhantomData<&'a mut [T]>, + _marker: core::marker::PhantomData<(&'a mut [T], S)>, } pub struct RawRegion<'a, T> { @@ -42,16 +33,6 @@ impl<'a, T> From<&'a mut [T]> for RawRegion<'a, T> { } } -impl<'a, T> From<&'a [T]> for RawRegion<'a, T> { - fn from(data: &'a [T]) -> Self { - Self { - ptr: 
data.as_ptr() as *mut T, - len: data.len(), - _marker: core::marker::PhantomData, - } - } -} - impl<'a, T, const N: usize> From<&'a mut [T; N]> for RawRegion<'a, T> { fn from(data: &'a mut [T; N]) -> Self { Self { @@ -62,18 +43,8 @@ impl<'a, T, const N: usize> From<&'a mut [T; N]> for RawRegion<'a, T> { } } -impl<'a, T, const N: usize> From<&'a [T; N]> for RawRegion<'a, T> { - fn from(data: &'a [T; N]) -> Self { - Self { - ptr: data.as_ptr() as *mut T, - len: N, - _marker: core::marker::PhantomData, - } - } -} - impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { - pub fn new(data: D, shape: S::Shape) -> Self + pub fn new(data: D) -> Self where D: Into>, { @@ -81,43 +52,40 @@ impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { Self { ptr: raw.ptr, len: raw.len, - shape, _marker: core::marker::PhantomData, } } pub fn get(&self) -> Option> { - unsafe { S::get(self.ptr as *const T, self.len, self.shape) } + unsafe { S::get(self.ptr as *const T, self.len) } } pub fn get_mut(&mut self) -> Option> { - unsafe { S::get_mut(self.ptr, self.len, self.shape) } + unsafe { S::get_mut(self.ptr, self.len) } } } // linear1d pub struct Linear1D; unsafe impl PartitioningStrategy for Linear1D { - type Shape = (); type View<'a, T: 'a> = &'a T; type ViewMut<'a, T: 'a> = &'a mut T; - unsafe fn get<'a, T>(ptr: *const T, len: usize, _: Self::Shape) -> Option> { - let tid = global_thread_dim().x; - if tid < len { - Some(unsafe { &*ptr.add(tid) }) + fn index() -> usize { + global_thread_dim().x + } + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &*ptr.add(idx) }) } else { None } } - unsafe fn get_mut<'a, T>( - ptr: *mut T, - len: usize, - _: Self::Shape, - ) -> Option> { - let tid = global_thread_dim().x; - if tid < len { - Some(unsafe { &mut *ptr.add(tid) }) + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &mut *ptr.add(idx) }) } else 
{ None } @@ -125,32 +93,25 @@ unsafe impl PartitioningStrategy for Linear1D { } // linear2d -pub struct Linear2D; -unsafe impl PartitioningStrategy for Linear2D { - type Shape = (usize, usize); +pub struct Linear2D; +unsafe impl PartitioningStrategy for Linear2D { type View<'a, T: 'a> = &'a T; type ViewMut<'a, T: 'a> = &'a mut T; - unsafe fn get<'a, T>( - ptr: *const T, - len: usize, - shape: Self::Shape, - ) -> Option> { + fn index() -> usize { let tid = global_thread_dim(); - let idx = tid.y * shape.0 + tid.x; + tid.y * W + tid.x + } + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let idx = Self::index(); if idx < len { Some(unsafe { &*ptr.add(idx) }) } else { None } } - unsafe fn get_mut<'a, T>( - ptr: *mut T, - len: usize, - shape: Self::Shape, - ) -> Option> { - let tid = global_thread_dim(); - let idx = tid.y * shape.0 + tid.x; + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); if idx < len { Some(unsafe { &mut *ptr.add(idx) }) } else { @@ -159,114 +120,46 @@ unsafe impl PartitioningStrategy for Linear2D { } } -// stencil2d -pub struct Stencil2D; - -pub struct StencilView<'a, T> { - base_ptr: *const T, - center_idx: usize, - cols: usize, - rows: usize, - _marker: core::marker::PhantomData<&'a T>, -} - -impl<'a, T> StencilView<'a, T> { - pub fn get_neighbour(&self, ox: isize, oy: isize) -> Option<&T> { - let cx = (self.center_idx % self.cols) as isize; - let cy = (self.center_idx / self.cols) as isize; - - let nx = cx + ox; - let ny = cy + oy; - - if nx >= 0 && nx < self.cols as isize && ny >= 0 && ny < self.rows as isize { - let offset = ny * (self.cols as isize) + nx; - Some(unsafe { &*self.base_ptr.offset(offset) }) - } else { - None - } - } -} - -unsafe impl PartitioningStrategy for Stencil2D { - type Shape = (usize, usize); - type View<'a, T: 'a> = StencilView<'a, T>; - type ViewMut<'a, T: 'a> = core::marker::PhantomData<&'a mut T>; - - unsafe fn get<'a, T>( - ptr: *const T, - len: usize, - shape: 
Self::Shape, - ) -> Option> { - let (cols, rows) = shape; - let tid = global_thread_dim(); - - let center_idx = tid.y * cols + tid.x; - - if center_idx < len { - Some(StencilView { - base_ptr: ptr, - center_idx, - cols, - rows, - _marker: core::marker::PhantomData, - }) - } else { - None - } - } - - unsafe fn get_mut<'a, T>(_: *mut T, _: usize, _: Self::Shape) -> Option> { - None - } -} - // stride pub struct StrideViewMut<'a, T> { block_ptr: *mut T, stride: usize, - width: usize, - height: usize, _marker: core::marker::PhantomData<&'a mut T>, } impl<'a, T> StrideViewMut<'a, T> { pub fn set(&mut self, x: usize, y: usize, val: T) { - if x < self.width && y < self.height { - unsafe { - *self.block_ptr.add(y * self.stride + x) = val; - } + unsafe { + *self.block_ptr.add(y * self.stride + x) = val; } } } -pub struct Stride2D; -unsafe impl PartitioningStrategy - for Stride2D +pub struct Stride2D< + const W: usize, + const H: usize, + const SX: usize, + const SY: usize, + const STRIDE: usize, +>; +unsafe impl + PartitioningStrategy for Stride2D { - type Shape = (usize, usize); type View<'a, T: 'a> = &'a T; type ViewMut<'a, T: 'a> = StrideViewMut<'a, T>; - unsafe fn get<'a, T>(_: *const T, _: usize, _: Self::Shape) -> Option> { + fn index() -> usize { + let tid = global_thread_dim(); + tid.y * SY * STRIDE + tid.x * SX + } + unsafe fn get<'a, T>(_: *const T, _: usize) -> Option> { unimplemented!() } - unsafe fn get_mut<'a, T>( - ptr: *mut T, - _: usize, - shape: Self::Shape, - ) -> Option> { - let tid = global_thread_dim(); - let start_x = tid.x * SX; - let start_y = tid.y * SY; - if start_x + W <= shape.0 && start_y + H <= shape.1 { - Some(StrideViewMut { - block_ptr: unsafe { ptr.add(start_y * shape.0 + start_x) }, - stride: shape.0, - width: W, - height: H, - _marker: core::marker::PhantomData, - }) - } else { - None - } + unsafe fn get_mut<'a, T>(ptr: *mut T, _: usize) -> Option> { + let idx = Self::index(); + Some(StrideViewMut { + block_ptr: unsafe { ptr.add(idx) }, 
+ stride: STRIDE, + _marker: core::marker::PhantomData, + }) } } From 30fad325a697bc30492544a0c9e278ab9b44e8bf Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 00:04:55 +0200 Subject: [PATCH 12/20] move memory with preload and remove double indirection --- crates/rustc_offload_frontend/src/lib.rs | 2 +- crates/rustc_offload_frontend/src/main.rs | 41 +++++++++++-------- .../rustc_offload_frontend/src/partition.rs | 14 +++++++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/crates/rustc_offload_frontend/src/lib.rs b/crates/rustc_offload_frontend/src/lib.rs index fbd03ba..a422fc0 100644 --- a/crates/rustc_offload_frontend/src/lib.rs +++ b/crates/rustc_offload_frontend/src/lib.rs @@ -3,7 +3,7 @@ #![allow(improper_ctypes)] #![allow(improper_gpu_kernel_arg)] #![allow(improper_ctypes_definitions)] -#![feature(gpu_offload)] +#![feature(gpu_offload, offload)] #![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))] #![cfg_attr(target_arch = "nvptx64", no_std)] diff --git a/crates/rustc_offload_frontend/src/main.rs b/crates/rustc_offload_frontend/src/main.rs index f365b96..7ffd221 100644 --- a/crates/rustc_offload_frontend/src/main.rs +++ b/crates/rustc_offload_frontend/src/main.rs @@ -4,7 +4,7 @@ #![allow(improper_gpu_kernel_arg)] #![allow(improper_ctypes_definitions)] #![feature(gpu_offload)] -#![cfg_attr(target_os = "linux", feature(core_intrinsics))] +#![cfg_attr(target_os = "linux", feature(core_intrinsics, offload))] #![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] #![cfg_attr(target_arch = "nvptx64", no_std)] #![cfg_attr(target_arch = "nvptx64", no_main)] @@ -15,18 +15,21 @@ extern crate libc; use rustc_offload_frontend::offload_kernel; use rustc_offload_frontend::partition::{Linear1D, Linear2D, Region, Stride2D}; +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use rustc_offload_frontend::partition::PartitioningStrategy; #[offload_kernel] -fn 
linear1d(x: &mut Region) { +fn linear1d(mut x: Region) { if let Some(e) = x.get_mut() { *e = 42.0; } } #[offload_kernel] -fn stride2d(grid: &mut Region>) { +fn stride2d(mut grid: Region>) { if let Some(mut view) = grid.get_mut() { view.set(0, 0, 42.0); view.set(1, 1, 42.0); @@ -34,7 +37,7 @@ fn stride2d(grid: &mut Region>) { } #[offload_kernel] -fn conv_blur2d(input: &[f64], output: &mut Region>) { +fn conv_blur2d(input: &[f64], mut output: Region>) { if let Some(out_cell) = output.get_mut() { let mut sum = 0.0; @@ -52,7 +55,7 @@ fn conv_blur2d(input: &[f64], output: &mut Region>) { } #[offload_kernel] -fn saxpy_kernel(alpha: f32, x: &[f32], y: &mut Region) { +fn saxpy_kernel(alpha: f32, x: &[f32], mut y: Region) { if let (Some(val_x), Some(val_y)) = (x.get(Linear1D::index()), y.get_mut()) { *val_y = alpha * (*val_x) + (*val_y); } @@ -64,13 +67,14 @@ fn main() { // linear1d let mut x = [0.0f64; 256]; - let mut reg = Region::<_, Linear1D>::new(&mut x); - // core::intrinsics::offload::<_, _, ()>(linear1d, [1, 1, 1], [256, 1, 1], (&mut reg,)); + let p: PreloadMut<[f64; 256]> = preload_mut(&mut x); + let mut reg = Region::<'_, _, Linear1D>::from(&p); offload! { kernel = linear1d, grid_dim = [256, 1, 1], - args = (&mut reg,), + args = (reg,), }; + drop(p); for i in 0..x.len() { assert_eq!(x[i], 42.0 as f64); } @@ -78,13 +82,14 @@ fn main() { // stride2d let mut blocks = [0.0; 64]; - let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4, 8>>::new(&mut blocks); - // core::intrinsics::offload::<_, _, ()>(stride2d, [1, 1, 1], [2, 2, 1], (&mut reg_stride,)); + let p: PreloadMut<[f64; 64]> = preload_mut(&mut blocks); + let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4, 8>>::from(&p); offload! 
{ kernel = stride2d, block_dim = [2, 2, 1], - args = (&mut reg_stride,), + args = (reg_stride,), }; + drop(p); // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements assert_eq!(blocks[0], 42.0); assert_eq!(blocks[9], 42.0); @@ -98,13 +103,14 @@ fn main() { 0.0, 0.0, 0.0, 0.0, // ]; let mut output = [0.0f64; 16]; - - let mut reg_output = Region::<_, Linear2D<4>>::new(&mut output); + let p: PreloadMut<[f64; 16]> = preload_mut(&mut output); + let mut reg_output = Region::<_, Linear2D<4>>::from(&p); offload! { kernel = conv_blur2d, block_dim = [4, 4, 1], - args = (&input as &[f64], &mut reg_output,), + args = (&input as &[f64], reg_output,), }; + drop(p); let expected = [ 1.0, 2.0, 2.0, 1.0, // @@ -120,14 +126,15 @@ fn main() { let alpha: f32 = 2.5; let x: [f32; N] = [2.0; N]; let mut y: [f32; N] = [1.0; N]; - - let mut reg_y = Region::<_, Linear1D>::new(&mut y); + let p: PreloadMut<[f32; N]> = preload_mut(&mut y); + let mut reg_y = Region::<_, Linear1D>::from(&p); offload! 
{ kernel = saxpy_kernel, grid_dim = [N as u32, 1, 1], - args = (alpha, &x as &[f32], &mut reg_y,), + args = (alpha, &x as &[f32], reg_y,), }; + drop(p); for i in 0..N { assert_eq!(y[i], 6.0f32); diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs index 303c84b..a0ca4ac 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -1,6 +1,7 @@ use crate::gpu::global_thread_dim; use core::convert::From; use core::prelude::v1::*; +use core::offload::offload::PreloadMut; pub unsafe trait PartitioningStrategy { type View<'a, T: 'a>; @@ -17,6 +18,19 @@ pub struct Region<'a, T, S: PartitioningStrategy> { _marker: core::marker::PhantomData<(&'a mut [T], S)>, } +impl<'a, T, const N: usize, S> From<&PreloadMut<'a, [T; N]>> for Region<'a, T, S> +where + S: PartitioningStrategy, +{ + fn from(p: &PreloadMut<'a, [T; N]>) -> Self { + Self { + ptr: p.cpu_ptr as *mut T, + len: N, + _marker: core::marker::PhantomData, + } + } +} + pub struct RawRegion<'a, T> { pub ptr: *mut T, pub len: usize, From 89d985e1e730120feb2daac15c16752c7e21945e Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 00:36:52 +0200 Subject: [PATCH 13/20] update intrinsic calls for dyn_cache arg --- src/apps/del_dot_vec_2d.rs | 1 + src/apps/energy.rs | 6 ++++++ src/apps/fir.rs | 1 + src/apps/ltimes.rs | 1 + src/apps/matvec_3d_stencil.rs | 1 + src/apps/pressure.rs | 2 ++ src/apps/vol3d.rs | 1 + 7 files changed, 13 insertions(+) diff --git a/src/apps/del_dot_vec_2d.rs b/src/apps/del_dot_vec_2d.rs index 6add513..8aaf7ef 100644 --- a/src/apps/del_dot_vec_2d.rs +++ b/src/apps/del_dot_vec_2d.rs @@ -152,6 +152,7 @@ impl KernelBase for DelDotVec2D { _del_dot_vec_2d, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, ( self.div as *mut [Real; NNALLS], &*x1, diff --git a/src/apps/energy.rs b/src/apps/energy.rs index 8fa9d51..a30fc38 100644 --- a/src/apps/energy.rs +++ b/src/apps/energy.rs @@ -257,6 +257,7 @@ 
unsafe fn energycalc1( _energycalc1, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, (e_new, e_old, delvc, p_old, q_old, work, iend), ) } @@ -278,6 +279,7 @@ unsafe fn energycalc2( _energycalc2, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, ( delvc, q_new, @@ -307,6 +309,7 @@ unsafe fn energycalc3( _energycalc3, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, (e_new, delvc, p_old, q_old, p_half_step, q_new, iend), ) } @@ -322,6 +325,7 @@ unsafe fn energycalc4( _energycalc4, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, (e_new, work, e_cut, emin, iend), ) } @@ -348,6 +352,7 @@ unsafe fn energycalc5( _energycalc5, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, ( delvc, pbvc, @@ -387,6 +392,7 @@ unsafe fn energycalc6( _energycalc6, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, ( delvc, pbvc, e_new, vnewc, bvc, p_new, q_new, ql_old, qq_old, rho0, q_cut, iend, ), diff --git a/src/apps/fir.rs b/src/apps/fir.rs index 1b267b6..701678c 100644 --- a/src/apps/fir.rs +++ b/src/apps/fir.rs @@ -93,6 +93,7 @@ impl KernelBase for Fir { _fir, [BLOCKS, 1, 1], [THREADS_PER_BLOCK, 1, 1], + 0, ( self.m_out as *mut [Real; IEND], &*(self.m_in as *const [Real; IEND + COEFFLEN]), diff --git a/src/apps/ltimes.rs b/src/apps/ltimes.rs index deedc70..cb98988 100644 --- a/src/apps/ltimes.rs +++ b/src/apps/ltimes.rs @@ -105,6 +105,7 @@ impl KernelBase for LTimes { _ltimes, [grid_x as u32, grid_y as u32, grid_z as u32], [m_block as u32, g_block as u32, z_block as u32], + 0, ( self.phidat as *mut [Real; 390400], self.elldat as *const [Real; 1600], diff --git a/src/apps/matvec_3d_stencil.rs b/src/apps/matvec_3d_stencil.rs index 35b826c..1707834 100644 --- a/src/apps/matvec_3d_stencil.rs +++ b/src/apps/matvec_3d_stencil.rs @@ -130,6 +130,7 @@ impl KernelBase for Matvec3DStencil { _matvec3dstencil, [n.div_ceil(256) as u32, 1, 1], [256, 1, 1], + 0, ( self.x as *const [Real; 1124864], self.b as *mut [Real; 1124864], diff --git a/src/apps/pressure.rs b/src/apps/pressure.rs index 582f039..f30b486 
100644 --- a/src/apps/pressure.rs +++ b/src/apps/pressure.rs @@ -95,6 +95,7 @@ impl KernelBase for Pressure { _pressure_calc1, grid, block, + 0, ( self.bvc as *mut [Real; N_DEFAULT], self.compression as *const [Real; N_DEFAULT], @@ -107,6 +108,7 @@ impl KernelBase for Pressure { _pressure_calc2, grid, block, + 0, ( self.p_new as *mut [Real; N_DEFAULT], self.bvc as *const [Real; N_DEFAULT], diff --git a/src/apps/vol3d.rs b/src/apps/vol3d.rs index 7cf5877..6bbb855 100644 --- a/src/apps/vol3d.rs +++ b/src/apps/vol3d.rs @@ -125,6 +125,7 @@ impl KernelBase for Vol3D { _vol3d, [((count + 255) / 256) as u32, 1, 1], [256, 1, 1], + 0, ( self.x as *const [Real; 1124864], self.y as *const [Real; 1124864], From 4ca96bbde526e76df491088bfecfd0861bd5af01 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 14:46:52 +0200 Subject: [PATCH 14/20] update perf suite to match rustc_offload_frontend style --- Cargo.toml | 7 ++- src/lib.rs | 133 ++++------------------------------------------------ src/main.rs | 118 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 125 deletions(-) create mode 100644 src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 6074354..ee31154 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,12 @@ version = "0.1.0" edition = "2024" [lib] -crate-type = ["cdylib"] +crate-type = ["rlib", "cdylib"] +path = "src/lib.rs" + +[[bin]] +name = "main" +path = "src/main.rs" [dependencies] libc = { version = "0.2.175", default-features = false } diff --git a/src/lib.rs b/src/lib.rs index 40f6055..ab1a8b8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,129 +1,14 @@ #![allow(internal_features)] -#![allow(non_snake_case)] -#![allow(clippy::deref_addrof)] -#![allow(clippy::too_many_arguments)] -#![allow(clippy::missing_safety_doc)] -#![allow(unused_features)] -#![feature(abi_gpu_kernel)] -#![feature(core_float_math)] -#![feature(core_intrinsics)] -#![feature(float_algebraic)] -#![feature(rustc_attrs)] -#![cfg_attr(target_arch = 
"amdgpu", feature(stdarch_amdgpu))] -#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))] -#![no_std] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload, offload)] +#![feature(float_algebraic, core_float_math)] +#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx, abi_gpu_kernel))] +#![cfg_attr(target_arch = "nvptx64", no_std)] -#[cfg(target_os = "linux")] -extern crate libc; - -#[panic_handler] -fn panic(_: &core::panic::PanicInfo) -> ! { - loop {} -} +#![feature(rustc_attrs, core_intrinsics)] pub mod apps; pub mod common; - -#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] -use apps::del_dot_vec_2d::DelDotVec2D; -#[cfg(all(target_os = "linux", feature = "energy"))] -use apps::energy::Energy; -#[cfg(all(target_os = "linux", feature = "fir"))] -use apps::fir::Fir; - -#[cfg(all(target_os = "linux", feature = "ltimes"))] -use apps::ltimes::LTimes; - -#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] -use apps::matvec_3d_stencil::Matvec3DStencil; - -#[cfg(all(target_os = "linux", feature = "pressure"))] -use apps::pressure::Pressure; - -#[cfg(all(target_os = "linux", feature = "vol3d"))] -use apps::vol3d::Vol3D; - -#[cfg(all(target_os = "linux", feature = "energy"))] -static mut K_ENERGY: Energy = Energy::INIT; -#[cfg(all(target_os = "linux", feature = "fir"))] -static mut K_FIR: Fir = Fir::INIT; -#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] -static mut K_DEL: DelDotVec2D = DelDotVec2D::INIT; -#[cfg(all(target_os = "linux", feature = "ltimes"))] -static mut K_LTIMES: LTimes = LTimes::INIT; -#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] -static mut K_MATVEC3DSTENCIL: Matvec3DStencil = Matvec3DStencil::INIT; -#[cfg(all(target_os = "linux", feature = "pressure"))] -static mut K_PRESSURE: Pressure = Pressure::INIT; -#[cfg(all(target_os = "linux", feature = "vol3d"))] -static mut K_VOL3D: Vol3D 
= Vol3D::INIT; - -#[cfg(target_os = "linux")] -#[unsafe(no_mangle)] -fn main() { - use crate::common::executor::{Executor, KernelResult, MAX_KERNELS}; - use crate::common::kernel_base::KernelBase; - use core::mem::MaybeUninit; - - let mut k_links: [Option<&mut dyn KernelBase>; MAX_KERNELS] = [const { None }; MAX_KERNELS]; - let mut count = 0; - - #[cfg(feature = "energy")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_ENERGY) }); - count += 1; - } - #[cfg(feature = "fir")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_FIR) }); - count += 1; - } - #[cfg(feature = "del_dot_vec_2d")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_DEL) }); - count += 1; - } - #[cfg(feature = "ltimes")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_LTIMES) }); - count += 1; - } - #[cfg(feature = "matvec_3d_stencil")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_MATVEC3DSTENCIL) }); - count += 1; - } - - #[cfg(feature = "pressure")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_PRESSURE) }); - count += 1; - } - - #[cfg(feature = "vol3d")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_VOL3D) }); - count += 1; - } - - let mut kernel_refs: [MaybeUninit<&mut dyn KernelBase>; MAX_KERNELS] = - [const { MaybeUninit::uninit() }; MAX_KERNELS]; - - for i in 0..count { - kernel_refs[i] = MaybeUninit::new(k_links[i].take().unwrap()); - } - - let kernels_slice = unsafe { - core::slice::from_raw_parts_mut(kernel_refs.as_mut_ptr() as *mut &mut dyn KernelBase, count) - }; - - let mut suite = Executor::new(kernels_slice); - - static mut RESULT_BUF: [MaybeUninit; MAX_KERNELS] = - [const { MaybeUninit::uninit() }; MAX_KERNELS]; - - let results = suite.run_suite(unsafe { &mut *(&raw mut RESULT_BUF) }); - - Executor::print_report(results); - Executor::export_csv(results, c"results.csv".as_ptr()); -} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..adea659 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,118 @@ 
+#![allow(internal_features)] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload)] +#![cfg_attr(target_os = "linux", feature(core_intrinsics, offload))] +#![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] +#![cfg_attr(target_arch = "nvptx64", no_std)] +#![cfg_attr(target_arch = "nvptx64", no_main)] + +use rust_perf; + +#[cfg(target_os = "linux")] +extern crate libc; + +#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] +use rust_perf::apps::del_dot_vec_2d::DelDotVec2D; +#[cfg(all(target_os = "linux", feature = "energy"))] +use rust_perf::apps::energy::Energy; +#[cfg(all(target_os = "linux", feature = "fir"))] +use rust_perf::apps::fir::Fir; + +#[cfg(all(target_os = "linux", feature = "ltimes"))] +use rust_perf::apps::ltimes::LTimes; + +#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] +use rust_perf::apps::matvec_3d_stencil::Matvec3DStencil; + +#[cfg(all(target_os = "linux", feature = "pressure"))] +use rust_perf::apps::pressure::Pressure; + +#[cfg(all(target_os = "linux", feature = "vol3d"))] +use rust_perf::apps::vol3d::Vol3D; + +#[cfg(all(target_os = "linux", feature = "energy"))] +static mut K_ENERGY: Energy = Energy::INIT; +#[cfg(all(target_os = "linux", feature = "fir"))] +static mut K_FIR: Fir = Fir::INIT; +#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] +static mut K_DEL: DelDotVec2D = DelDotVec2D::INIT; +#[cfg(all(target_os = "linux", feature = "ltimes"))] +static mut K_LTIMES: LTimes = LTimes::INIT; +#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] +static mut K_MATVEC3DSTENCIL: Matvec3DStencil = Matvec3DStencil::INIT; +#[cfg(all(target_os = "linux", feature = "pressure"))] +static mut K_PRESSURE: Pressure = Pressure::INIT; +#[cfg(all(target_os = "linux", feature = "vol3d"))] +static mut K_VOL3D: Vol3D = Vol3D::INIT; + +#[cfg(target_os = "linux")] +fn main() { + use 
rust_perf::common::executor::{Executor, KernelResult, MAX_KERNELS}; + use rust_perf::common::kernel_base::KernelBase; + use core::mem::MaybeUninit; + + let mut k_links: [Option<&mut dyn KernelBase>; MAX_KERNELS] = [const { None }; MAX_KERNELS]; + let mut count = 0; + + #[cfg(feature = "energy")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_ENERGY) }); + count += 1; + } + #[cfg(feature = "fir")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_FIR) }); + count += 1; + } + #[cfg(feature = "del_dot_vec_2d")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_DEL) }); + count += 1; + } + #[cfg(feature = "ltimes")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_LTIMES) }); + count += 1; + } + #[cfg(feature = "matvec_3d_stencil")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_MATVEC3DSTENCIL) }); + count += 1; + } + + #[cfg(feature = "pressure")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_PRESSURE) }); + count += 1; + } + + #[cfg(feature = "vol3d")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_VOL3D) }); + count += 1; + } + + let mut kernel_refs: [MaybeUninit<&mut dyn KernelBase>; MAX_KERNELS] = + [const { MaybeUninit::uninit() }; MAX_KERNELS]; + + for i in 0..count { + kernel_refs[i] = MaybeUninit::new(k_links[i].take().unwrap()); + } + + let kernels_slice = unsafe { + core::slice::from_raw_parts_mut(kernel_refs.as_mut_ptr() as *mut &mut dyn KernelBase, count) + }; + + let mut suite = Executor::new(kernels_slice); + + static mut RESULT_BUF: [MaybeUninit; MAX_KERNELS] = + [const { MaybeUninit::uninit() }; MAX_KERNELS]; + + let results = suite.run_suite(unsafe { &mut *(&raw mut RESULT_BUF) }); + + Executor::print_report(results); + Executor::export_csv(results, c"results.csv".as_ptr()); +} From aa381b42b80de5d4ac66209e34e7c055f093fbca Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 14:47:55 +0200 Subject: [PATCH 15/20] move fir to new frontend --- src/apps/fir.rs | 66 
++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/src/apps/fir.rs b/src/apps/fir.rs index 701678c..ee42229 100644 --- a/src/apps/fir.rs +++ b/src/apps/fir.rs @@ -6,6 +6,17 @@ pub const COEFFLEN: usize = 16; const THREADS_PER_BLOCK: u32 = 256; const BLOCKS: u32 = (IEND as u32).div_ceil(THREADS_PER_BLOCK); + +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{Region, Linear1D, PartitioningStrategy}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{ _block_dim_x as block_dim_x, _block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x, @@ -88,20 +99,21 @@ impl KernelBase for Fir { } fn run_kernel(&mut self) { - unsafe { - core::intrinsics::offload::<_, _, ()>( - _fir, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - ( - self.m_out as *mut [Real; IEND], - &*(self.m_in as *const [Real; IEND + COEFFLEN]), - &self.coeff as &[Real; COEFFLEN], - IEND, - ), - ); - } + let mut m_out = unsafe { &mut *(self.m_out as *mut [Real; IEND]) }; + let p: PreloadMut<[Real; IEND]> = preload_mut(&mut m_out); + let mut m_out_reg = Region::<'_, _, Linear1D>::from(&p); + offload! 
{ + kernel = fir, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + m_out_reg, + unsafe { &*(self.m_in as *const [Real; IEND + COEFFLEN]) }, + unsafe { &self.coeff as &[Real; COEFFLEN] }, + IEND, + ), + }; + drop(p); } fn update_checksum(&self) -> f64 { @@ -118,30 +130,18 @@ impl KernelBase for Fir { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _fir( - m_out: *mut [Real; IEND], - m_in: &[Real; IEND + COEFFLEN], - coeff: &[Real; COEFFLEN], - iend: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _fir( - m_out: *mut [Real; IEND], +#[offload_kernel] +fn fir( + mut m_out: Region, m_in: &[Real; IEND + COEFFLEN], coeff: &[Real; COEFFLEN], iend: usize, ) { - let i = unsafe { (block_idx_x() * block_dim_x() + thread_idx_x()) as usize }; - if i < iend { + let i = Linear1D::index(); + if let Some(v) = m_out.get_mut() { let mut sum: Real = Real::from(0.0); let mut j = 0; while j < COEFFLEN { @@ -150,8 +150,6 @@ pub unsafe extern "gpu-kernel" fn _fir( } j += 1; } - unsafe { - (*m_out)[i] = sum; - } + *v = sum; } } From e3e6c9dffc1a82e7b210cf1874ed386ad66536f9 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 15:51:58 +0200 Subject: [PATCH 16/20] move energy to new frontend + format --- .../rustc_offload_frontend/src/partition.rs | 5 + src/apps/energy.rs | 527 ++++++------------ src/apps/fir.rs | 4 +- src/lib.rs | 1 - src/main.rs | 2 +- 5 files changed, 165 insertions(+), 374 deletions(-) diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs index a0ca4ac..f89dc69 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -12,6 +12,7 @@ pub unsafe trait PartitioningStrategy { unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option>; } +#[derive(Debug, Copy, 
Clone)] pub struct Region<'a, T, S: PartitioningStrategy> { ptr: *mut T, len: usize, @@ -80,6 +81,7 @@ impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { } // linear1d +#[derive(Debug, Copy, Clone)] pub struct Linear1D; unsafe impl PartitioningStrategy for Linear1D { type View<'a, T: 'a> = &'a T; @@ -107,6 +109,7 @@ unsafe impl PartitioningStrategy for Linear1D { } // linear2d +#[derive(Debug, Copy, Clone)] pub struct Linear2D; unsafe impl PartitioningStrategy for Linear2D { type View<'a, T: 'a> = &'a T; @@ -135,6 +138,7 @@ unsafe impl PartitioningStrategy for Linear2D { } // stride +#[derive(Debug, Copy, Clone)] pub struct StrideViewMut<'a, T> { block_ptr: *mut T, stride: usize, @@ -148,6 +152,7 @@ impl<'a, T> StrideViewMut<'a, T> { } } +#[derive(Debug, Copy, Clone)] pub struct Stride2D< const W: usize, const H: usize, diff --git a/src/apps/energy.rs b/src/apps/energy.rs index a30fc38..d0419f9 100644 --- a/src/apps/energy.rs +++ b/src/apps/energy.rs @@ -8,6 +8,15 @@ const IEND: usize = DEFAULT_PROBLEM_SIZE; const THREADS_PER_BLOCK: u32 = 256; const BLOCKS: u32 = (IEND as u32).div_ceil(THREADS_PER_BLOCK); +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{Linear1D, PartitioningStrategy, Region}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{ _block_dim_x as block_dim_x, _block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x, @@ -126,49 +135,81 @@ impl KernelBase for Energy { } fn run_kernel(&mut self) { - unsafe { - energycalc1( - self.e_new as *mut [Real; IEND], + let mut e_new = unsafe { &mut *(self.e_new as *mut [Real; IEND]) }; + let mut q_new = unsafe { &mut *(self.q_new as *mut [Real; IEND]) }; + + let p1: PreloadMut<[Real; IEND]> = preload_mut(&mut e_new); + let p2: PreloadMut<[Real; IEND]> = preload_mut(&mut q_new); + + let mut e_new_reg = Region::<'_, _, 
Linear1D>::from(&p1); + let mut q_new_reg = Region::<'_, _, Linear1D>::from(&p2); + unsafe { + offload! { + kernel = energycalc1, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, &*(self.e_old as *const [Real; IEND]), &*(self.delvc as *const [Real; IEND]), &*(self.p_old as *const [Real; IEND]), &*(self.q_old as *const [Real; IEND]), &*(self.work as *const [Real; IEND]), IEND, - ); - energycalc2( + ), + }; + offload! { + kernel = energycalc2, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( &*(self.delvc as *const [Real; IEND]), - self.q_new as *mut [Real; IEND], + q_new_reg, &*(self.comp_half_step as *const [Real; IEND]), &*(self.p_half_step as *const [Real; IEND]), - self.e_new as *mut [Real; IEND], + e_new_reg, &*(self.bvc as *const [Real; IEND]), &*(self.pbvc as *const [Real; IEND]), &*(self.ql_old as *const [Real; IEND]), &*(self.qq_old as *const [Real; IEND]), self.rho0, IEND, - ); - energycalc3( - self.e_new as *mut [Real; IEND], + ), + }; + offload! { + kernel = energycalc3, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, &*(self.delvc as *const [Real; IEND]), &*(self.p_old as *const [Real; IEND]), &*(self.q_old as *const [Real; IEND]), &*(self.p_half_step as *const [Real; IEND]), &*(self.q_new as *const [Real; IEND]), IEND, - ); - energycalc4( - self.e_new as *mut [Real; IEND], + ), + }; + offload! { + kernel = energycalc4, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, &*(self.work as *const [Real; IEND]), self.e_cut, self.emin, IEND, - ); - energycalc5( + ), + }; + offload! 
{ + kernel = energycalc5, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( &*(self.delvc as *const [Real; IEND]), &*(self.pbvc as *const [Real; IEND]), - self.e_new as *mut [Real; IEND], + e_new_reg, &*(self.vnewc as *const [Real; IEND]), &*(self.bvc as *const [Real; IEND]), &*(self.p_new as *const [Real; IEND]), @@ -182,22 +223,28 @@ impl KernelBase for Energy { self.e_cut, self.emin, IEND, - ); - energycalc6( + ), + }; + offload! { + kernel = energycalc6, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( &*(self.delvc as *const [Real; IEND]), &*(self.pbvc as *const [Real; IEND]), - self.e_new as *mut [Real; IEND], + e_new_reg, &*(self.vnewc as *const [Real; IEND]), &*(self.bvc as *const [Real; IEND]), &*(self.p_new as *const [Real; IEND]), - self.q_new as *mut [Real; IEND], + q_new_reg, &*(self.ql_old as *const [Real; IEND]), &*(self.qq_old as *const [Real; IEND]), self.rho0, self.q_cut, IEND, - ); - } + ), + }; + } } fn update_checksum(&self) -> f64 { @@ -243,245 +290,12 @@ impl KernelBase for Energy { } } -#[cfg(target_os = "linux")] -unsafe fn energycalc1( - e_new: *mut [Real; IEND], - e_old: &[Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - work: &[Real; IEND], - iend: usize, -) { - core::intrinsics::offload( - _energycalc1, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - (e_new, e_old, delvc, p_old, q_old, work, iend), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc2( - delvc: &[Real; IEND], - q_new: *mut [Real; IEND], - comp_half_step: &[Real; IEND], - p_half_step: &[Real; IEND], - e_new: *mut [Real; IEND], - bvc: &[Real; IEND], - pbvc: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc2, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - ( - delvc, - q_new, - comp_half_step, - p_half_step, - e_new, - bvc, - pbvc, - ql_old, - qq_old, - rho0, - iend, 
- ), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc3( - e_new: *mut [Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - iend: usize, -) { - core::intrinsics::offload( - _energycalc3, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - (e_new, delvc, p_old, q_old, p_half_step, q_new, iend), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc4( - e_new: *mut [Real; IEND], - work: &[Real; IEND], - e_cut: Real, - emin: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc4, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - (e_new, work, e_cut, emin, iend), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc5( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - rho0: Real, - e_cut: Real, - emin: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc5, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - ( - delvc, - pbvc, - e_new, - vnewc, - bvc, - p_new, - ql_old, - qq_old, - p_old, - q_old, - p_half_step, - q_new, - rho0, - e_cut, - emin, - iend, - ), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc6( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - q_new: *mut [Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - q_cut: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc6, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - ( - delvc, pbvc, e_new, vnewc, bvc, p_new, q_new, ql_old, qq_old, rho0, q_cut, iend, - ), - ) -} - -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _energycalc1( - e_new: *mut [Real; IEND], - e_old: 
&[Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - work: &[Real; IEND], - iend: usize, - ); - pub fn _energycalc2( - delvc: &[Real; IEND], - q_new: *mut [Real; IEND], - comp_half_step: &[Real; IEND], - p_half_step: &[Real; IEND], - e_new: *mut [Real; IEND], - bvc: &[Real; IEND], - pbvc: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - iend: usize, - ); - pub fn _energycalc3( - e_new: *mut [Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - iend: usize, - ); - pub fn _energycalc4( - e_new: *mut [Real; IEND], - work: &[Real; IEND], - e_cut: Real, - emin: Real, - iend: usize, - ); - pub fn _energycalc5( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - rho0: Real, - e_cut: Real, - emin: Real, - iend: usize, - ); - pub fn _energycalc6( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - q_new: *mut [Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - q_cut: Real, - iend: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::{Real, RealExt}; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc1( - e_new: *mut [Real; IEND], +#[offload_kernel] +fn energycalc1( + mut e_new: Region, e_old: &[Real; IEND], delvc: &[Real; IEND], p_old: &[Real; IEND], @@ -489,25 +303,20 @@ pub extern "gpu-kernel" fn _energycalc1( work: &[Real; IEND], iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - 
(*e_new)[i] = (*e_old)[i] - Real::from(0.5) * (*delvc)[i] * ((*p_old)[i] + (*q_old)[i]) - + Real::from(0.5) * (*work)[i]; - } + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + *v = (*e_old)[i] - Real::from(0.5) * (*delvc)[i] * ((*p_old)[i] + (*q_old)[i]) + + Real::from(0.5) * (*work)[i]; } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc2( +#[offload_kernel] +fn energycalc2( delvc: &[Real; IEND], - q_new: *mut [Real; IEND], + mut q_new: Region, comp_half_step: &[Real; IEND], p_half_step: &[Real; IEND], - e_new: *mut [Real; IEND], + mut e_new: Region, bvc: &[Real; IEND], pbvc: &[Real; IEND], ql_old: &[Real; IEND], @@ -515,33 +324,29 @@ pub extern "gpu-kernel" fn _energycalc2( rho0: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - if ((*delvc)[i]).to_f64() > 0.0 { - (*q_new)[i] = Real::from(0.0); + let i = Linear1D::index(); + if let Some(v1) = q_new.get_mut() + && let Some(v2) = e_new.get_mut() + { + if ((*delvc)[i]).to_f64() > 0.0 { + *v1 = Real::from(0.0); + } else { + let vhalf = Real::from(1.0) / (Real::from(1.0) + (*comp_half_step)[i]); + let mut ssc = + ((*pbvc)[i] * (*v2) + vhalf * vhalf * (*bvc)[i] * (*p_half_step)[i]) / rho0; + if ssc.to_f64() <= 0.1111111e-36 { + ssc = Real::from(0.3333333e-18); } else { - let vhalf = Real::from(1.0) / (Real::from(1.0) + (*comp_half_step)[i]); - let mut ssc = ((*pbvc)[i] * (*e_new)[i] - + vhalf * vhalf * (*bvc)[i] * (*p_half_step)[i]) - / rho0; - if ssc.to_f64() <= 0.1111111e-36 { - ssc = Real::from(0.3333333e-18); - } else { - ssc = ssc.sqrt(); - } - (*q_new)[i] = ssc * (*ql_old)[i] + (*qq_old)[i]; + ssc = ssc.sqrt(); } + *v1 = ssc * (*ql_old)[i] + (*qq_old)[i]; } } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc3( - e_new: *mut [Real; IEND], 
+#[offload_kernel] +fn energycalc3( + mut e_new: Region, delvc: &[Real; IEND], p_old: &[Real; IEND], q_old: &[Real; IEND], @@ -549,50 +354,40 @@ pub extern "gpu-kernel" fn _energycalc3( q_new: &[Real; IEND], iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - (*e_new)[i] += Real::from(0.5) - * (*delvc)[i] - * (Real::from(3.0) * ((*p_old)[i] + (*q_old)[i]) - - Real::from(4.0) * ((*p_half_step)[i] + (*q_new)[i])); - } + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + *v += Real::from(0.5) + * (*delvc)[i] + * (Real::from(3.0) * ((*p_old)[i] + (*q_old)[i]) + - Real::from(4.0) * ((*p_half_step)[i] + (*q_new)[i])); } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc4( - e_new: *mut [Real; IEND], +#[offload_kernel] +fn energycalc4( + mut e_new: Region, work: &[Real; IEND], e_cut: Real, emin: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - (*e_new)[i] += Real::from(0.5) * (*work)[i]; - if ((*e_new)[i]).abs() < e_cut { - (*e_new)[i] = Real::from(0.0); - } - if (*e_new)[i] < emin { - (*e_new)[i] = emin; - } + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + *v += Real::from(0.5) * (*work)[i]; + if (*v).abs() < e_cut { + *v = Real::from(0.0); + } + if *v < emin { + *v = emin; } } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc5( +#[offload_kernel] +fn energycalc5( delvc: &[Real; IEND], pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], + mut e_new: Region, vnewc: &[Real; IEND], bvc: &[Real; IEND], p_new: &[Real; IEND], @@ -607,70 +402,64 @@ pub extern "gpu-kernel" fn _energycalc5( emin: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - let q_tilde = if 
((*delvc)[i]).to_f64() > 0.0 { - Real::from(0.0) + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + let q_tilde = if ((*delvc)[i]).to_f64() > 0.0 { + Real::from(0.0) + } else { + let mut ssc = + ((*pbvc)[i] * (*v) + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) / rho0; + if ssc.to_f64() <= 0.1111111e-36 { + ssc = Real::from(0.3333333e-18); } else { - let mut ssc = ((*pbvc)[i] * (*e_new)[i] - + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) - / rho0; - if ssc.to_f64() <= 0.1111111e-36 { - ssc = Real::from(0.3333333e-18); - } else { - ssc = ssc.sqrt(); - } - ssc * (*ql_old)[i] + (*qq_old)[i] - }; - (*e_new)[i] -= (Real::from(7.0) * ((*p_old)[i] + (*q_old)[i]) - - Real::from(8.0) * ((*p_half_step)[i] + (*q_new)[i]) - + ((*p_new)[i] + q_tilde)) - * (*delvc)[i] - / Real::from(6.0); - if ((*e_new)[i]).abs() < e_cut { - (*e_new)[i] = Real::from(0.0); - } - if (*e_new)[i] < emin { - (*e_new)[i] = emin; + ssc = ssc.sqrt(); } + ssc * (*ql_old)[i] + (*qq_old)[i] + }; + *v -= (Real::from(7.0) * ((*p_old)[i] + (*q_old)[i]) + - Real::from(8.0) * ((*p_half_step)[i] + (*q_new)[i]) + + ((*p_new)[i] + q_tilde)) + * (*delvc)[i] + / Real::from(6.0); + if (*v).abs() < e_cut { + *v = Real::from(0.0); + } + if *v < emin { + *v = emin; } } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc6( +#[offload_kernel] +fn energycalc6( delvc: &[Real; IEND], pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], + mut e_new: Region, vnewc: &[Real; IEND], bvc: &[Real; IEND], p_new: &[Real; IEND], - q_new: *mut [Real; IEND], + mut q_new: Region, ql_old: &[Real; IEND], qq_old: &[Real; IEND], rho0: Real, q_cut: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend && ((*delvc)[i]).to_f64() <= 0.0 { - let mut ssc = ((*pbvc)[i] * (*e_new)[i] - + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) - / rho0; - if ssc.to_f64() <= 
0.1111111e-36 { - ssc = Real::from(0.3333333e-18); - } else { - ssc = ssc.sqrt(); - } - (*q_new)[i] = ssc * (*ql_old)[i] + (*qq_old)[i]; - if ((*q_new)[i]).abs() < q_cut { - (*q_new)[i] = Real::from(0.0); - } + let i = Linear1D::index(); + if let Some(v1) = e_new.get_mut() + && let Some(v2) = q_new.get_mut() + && ((*delvc)[i]).to_f64() <= 0.0 + { + let mut ssc = + ((*pbvc)[i] * (*v1) + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) / rho0; + if ssc.to_f64() <= 0.1111111e-36 { + ssc = Real::from(0.3333333e-18); + } else { + ssc = ssc.sqrt(); + } + *v2 = ssc * (*ql_old)[i] + (*qq_old)[i]; + if (*v2).abs() < q_cut { + *v2 = Real::from(0.0); } } } diff --git a/src/apps/fir.rs b/src/apps/fir.rs index ee42229..ad40553 100644 --- a/src/apps/fir.rs +++ b/src/apps/fir.rs @@ -6,9 +6,8 @@ pub const COEFFLEN: usize = 16; const THREADS_PER_BLOCK: u32 = 256; const BLOCKS: u32 = (IEND as u32).div_ceil(THREADS_PER_BLOCK); - use core::offload::offload_kernel; -use rustc_offload_frontend::partition::{Region, Linear1D, PartitioningStrategy}; +use rustc_offload_frontend::partition::{Linear1D, PartitioningStrategy, Region}; #[cfg(target_os = "linux")] use rustc_offload_frontend::offload; @@ -16,7 +15,6 @@ use rustc_offload_frontend::offload; #[cfg(target_os = "linux")] use core::offload::offload::{PreloadMut, preload_mut}; - #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{ _block_dim_x as block_dim_x, _block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x, diff --git a/src/lib.rs b/src/lib.rs index ab1a8b8..b4dcbc7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,6 @@ #![feature(float_algebraic, core_float_math)] #![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx, abi_gpu_kernel))] #![cfg_attr(target_arch = "nvptx64", no_std)] - #![feature(rustc_attrs, core_intrinsics)] pub mod apps; diff --git a/src/main.rs b/src/main.rs index adea659..aa51d09 100644 --- a/src/main.rs +++ b/src/main.rs @@ -50,9 +50,9 @@ static mut K_VOL3D: Vol3D = Vol3D::INIT; 
#[cfg(target_os = "linux")] fn main() { + use core::mem::MaybeUninit; use rust_perf::common::executor::{Executor, KernelResult, MAX_KERNELS}; use rust_perf::common::kernel_base::KernelBase; - use core::mem::MaybeUninit; let mut k_links: [Option<&mut dyn KernelBase>; MAX_KERNELS] = [const { None }; MAX_KERNELS]; let mut count = 0; From d319bac7ad58ee9ca8e98d653d6927fcb9eca3f9 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 16:35:16 +0200 Subject: [PATCH 17/20] move pressure to new frontend --- crates/rustc_offload_frontend/src/gpu.rs | 42 ++++ .../rustc_offload_frontend/src/partition.rs | 39 +++- src/apps/energy.rs | 202 +++++++++--------- src/apps/pressure.rs | 101 ++++----- 4 files changed, 225 insertions(+), 159 deletions(-) diff --git a/crates/rustc_offload_frontend/src/gpu.rs b/crates/rustc_offload_frontend/src/gpu.rs index 5bc5223..4231788 100644 --- a/crates/rustc_offload_frontend/src/gpu.rs +++ b/crates/rustc_offload_frontend/src/gpu.rs @@ -18,3 +18,45 @@ pub(crate) fn global_thread_dim() -> Dim3 { #[cfg(target_os = "linux")] Dim3 { x: 0, y: 0, z: 0 } } + +pub (crate) fn block_idx() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: _block_idx_x() as usize, + y: _block_idx_y() as usize, + z: _block_idx_z() as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} + +pub (crate) fn block_dim() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: _block_dim_x() as usize, + y: _block_dim_y() as usize, + z: _block_dim_z() as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} + +pub (crate) fn thread_idx() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: _thread_idx_x() as usize, + y: _thread_idx_y() as usize, + z: _thread_idx_z() as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} diff --git a/crates/rustc_offload_frontend/src/partition.rs 
b/crates/rustc_offload_frontend/src/partition.rs index f89dc69..7b173b2 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -1,4 +1,4 @@ -use crate::gpu::global_thread_dim; +use crate::gpu::{ global_thread_dim, block_idx, block_dim, thread_idx }; use core::convert::From; use core::prelude::v1::*; use core::offload::offload::PreloadMut; @@ -137,7 +137,42 @@ unsafe impl PartitioningStrategy for Linear2D { } } -// stride +// stride1d +#[derive(Debug, Copy, Clone)] +pub struct Stride1D< + const STRIDE: usize, +>; +unsafe impl + PartitioningStrategy for Stride1D +{ + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + fn index() -> usize { + let bidx = block_idx().x; + let tidx = thread_idx().x; + bidx * STRIDE + tidx + } + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &*ptr.add(idx) }) + } else { + None + } + } + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &mut *ptr.add(idx) }) + } else { + None + } + } +} + + +// stride2d #[derive(Debug, Copy, Clone)] pub struct StrideViewMut<'a, T> { block_ptr: *mut T, diff --git a/src/apps/energy.rs b/src/apps/energy.rs index d0419f9..071b869 100644 --- a/src/apps/energy.rs +++ b/src/apps/energy.rs @@ -143,108 +143,108 @@ impl KernelBase for Energy { let mut e_new_reg = Region::<'_, _, Linear1D>::from(&p1); let mut q_new_reg = Region::<'_, _, Linear1D>::from(&p2); - unsafe { - offload! { - kernel = energycalc1, - grid_dim = [BLOCKS, 1, 1], - block_dim = [THREADS_PER_BLOCK, 1, 1], - args = ( - e_new_reg, - &*(self.e_old as *const [Real; IEND]), - &*(self.delvc as *const [Real; IEND]), - &*(self.p_old as *const [Real; IEND]), - &*(self.q_old as *const [Real; IEND]), - &*(self.work as *const [Real; IEND]), - IEND, - ), - }; - offload! 
{ - kernel = energycalc2, - grid_dim = [BLOCKS, 1, 1], - block_dim = [THREADS_PER_BLOCK, 1, 1], - args = ( - &*(self.delvc as *const [Real; IEND]), - q_new_reg, - &*(self.comp_half_step as *const [Real; IEND]), - &*(self.p_half_step as *const [Real; IEND]), - e_new_reg, - &*(self.bvc as *const [Real; IEND]), - &*(self.pbvc as *const [Real; IEND]), - &*(self.ql_old as *const [Real; IEND]), - &*(self.qq_old as *const [Real; IEND]), - self.rho0, - IEND, - ), - }; - offload! { - kernel = energycalc3, - grid_dim = [BLOCKS, 1, 1], - block_dim = [THREADS_PER_BLOCK, 1, 1], - args = ( - e_new_reg, - &*(self.delvc as *const [Real; IEND]), - &*(self.p_old as *const [Real; IEND]), - &*(self.q_old as *const [Real; IEND]), - &*(self.p_half_step as *const [Real; IEND]), - &*(self.q_new as *const [Real; IEND]), - IEND, - ), - }; - offload! { - kernel = energycalc4, - grid_dim = [BLOCKS, 1, 1], - block_dim = [THREADS_PER_BLOCK, 1, 1], - args = ( - e_new_reg, - &*(self.work as *const [Real; IEND]), - self.e_cut, - self.emin, - IEND, - ), - }; - offload! { - kernel = energycalc5, - grid_dim = [BLOCKS, 1, 1], - block_dim = [THREADS_PER_BLOCK, 1, 1], - args = ( - &*(self.delvc as *const [Real; IEND]), - &*(self.pbvc as *const [Real; IEND]), - e_new_reg, - &*(self.vnewc as *const [Real; IEND]), - &*(self.bvc as *const [Real; IEND]), - &*(self.p_new as *const [Real; IEND]), - &*(self.ql_old as *const [Real; IEND]), - &*(self.qq_old as *const [Real; IEND]), - &*(self.p_old as *const [Real; IEND]), - &*(self.q_old as *const [Real; IEND]), - &*(self.p_half_step as *const [Real; IEND]), - &*(self.q_new as *const [Real; IEND]), - self.rho0, - self.e_cut, - self.emin, - IEND, - ), - }; - offload! { - kernel = energycalc6, - grid_dim = [BLOCKS, 1, 1], - block_dim = [THREADS_PER_BLOCK, 1, 1], - args = ( - &*(self.delvc as *const [Real; IEND]), - &*(self.pbvc as *const [Real; IEND]), + unsafe { + offload! 
{ + kernel = energycalc1, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, + &*(self.e_old as *const [Real; IEND]), + &*(self.delvc as *const [Real; IEND]), + &*(self.p_old as *const [Real; IEND]), + &*(self.q_old as *const [Real; IEND]), + &*(self.work as *const [Real; IEND]), + IEND, + ), + }; + offload! { + kernel = energycalc2, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + &*(self.delvc as *const [Real; IEND]), + q_new_reg, + &*(self.comp_half_step as *const [Real; IEND]), + &*(self.p_half_step as *const [Real; IEND]), + e_new_reg, + &*(self.bvc as *const [Real; IEND]), + &*(self.pbvc as *const [Real; IEND]), + &*(self.ql_old as *const [Real; IEND]), + &*(self.qq_old as *const [Real; IEND]), + self.rho0, + IEND, + ), + }; + offload! { + kernel = energycalc3, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( e_new_reg, - &*(self.vnewc as *const [Real; IEND]), - &*(self.bvc as *const [Real; IEND]), - &*(self.p_new as *const [Real; IEND]), - q_new_reg, - &*(self.ql_old as *const [Real; IEND]), - &*(self.qq_old as *const [Real; IEND]), - self.rho0, - self.q_cut, - IEND, - ), - }; - } + &*(self.delvc as *const [Real; IEND]), + &*(self.p_old as *const [Real; IEND]), + &*(self.q_old as *const [Real; IEND]), + &*(self.p_half_step as *const [Real; IEND]), + &*(self.q_new as *const [Real; IEND]), + IEND, + ), + }; + offload! { + kernel = energycalc4, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, + &*(self.work as *const [Real; IEND]), + self.e_cut, + self.emin, + IEND, + ), + }; + offload! 
{ + kernel = energycalc5, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + &*(self.delvc as *const [Real; IEND]), + &*(self.pbvc as *const [Real; IEND]), + e_new_reg, + &*(self.vnewc as *const [Real; IEND]), + &*(self.bvc as *const [Real; IEND]), + &*(self.p_new as *const [Real; IEND]), + &*(self.ql_old as *const [Real; IEND]), + &*(self.qq_old as *const [Real; IEND]), + &*(self.p_old as *const [Real; IEND]), + &*(self.q_old as *const [Real; IEND]), + &*(self.p_half_step as *const [Real; IEND]), + &*(self.q_new as *const [Real; IEND]), + self.rho0, + self.e_cut, + self.emin, + IEND, + ), + }; + offload! { + kernel = energycalc6, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + &*(self.delvc as *const [Real; IEND]), + &*(self.pbvc as *const [Real; IEND]), + e_new_reg, + &*(self.vnewc as *const [Real; IEND]), + &*(self.bvc as *const [Real; IEND]), + &*(self.p_new as *const [Real; IEND]), + q_new_reg, + &*(self.ql_old as *const [Real; IEND]), + &*(self.qq_old as *const [Real; IEND]), + self.rho0, + self.q_cut, + IEND, + ), + }; + } } fn update_checksum(&self) -> f64 { diff --git a/src/apps/pressure.rs b/src/apps/pressure.rs index f30b486..da4bbc4 100644 --- a/src/apps/pressure.rs +++ b/src/apps/pressure.rs @@ -1,6 +1,15 @@ pub const N_DEFAULT: usize = 1000000; const DEFAULT_REPS: u32 = 700; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{_block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x}; @@ -91,26 +100,33 @@ impl KernelBase for Pressure { let grid = [n.div_ceil(256) as u32, 1, 1]; let block = [256, 1, 1]; - core::intrinsics::offload::<_, _, ()>( - _pressure_calc1, - grid, - block, - 0, - ( - self.bvc as *mut 
[Real; N_DEFAULT], + let mut bvc = unsafe { &mut *(self.bvc as *mut [Real; N_DEFAULT]) }; + let mut p_new = unsafe { &mut *(self.p_new as *mut [Real; N_DEFAULT]) }; + + let p1: PreloadMut<[Real; N_DEFAULT]> = preload_mut(&mut bvc); + let p2: PreloadMut<[Real; N_DEFAULT]> = preload_mut(&mut p_new); + + let mut bvc_reg = Region::<'_, _, Stride1D<256>>::from(&p1); + let mut p_new_reg = Region::<'_, _, Stride1D<256>>::from(&p2); + + offload! { + kernel = pressure_calc1, + grid_dim = grid, + block_dim = block, + args = ( + bvc_reg, self.compression as *const [Real; N_DEFAULT], self.cls, n, ), - ); - - core::intrinsics::offload::<_, _, ()>( - _pressure_calc2, - grid, - block, - 0, - ( - self.p_new as *mut [Real; N_DEFAULT], + }; + + offload! { + kernel = pressure_calc2, + grid_dim = grid, + block_dim = block, + args = ( + p_new_reg, self.bvc as *const [Real; N_DEFAULT], self.e_old as *const [Real; N_DEFAULT], self.vnewc as *const [Real; N_DEFAULT], @@ -119,7 +135,7 @@ impl KernelBase for Pressure { self.pmin, n, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -143,52 +159,25 @@ impl KernelBase for Pressure { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _pressure_calc1( - bvc: *mut [Real; N_DEFAULT], - compression: *const [Real; N_DEFAULT], - cls: Real, - n: usize, - ); - - pub fn _pressure_calc2( - p_new: *mut [Real; N_DEFAULT], - bvc: *const [Real; N_DEFAULT], - e_old: *const [Real; N_DEFAULT], - vnewc: *const [Real; N_DEFAULT], - p_cut: Real, - eosvmax: Real, - pmin: Real, - n: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _pressure_calc1( - bvc: *mut [Real; N_DEFAULT], +#[offload_kernel] +fn pressure_calc1( + mut bvc: Region>, compression: *const [Real; N_DEFAULT], cls: Real, n: usize, ) { - let i = unsafe { (block_idx_x() * 256 + thread_idx_x()) as usize }; - if i < n { - unsafe { - (*bvc)[i] = 
cls * ((*compression)[i] + Real::from(1.0)); - } + let i = Stride1D::<256>::index(); + if let Some(v) = bvc.get_mut() { + *v = cls * ((*compression)[i] + Real::from(1.0)); } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _pressure_calc2( - p_new: *mut [Real; N_DEFAULT], +#[offload_kernel] +fn pressure_calc2( + mut p_new: Region>, bvc: *const [Real; N_DEFAULT], e_old: *const [Real; N_DEFAULT], vnewc: *const [Real; N_DEFAULT], @@ -197,8 +186,8 @@ pub unsafe extern "gpu-kernel" fn _pressure_calc2( pmin: Real, n: usize, ) { - let i = unsafe { (block_idx_x() * 256 + thread_idx_x()) as usize }; - if i < n { + let i = Stride1D::<256>::index(); + if let Some(v) = p_new.get_mut() { unsafe { let mut p = (*bvc)[i] * (*e_old)[i]; @@ -212,7 +201,7 @@ pub unsafe extern "gpu-kernel" fn _pressure_calc2( p = pmin; } - (*p_new)[i] = p; + *v = p; } } } From 38c4dd54106aacfb0be3ce785bc767e7ea03eac5 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 17:15:14 +0200 Subject: [PATCH 18/20] vol3d to frontend --- .../rustc_offload_frontend/src/partition.rs | 56 ++++++ src/apps/vol3d.rs | 173 +++++++++--------- 2 files changed, 141 insertions(+), 88 deletions(-) diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs index 7b173b2..84a2cfb 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -217,3 +217,59 @@ unsafe impl { + base_ptr: *mut T, + idx: usize, + len: usize, + _marker: core::marker::PhantomData<&'a mut T>, +} + +impl<'a, T> OffsetStrideViewMut<'a, T> { + pub fn set(&mut self, offset: usize, val: T) { + if let Some(final_idx) = self.idx.checked_add(offset) { + if final_idx < self.len { + unsafe { + *self.base_ptr.add(final_idx) = val; + } + } + } + } +} + +#[derive(Debug, Copy, Clone)] +pub struct OffsetStride1D; + +unsafe impl PartitioningStrategy for OffsetStride1D { + type View<'a, T: 
'a> = &'a T; + type ViewMut<'a, T: 'a> = OffsetStrideViewMut<'a, T>; + + fn index() -> usize { + let bidx = block_idx().x; + let tidx = thread_idx().x; + bidx * STRIDE + tidx + } + + unsafe fn get<'a, T>(_: *const T, _: usize) -> Option> { + unimplemented!("write only") + } + + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); + + if idx < len { + Some(OffsetStrideViewMut { + base_ptr: ptr, + idx, + len, + _marker: core::marker::PhantomData, + }) + } else { + None + } + } +} diff --git a/src/apps/vol3d.rs b/src/apps/vol3d.rs index 6bbb855..8df12da 100644 --- a/src/apps/vol3d.rs +++ b/src/apps/vol3d.rs @@ -1,6 +1,15 @@ pub const N_DEFAULT: usize = 1000000; const DEFAULT_REPS: u32 = 100; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{OffsetStride1D, PartitioningStrategy, Region}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{_block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x}; @@ -121,23 +130,26 @@ impl KernelBase for Vol3D { let kp = self.kp; let count = lpz + 1 - fpz; - core::intrinsics::offload::<_, _, ()>( - _vol3d, - [((count + 255) / 256) as u32, 1, 1], - [256, 1, 1], - 0, - ( + let mut vol = unsafe { &mut *(self.vol as *mut [Real; 1124864]) }; + let p: PreloadMut<[Real; 1124864]> = preload_mut(&mut vol); + let mut vol_reg = Region::<'_, _, OffsetStride1D<256>>::from(&p); + + offload! 
{ + kernel = vol3d, + grid_dim = [((count + 255) / 256) as u32, 1, 1], + block_dim = [256, 1, 1], + args = ( self.x as *const [Real; 1124864], self.y as *const [Real; 1124864], self.z as *const [Real; 1124864], - self.vol as *mut [Real; 1124864], + vol_reg, self.vnormq, jp, kp, fpz, lpz, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -159,102 +171,87 @@ impl KernelBase for Vol3D { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _vol3d( - x: *const [Real; 1124864], - y: *const [Real; 1124864], - z: *const [Real; 1124864], - vol: *mut [Real; 1124864], - vnormq: Real, - jp: usize, - kp: usize, - fpz: usize, - lpz: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _vol3d( +#[offload_kernel] +fn vol3d( x: *const [Real; 1124864], y: *const [Real; 1124864], z: *const [Real; 1124864], - vol: *mut [Real; 1124864], + mut vol: Region>, vnormq: Real, jp: usize, kp: usize, fpz: usize, lpz: usize, ) { - let idx = unsafe { (block_idx_x() * 256 + thread_idx_x()) as usize }; + let idx = OffsetStride1D::<256>::index(); let i = fpz + idx; if i > lpz { return; } - unsafe { - let i0 = i; - let i1 = i + 1; - let i2 = i + jp; - let i3 = i + 1 + jp; - let i4 = i + kp; - let i5 = i + 1 + kp; - let i6 = i + jp + kp; - let i7 = i + 1 + jp + kp; - - let x71 = (*x)[i7] - (*x)[i1]; - let x72 = (*x)[i7] - (*x)[i2]; - let x74 = (*x)[i7] - (*x)[i4]; - let x30 = (*x)[i3] - (*x)[i0]; - let x50 = (*x)[i5] - (*x)[i0]; - let x60 = (*x)[i6] - (*x)[i0]; - - let y71 = (*y)[i7] - (*y)[i1]; - let y72 = (*y)[i7] - (*y)[i2]; - let y74 = (*y)[i7] - (*y)[i4]; - let y30 = (*y)[i3] - (*y)[i0]; - let y50 = (*y)[i5] - (*y)[i0]; - let y60 = (*y)[i6] - (*y)[i0]; - - let z71 = (*z)[i7] - (*z)[i1]; - let z72 = (*z)[i7] - (*z)[i2]; - let z74 = (*z)[i7] - (*z)[i4]; - let z30 = (*z)[i3] - (*z)[i0]; - let z50 = (*z)[i5] - (*z)[i0]; - let z60 = (*z)[i6] - 
(*z)[i0]; - - let mut xps = x71 + x60; - let mut yps = y71 + y60; - let mut zps = z71 + z60; - - let mut cyz = y72 * z30 - z72 * y30; - let mut czx = z72 * x30 - x72 * z30; - let mut cxy = x72 * y30 - y72 * x30; - let mut v = xps * cyz + yps * czx + zps * cxy; - - xps = x72 + x50; - yps = y72 + y50; - zps = z72 + z50; - - cyz = y74 * z60 - z74 * y60; - czx = z74 * x60 - x74 * z60; - cxy = x74 * y60 - y74 * x60; - v += xps * cyz + yps * czx + zps * cxy; - - xps = x74 + x30; - yps = y74 + y30; - zps = z74 + z30; - - cyz = y71 * z50 - z71 * y50; - czx = z71 * x50 - x71 * z50; - cxy = x71 * y50 - y71 * x50; - v += xps * cyz + yps * czx + zps * cxy; - - (*vol)[i] = v * vnormq; + if let Some(mut vvol) = vol.get_mut() { + unsafe { + let i0 = i; + let i1 = i + 1; + let i2 = i + jp; + let i3 = i + 1 + jp; + let i4 = i + kp; + let i5 = i + 1 + kp; + let i6 = i + jp + kp; + let i7 = i + 1 + jp + kp; + + let x71 = (*x)[i7] - (*x)[i1]; + let x72 = (*x)[i7] - (*x)[i2]; + let x74 = (*x)[i7] - (*x)[i4]; + let x30 = (*x)[i3] - (*x)[i0]; + let x50 = (*x)[i5] - (*x)[i0]; + let x60 = (*x)[i6] - (*x)[i0]; + + let y71 = (*y)[i7] - (*y)[i1]; + let y72 = (*y)[i7] - (*y)[i2]; + let y74 = (*y)[i7] - (*y)[i4]; + let y30 = (*y)[i3] - (*y)[i0]; + let y50 = (*y)[i5] - (*y)[i0]; + let y60 = (*y)[i6] - (*y)[i0]; + + let z71 = (*z)[i7] - (*z)[i1]; + let z72 = (*z)[i7] - (*z)[i2]; + let z74 = (*z)[i7] - (*z)[i4]; + let z30 = (*z)[i3] - (*z)[i0]; + let z50 = (*z)[i5] - (*z)[i0]; + let z60 = (*z)[i6] - (*z)[i0]; + + let mut xps = x71 + x60; + let mut yps = y71 + y60; + let mut zps = z71 + z60; + + let mut cyz = y72 * z30 - z72 * y30; + let mut czx = z72 * x30 - x72 * z30; + let mut cxy = x72 * y30 - y72 * x30; + let mut v = xps * cyz + yps * czx + zps * cxy; + + xps = x72 + x50; + yps = y72 + y50; + zps = z72 + z50; + + cyz = y74 * z60 - z74 * y60; + czx = z74 * x60 - x74 * z60; + cxy = x74 * y60 - y74 * x60; + v += xps * cyz + yps * czx + zps * cxy; + + xps = x74 + x30; + yps = y74 + y30; + zps = 
z74 + z30; + + cyz = y71 * z50 - z71 * y50; + czx = z71 * x50 - x71 * z50; + cxy = x71 * y50 - y71 * x50; + v += xps * cyz + yps * czx + zps * cxy; + + vvol.set(fpz, v * vnormq); + } } } From 67940cf429886f48d68daaf811f7256b244667be Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 17:37:00 +0200 Subject: [PATCH 19/20] use macros on del_dot_vec_2d, ltimes and matvec_3d_stencil --- src/apps/del_dot_vec_2d.rs | 85 ++++++++++++++--------------------- src/apps/ltimes.rs | 41 +++++++---------- src/apps/matvec_3d_stencil.rs | 54 +++++++--------------- 3 files changed, 67 insertions(+), 113 deletions(-) diff --git a/src/apps/del_dot_vec_2d.rs b/src/apps/del_dot_vec_2d.rs index 8aaf7ef..c232c38 100644 --- a/src/apps/del_dot_vec_2d.rs +++ b/src/apps/del_dot_vec_2d.rs @@ -17,6 +17,15 @@ const N_REAL_ZONES: usize = (IMAX - IMIN) * (JMAX - JMIN); const THREADS_PER_BLOCK: u32 = 256; const BLOCKS: u32 = (N_REAL_ZONES as u32).div_ceil(THREADS_PER_BLOCK); +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "amdgpu")] use core::arch::amdgpu::{workgroup_id_x as block_idx_x, workitem_id_x as thread_idx_x}; #[cfg(target_arch = "nvptx64")] @@ -148,12 +157,11 @@ impl KernelBase for DelDotVec2D { let fy4 = unsafe { self.ydot.add(JP) as *const Real }; unsafe { - core::intrinsics::offload::<_, _, ()>( - _del_dot_vec_2d, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - 0, - ( + offload! 
{ + kernel = del_dot_vec_2d, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( self.div as *mut [Real; NNALLS], &*x1, x2, @@ -176,7 +184,7 @@ impl KernelBase for DelDotVec2D { ptiny, N_REAL_ZONES, ), - ); + }; } } @@ -202,40 +210,11 @@ impl KernelBase for DelDotVec2D { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _del_dot_vec_2d( - div: *mut [Real; NNALLS], - x1: &[Real; NNALLS], - x2: *const Real, - x3: *const Real, - x4: *const Real, - y1: &[Real; NNALLS], - y2: *const Real, - y3: *const Real, - y4: *const Real, - fx1: &[Real; NNALLS], - fx2: *const Real, - fx3: *const Real, - fx4: *const Real, - fy1: &[Real; NNALLS], - fy2: *const Real, - fy3: *const Real, - fy4: *const Real, - real_zones: &[usize; N_REAL_ZONES], - half: Real, - ptiny: Real, - iend: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::{Real, RealExt}; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _del_dot_vec_2d( +#[offload_kernel] +fn del_dot_vec_2d( div: *mut [Real; NNALLS], x1: &[Real; NNALLS], x2: *const Real, @@ -262,27 +241,29 @@ pub unsafe extern "gpu-kernel" fn _del_dot_vec_2d( if ii < iend { let i = real_zones[ii]; - let xi = half * (x1[i] + *x2.add(i) - *x3.add(i) - *x4.add(i)); - let xj = half * (*x2.add(i) + *x3.add(i) - *x4.add(i) - x1[i]); + unsafe { + let xi = half * (x1[i] + *x2.add(i) - *x3.add(i) - *x4.add(i)); + let xj = half * (*x2.add(i) + *x3.add(i) - *x4.add(i) - x1[i]); - let yi = half * (y1[i] + *y2.add(i) - *y3.add(i) - *y4.add(i)); - let yj = half * (*y2.add(i) + *y3.add(i) - *y4.add(i) - y1[i]); + let yi = half * (y1[i] + *y2.add(i) - *y3.add(i) - *y4.add(i)); + let yj = half * (*y2.add(i) + *y3.add(i) - *y4.add(i) - y1[i]); - let fxi = half * (fx1[i] + *fx2.add(i) - *fx3.add(i) - *fx4.add(i)); - let fxj = half * (*fx2.add(i) + *fx3.add(i) - *fx4.add(i) - fx1[i]); + let fxi = half * (fx1[i] + *fx2.add(i) - *fx3.add(i) - 
*fx4.add(i)); + let fxj = half * (*fx2.add(i) + *fx3.add(i) - *fx4.add(i) - fx1[i]); - let fyi = half * (fy1[i] + *fy2.add(i) - *fy3.add(i) - *fy4.add(i)); - let fyj = half * (*fy2.add(i) + *fy3.add(i) - *fy4.add(i) - fy1[i]); + let fyi = half * (fy1[i] + *fy2.add(i) - *fy3.add(i) - *fy4.add(i)); + let fyj = half * (*fy2.add(i) + *fy3.add(i) - *fy4.add(i) - fy1[i]); - let rarea = Real::from(1.0) / (xi * yj - xj * yi + ptiny); + let rarea = Real::from(1.0) / (xi * yj - xj * yi + ptiny); - let dfxdx = rarea * (fxi * yj - fxj * yi); + let dfxdx = rarea * (fxi * yj - fxj * yi); - let dfydy = rarea * (fyj * xi - fyi * xj); + let dfydy = rarea * (fyj * xi - fyi * xj); - let affine = (fy1[i] + *fy2.add(i) + *fy3.add(i) + *fy4.add(i)) - / (y1[i] + *y2.add(i) + *y3.add(i) + *y4.add(i)); + let affine = (fy1[i] + *fy2.add(i) + *fy3.add(i) + *fy4.add(i)) + / (y1[i] + *y2.add(i) + *y3.add(i) + *y4.add(i)); - (*div)[i] = dfxdx + dfydy + affine; + (*div)[i] = dfxdx + dfydy + affine; + } } } diff --git a/src/apps/ltimes.rs b/src/apps/ltimes.rs index cb98988..1e9e10f 100644 --- a/src/apps/ltimes.rs +++ b/src/apps/ltimes.rs @@ -3,6 +3,15 @@ pub const NUM_G: usize = 32; pub const NUM_M: usize = 25; const DEFAULT_REPS: u32 = 50; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{ _block_idx_x as block_idx_x, _block_idx_y as block_idx_y, _block_idx_z as block_idx_z, @@ -101,12 +110,11 @@ impl KernelBase for LTimes { let grid_y = NUM_G.div_ceil(g_block); let grid_z = num_z.div_ceil(z_block); - core::intrinsics::offload::<_, _, ()>( - _ltimes, - [grid_x as u32, grid_y as u32, grid_z as u32], - [m_block as u32, g_block as u32, z_block as u32], - 0, - ( + offload! 
{ + kernel = ltimes, + grid_dim = [grid_x as u32, grid_y as u32, grid_z as u32], + block_dim = [m_block as u32, g_block as u32, z_block as u32], + args = ( self.phidat as *mut [Real; 390400], self.elldat as *const [Real; 1600], self.psidat as *const [Real; 999424], @@ -115,7 +123,7 @@ impl KernelBase for LTimes { NUM_G, num_z, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -136,26 +144,11 @@ impl KernelBase for LTimes { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _ltimes( - phi: *mut [Real; 390400], - ell: &[Real; 1600], - psi: &[Real; 999424], - num_d: usize, - num_m: usize, - num_g: usize, - num_z: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _ltimes( +#[offload_kernel] +fn ltimes( phi: *mut [Real; 390400], ell: &[Real; 1600], psi: &[Real; 999424], diff --git a/src/apps/matvec_3d_stencil.rs b/src/apps/matvec_3d_stencil.rs index 1707834..a6f5e8f 100644 --- a/src/apps/matvec_3d_stencil.rs +++ b/src/apps/matvec_3d_stencil.rs @@ -1,6 +1,15 @@ pub const N_DEFAULT: usize = 1000000; const DEFAULT_REPS: u32 = 100; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{_block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x}; @@ -126,12 +135,11 @@ impl KernelBase for Matvec3DStencil { let jp = self.jp; let kp = self.kp; - core::intrinsics::offload::<_, _, ()>( - _matvec3dstencil, - [n.div_ceil(256) as u32, 1, 1], - [256, 1, 1], - 0, - ( + offload! 
{ + kernel = matvec3dstencil, + grid_dim = [n.div_ceil(256) as u32, 1, 1], + block_dim = [256, 1, 1], + args = ( self.x as *const [Real; 1124864], self.b as *mut [Real; 1124864], self.matrix[0] as *const [Real; 1124864], @@ -153,7 +161,7 @@ impl KernelBase for Matvec3DStencil { kp, n, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -177,39 +185,11 @@ impl KernelBase for Matvec3DStencil { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _matvec3dstencil( - x: *const [Real; 1124864], - b: *mut [Real; 1124864], - m0: *const [Real; 1124864], - m1: *const [Real; 1124864], - m2: *const [Real; 1124864], - m3: *const [Real; 1124864], - m4: *const [Real; 1124864], - m5: *const [Real; 1124864], - m6: *const [Real; 1124864], - m7: *const [Real; 1124864], - m8: *const [Real; 1124864], - m9: *const [Real; 1124864], - m10: *const [Real; 1124864], - m11: *const [Real; 1124864], - m12: *const [Real; 1124864], - m13: *const [Real; 1124864], - real_zones: *const [u64; 1000000], - jp: usize, - kp: usize, - n: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _matvec3dstencil( +#[offload_kernel] +fn matvec3dstencil( x: *const [Real; 1124864], b: *mut [Real; 1124864], m0: *const [Real; 1124864], From d95bb4e7aa95b99030d23212d3d17aa9d8195229 Mon Sep 17 00:00:00 2001 From: Sa4dUs Date: Wed, 3 Jun 2026 17:52:27 +0200 Subject: [PATCH 20/20] complete ltimes --- crates/rustc_offload_frontend/src/gpu.rs | 6 +- .../rustc_offload_frontend/src/partition.rs | 73 ++++++++++++++++--- src/apps/ltimes.rs | 25 ++++--- 3 files changed, 79 insertions(+), 25 deletions(-) diff --git a/crates/rustc_offload_frontend/src/gpu.rs b/crates/rustc_offload_frontend/src/gpu.rs index 4231788..2d7df26 100644 --- a/crates/rustc_offload_frontend/src/gpu.rs +++ b/crates/rustc_offload_frontend/src/gpu.rs @@ -19,7 +19,7 @@ pub(crate) fn global_thread_dim() 
-> Dim3 { Dim3 { x: 0, y: 0, z: 0 } } -pub (crate) fn block_idx() -> Dim3 { +pub(crate) fn block_idx() -> Dim3 { #[cfg(target_arch = "nvptx64")] unsafe { use core::arch::nvptx::*; @@ -33,7 +33,7 @@ pub (crate) fn block_idx() -> Dim3 { Dim3 { x: 0, y: 0, z: 0 } } -pub (crate) fn block_dim() -> Dim3 { +pub(crate) fn block_dim() -> Dim3 { #[cfg(target_arch = "nvptx64")] unsafe { use core::arch::nvptx::*; @@ -47,7 +47,7 @@ pub (crate) fn block_dim() -> Dim3 { Dim3 { x: 0, y: 0, z: 0 } } -pub (crate) fn thread_idx() -> Dim3 { +pub(crate) fn thread_idx() -> Dim3 { #[cfg(target_arch = "nvptx64")] unsafe { use core::arch::nvptx::*; diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs index 84a2cfb..8cde141 100644 --- a/crates/rustc_offload_frontend/src/partition.rs +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -1,7 +1,7 @@ -use crate::gpu::{ global_thread_dim, block_idx, block_dim, thread_idx }; +use crate::gpu::{block_dim, block_idx, global_thread_dim, thread_idx}; use core::convert::From; -use core::prelude::v1::*; use core::offload::offload::PreloadMut; +use core::prelude::v1::*; pub unsafe trait PartitioningStrategy { type View<'a, T: 'a>; @@ -139,12 +139,8 @@ unsafe impl PartitioningStrategy for Linear2D { // stride1d #[derive(Debug, Copy, Clone)] -pub struct Stride1D< - const STRIDE: usize, ->; -unsafe impl - PartitioningStrategy for Stride1D -{ +pub struct Stride1D; +unsafe impl PartitioningStrategy for Stride1D { type View<'a, T: 'a> = &'a T; type ViewMut<'a, T: 'a> = &'a mut T; @@ -171,7 +167,6 @@ unsafe impl } } - // stride2d #[derive(Debug, Copy, Clone)] pub struct StrideViewMut<'a, T> { @@ -260,7 +255,7 @@ unsafe impl PartitioningStrategy for OffsetStride1D unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { let idx = Self::index(); - + if idx < len { Some(OffsetStrideViewMut { base_ptr: ptr, @@ -273,3 +268,61 @@ unsafe impl PartitioningStrategy for OffsetStride1D } } } + +// for 
ltimes +#[derive(Debug, Copy, Clone)] +pub struct Stride3D< + const BX: usize, + const BY: usize, + const BZ: usize, + const MAX_X: usize, + const MAX_Y: usize, +>; + +unsafe impl< + const BX: usize, + const BY: usize, + const BZ: usize, + const MAX_X: usize, + const MAX_Y: usize, +> PartitioningStrategy for Stride3D +{ + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + fn index() -> usize { + let mx = (block_idx().x * BX) + thread_idx().x; + let gy = (block_idx().y * BY) + thread_idx().y; + let zz = (block_idx().z * BZ) + thread_idx().z; + + mx + MAX_X * (gy + MAX_Y * zz) + } + + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let mx = (block_idx().x * BX) + thread_idx().x; + let gy = (block_idx().y * BY) + thread_idx().y; + let zz = (block_idx().z * BZ) + thread_idx().z; + + if mx < MAX_X && gy < MAX_Y { + let idx = mx + MAX_X * (gy + MAX_Y * zz); + if idx < len { + return Some(unsafe { &*ptr.add(idx) }); + } + } + None + } + + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let mx = (block_idx().x * BX) + thread_idx().x; + let gy = (block_idx().y * BY) + thread_idx().y; + let zz = (block_idx().z * BZ) + thread_idx().z; + + if mx < MAX_X && gy < MAX_Y { + let idx = mx + MAX_X * (gy + MAX_Y * zz); + if idx < len { + return Some(unsafe { &mut *ptr.add(idx) }); + } + } + None + } +} diff --git a/src/apps/ltimes.rs b/src/apps/ltimes.rs index 1e9e10f..568e89b 100644 --- a/src/apps/ltimes.rs +++ b/src/apps/ltimes.rs @@ -4,7 +4,7 @@ pub const NUM_M: usize = 25; const DEFAULT_REPS: u32 = 50; use core::offload::offload_kernel; -use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride3D}; #[cfg(target_os = "linux")] use rustc_offload_frontend::offload; @@ -110,12 +110,16 @@ impl KernelBase for LTimes { let grid_y = NUM_G.div_ceil(g_block); let grid_z = num_z.div_ceil(z_block); + let mut phidat = unsafe { &mut 
*(self.phidat as *mut [Real; 390400]) }; + let p: PreloadMut<[Real; 390400]> = preload_mut(&mut phidat); + let mut phidat_reg = Region::<'_, _, Stride3D<32, 8, 1, 25, 32>>::from(&p); + offload! { kernel = ltimes, grid_dim = [grid_x as u32, grid_y as u32, grid_z as u32], block_dim = [m_block as u32, g_block as u32, z_block as u32], args = ( - self.phidat as *mut [Real; 390400], + phidat_reg, self.elldat as *const [Real; 1600], self.psidat as *const [Real; 999424], NUM_D, @@ -149,7 +153,7 @@ use crate::common::types::Real; #[offload_kernel] fn ltimes( - phi: *mut [Real; 390400], + mut phi: Region>, ell: &[Real; 1600], psi: &[Real; 999424], num_d: usize, @@ -157,22 +161,19 @@ fn ltimes( num_g: usize, num_z: usize, ) { - let num_m = NUM_M; - let num_g = NUM_G; let num_d = NUM_D; + let num_g = NUM_G; - let m = (block_idx_x() * 32 + thread_idx_x()) as usize; - let g = (block_idx_y() * 8 + thread_idx_y()) as usize; - let z = (block_idx_z() * 1 + thread_idx_z()) as usize; - - if m < num_m && g < num_g && z < num_z { - let phi_idx = m + num_m * (g + num_g * z); + let m = unsafe { (block_idx_x() * 32 + thread_idx_x()) as usize }; + let g = unsafe { (block_idx_y() * 8 + thread_idx_y()) as usize }; + let z = unsafe { (block_idx_z() * 1 + thread_idx_z()) as usize }; + if let Some(v) = phi.get_mut() { for d in 0..num_d { let ell_idx = d + num_d * m; let psi_idx = d + num_d * (g + num_g * z); - (*phi)[phi_idx] += (*ell)[ell_idx] * (*psi)[psi_idx]; + *v += ell[ell_idx] * psi[psi_idx]; } } }