diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..d76dec3 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "crates/rustc_offload_frontend"] + path = crates/rustc_offload_frontend + url = https://github.com/sa4dus/rustc_offload_frontend + branch = main diff --git a/Cargo.lock b/Cargo.lock index 0f5194d..a589b02 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,14 @@ checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "rust_perf" version = "0.1.0" +dependencies = [ + "libc", + "rustc_offload_frontend", +] + +[[package]] +name = "rustc_offload_frontend" +version = "0.1.0" dependencies = [ "libc", ] diff --git a/Cargo.toml b/Cargo.toml index c1032a5..ee31154 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,10 +4,16 @@ version = "0.1.0" edition = "2024" [lib] -crate-type = ["cdylib"] +crate-type = ["rlib", "cdylib"] +path = "src/lib.rs" + +[[bin]] +name = "main" +path = "src/main.rs" [dependencies] libc = { version = "0.2.175", default-features = false } +rustc_offload_frontend = { path = "crates/rustc_offload_frontend" } [features] default = ["all", "f64"] diff --git a/crates/rustc_offload_frontend/.gitignore b/crates/rustc_offload_frontend/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/crates/rustc_offload_frontend/.gitignore @@ -0,0 +1 @@ +/target diff --git a/crates/rustc_offload_frontend/Cargo.lock b/crates/rustc_offload_frontend/Cargo.lock new file mode 100644 index 0000000..c4532ca --- /dev/null +++ b/crates/rustc_offload_frontend/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "rustc_offload_frontend" +version = "0.1.0" +dependencies = [ + "libc", +] diff --git a/crates/rustc_offload_frontend/Cargo.toml b/crates/rustc_offload_frontend/Cargo.toml new file mode 100644 index 0000000..e1dd08f --- /dev/null +++ b/crates/rustc_offload_frontend/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "rustc_offload_frontend" +version = "0.1.0" +edition = "2024" +license = "MIT" +description = "rustc offload feature frontend draft" +repository = "https://github.com/sa4dus/rustc_offload_frontend" +readme = "README.md" + +[lib] +crate-type = ["rlib", "cdylib"] +path = "src/lib.rs" + +[[bin]] +name = "main" +path = "src/main.rs" + +[dependencies] +libc = { version = "0.2.175", default-features = false } + +[features] + +[profile.release] +lto = "fat" +panic = "abort" + +[profile.dev] +lto = "fat" +panic = "abort" diff --git a/crates/rustc_offload_frontend/README.md b/crates/rustc_offload_frontend/README.md new file mode 100644 index 0000000..d518681 --- /dev/null +++ b/crates/rustc_offload_frontend/README.md @@ -0,0 +1 @@ +# rustc_offload_frontend diff --git a/crates/rustc_offload_frontend/src/gpu.rs b/crates/rustc_offload_frontend/src/gpu.rs new file mode 100644 index 0000000..2d7df26 --- /dev/null +++ b/crates/rustc_offload_frontend/src/gpu.rs @@ -0,0 +1,62 @@ +#[derive(Clone, Copy)] +pub struct Dim3 { + pub x: usize, + pub y: usize, + pub z: usize, +} + +pub(crate) fn global_thread_dim() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: (_block_idx_x() * _block_dim_x() + _thread_idx_x()) as usize, + y: (_block_idx_y() * _block_dim_y() + _thread_idx_y()) as usize, + z: (_block_idx_z() * _block_dim_z() + _thread_idx_z()) as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} + +pub(crate) fn block_idx() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: _block_idx_x() as usize, + y: _block_idx_y() as usize, + z: _block_idx_z() as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} + +pub(crate) fn block_dim() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: _block_dim_x() as usize, + y: _block_dim_y() as usize, + z: _block_dim_z() as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} + +pub(crate) fn thread_idx() -> Dim3 { + #[cfg(target_arch = "nvptx64")] + unsafe { + use core::arch::nvptx::*; + Dim3 { + x: _thread_idx_x() as usize, + y: _thread_idx_y() as usize, + z: _thread_idx_z() as usize, + } + } + #[cfg(target_os = "linux")] + Dim3 { x: 0, y: 0, z: 0 } +} diff --git a/crates/rustc_offload_frontend/src/lib.rs b/crates/rustc_offload_frontend/src/lib.rs new file mode 100644 index 0000000..a422fc0 --- /dev/null +++ b/crates/rustc_offload_frontend/src/lib.rs @@ -0,0 +1,69 @@ +#![allow(internal_features)] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload, offload)] +#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))] +#![cfg_attr(target_arch = "nvptx64", no_std)] + +pub use core::offload::offload_kernel; + +pub mod gpu; +pub mod partition; + +#[macro_export] +macro_rules! offload { + ( $($field:ident = $val:expr),* $(,)? ) => { + $crate::offload!(@munch + [ $($field = $val),* ]; + kernel = NONE; + grid_dim = ([1, 1, 1]); + block_dim = ([1, 1, 1]); + dyn_cache = (0); + args = NONE + ); + }; + + (@munch [kernel = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = (SOME $val); grid_dim = $g; block_dim = $b; dyn_cache = $d; args = $a); + }; + (@munch [grid_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = ($val); block_dim = $b; dyn_cache = $d; args = $a); + }; + (@munch [block_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = ($val); dyn_cache = $d; args = $a); + }; + (@munch [dyn_cache = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; dyn_cache = ($val); args = $a); + }; + (@munch [args = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; dyn_cache = $d; args = (SOME $val)); + }; + + (@munch [$invalid:ident = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + compile_error!(concat!("unknown field ", stringify!($invalid))); + }; + + (@munch []; kernel = NONE; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => { + compile_error!("missing `kernel`"); + }; + (@munch []; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = NONE) => { + compile_error!("missing `args`"); + }; + (@munch []; kernel = (SOME $kernel:expr); grid_dim = ($grid_dim:expr); block_dim = ($block_dim:expr); dyn_cache = ($dyn_cache:expr); args = (SOME $args:expr)) => { + core::intrinsics::offload::<_, _, ()>( + $kernel, + $grid_dim, + $block_dim, + $dyn_cache, + $args, + ) + }; +} + +#[cfg(target_arch = "nvptx64")] +#[panic_handler] +fn panic(_: &core::panic::PanicInfo) -> ! { + loop {} +} diff --git a/crates/rustc_offload_frontend/src/main.rs b/crates/rustc_offload_frontend/src/main.rs new file mode 100644 index 0000000..7ffd221 --- /dev/null +++ b/crates/rustc_offload_frontend/src/main.rs @@ -0,0 +1,145 @@ +#![allow(internal_features)] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload)] +#![cfg_attr(target_os = "linux", feature(core_intrinsics, offload))] +#![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] +#![cfg_attr(target_arch = "nvptx64", no_std)] +#![cfg_attr(target_arch = "nvptx64", no_main)] + +#[cfg(target_os = "linux")] +extern crate libc; + +use rustc_offload_frontend::offload_kernel; +use rustc_offload_frontend::partition::{Linear1D, Linear2D, Region, Stride2D}; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + +#[cfg(target_arch = "nvptx64")] +use rustc_offload_frontend::partition::PartitioningStrategy; + +#[offload_kernel] +fn linear1d(mut x: Region) { + if let Some(e) = x.get_mut() { + *e = 42.0; + } +} + +#[offload_kernel] +fn stride2d(mut grid: Region>) { + if let Some(mut view) = grid.get_mut() { + view.set(0, 0, 42.0); + view.set(1, 1, 42.0); + } +} + +#[offload_kernel] +fn conv_blur2d(input: &[f64], mut output: Region>) { + if let Some(out_cell) = output.get_mut() { + let mut sum = 0.0; + + for dy in -1..=1 { + for dx in -1..=1 { + let idx = (Linear2D::<4>::index() as isize + dy * 4 as isize + dx) as usize; + if let Some(v) = input.get(idx) { + sum += v; + } + } + } + + *out_cell = sum / 9.0; + } +} + +#[offload_kernel] +fn saxpy_kernel(alpha: f32, x: &[f32], mut y: Region) { + if let (Some(val_x), Some(val_y)) = (x.get(Linear1D::index()), y.get_mut()) { + *val_y = alpha * (*val_x) + (*val_y); + } +} + +#[cfg(target_os = "linux")] +fn main() { + use rustc_offload_frontend::offload; + + // linear1d + let mut x = [0.0f64; 256]; + let p: PreloadMut<[f64; 256]> = preload_mut(&mut x); + let mut reg = Region::<'_, _, Linear1D>::from(&p); + offload! { + kernel = linear1d, + grid_dim = [256, 1, 1], + args = (reg,), + }; + drop(p); + for i in 0..x.len() { + assert_eq!(x[i], 42.0 as f64); + } + println!("::passed:: linear1d"); + + // stride2d + let mut blocks = [0.0; 64]; + let p: PreloadMut<[f64; 64]> = preload_mut(&mut blocks); + let mut reg_stride = Region::<_, Stride2D<2, 2, 4, 4, 8>>::from(&p); + offload! { + kernel = stride2d, + block_dim = [2, 2, 1], + args = (reg_stride,), + }; + drop(p); + // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements + assert_eq!(blocks[0], 42.0); + assert_eq!(blocks[9], 42.0); + println!("::passed:: stride2d"); + + // conv_blur2d + let input = [ + 0.0, 0.0, 0.0, 0.0, // + 0.0, 9.0, 9.0, 0.0, // + 0.0, 9.0, 9.0, 0.0, // + 0.0, 0.0, 0.0, 0.0, // + ]; + let mut output = [0.0f64; 16]; + let p: PreloadMut<[f64; 16]> = preload_mut(&mut output); + let mut reg_output = Region::<_, Linear2D<4>>::from(&p); + offload! { + kernel = conv_blur2d, + block_dim = [4, 4, 1], + args = (&input as &[f64], reg_output,), + }; + drop(p); + + let expected = [ + 1.0, 2.0, 2.0, 1.0, // + 2.0, 4.0, 4.0, 2.0, // + 2.0, 4.0, 4.0, 2.0, // + 1.0, 2.0, 2.0, 1.0, // + ]; + assert_eq!(output, expected); + println!("::passed:: conv_blur2d"); + + // saxpy + const N: usize = 512; + let alpha: f32 = 2.5; + let x: [f32; N] = [2.0; N]; + let mut y: [f32; N] = [1.0; N]; + let p: PreloadMut<[f32; N]> = preload_mut(&mut y); + let mut reg_y = Region::<_, Linear1D>::from(&p); + + offload! { + kernel = saxpy_kernel, + grid_dim = [N as u32, 1, 1], + args = (alpha, &x as &[f32], reg_y,), + }; + drop(p); + + for i in 0..N { + assert_eq!(y[i], 6.0f32); + } + println!("::passed:: saxpy"); + + println!("all checks passed!"); +} diff --git a/crates/rustc_offload_frontend/src/partition.rs b/crates/rustc_offload_frontend/src/partition.rs new file mode 100644 index 0000000..8cde141 --- /dev/null +++ b/crates/rustc_offload_frontend/src/partition.rs @@ -0,0 +1,328 @@ +use crate::gpu::{block_dim, block_idx, global_thread_dim, thread_idx}; +use core::convert::From; +use core::offload::offload::PreloadMut; +use core::prelude::v1::*; + +pub unsafe trait PartitioningStrategy { + type View<'a, T: 'a>; + type ViewMut<'a, T: 'a>; + + fn index() -> usize; + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option>; + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option>; +} + +#[derive(Debug, Copy, Clone)] +pub struct Region<'a, T, S: PartitioningStrategy> { + ptr: *mut T, + len: usize, + _marker: core::marker::PhantomData<(&'a mut [T], S)>, +} + +impl<'a, T, const N: usize, S> From<&PreloadMut<'a, [T; N]>> for Region<'a, T, S> +where + S: PartitioningStrategy, +{ + fn from(p: &PreloadMut<'a, [T; N]>) -> Self { + Self { + ptr: p.cpu_ptr as *mut T, + len: N, + _marker: core::marker::PhantomData, + } + } +} + +pub struct RawRegion<'a, T> { + pub ptr: *mut T, + pub len: usize, + _marker: core::marker::PhantomData<&'a mut [T]>, +} + +impl<'a, T> From<&'a mut [T]> for RawRegion<'a, T> { + fn from(data: &'a mut [T]) -> Self { + Self { + ptr: data.as_mut_ptr(), + len: data.len(), + _marker: core::marker::PhantomData, + } + } +} + +impl<'a, T, const N: usize> From<&'a mut [T; N]> for RawRegion<'a, T> { + fn from(data: &'a mut [T; N]) -> Self { + Self { + ptr: data.as_mut_ptr(), + len: N, + _marker: core::marker::PhantomData, + } + } +} + +impl<'a, T, S: PartitioningStrategy> Region<'a, T, S> { + pub fn new(data: D) -> Self + where + D: Into>, + { + let raw = data.into(); + Self { + ptr: raw.ptr, + len: raw.len, + _marker: core::marker::PhantomData, + } + } + + pub fn get(&self) -> Option> { + unsafe { S::get(self.ptr as *const T, self.len) } + } + + pub fn get_mut(&mut self) -> Option> { + unsafe { S::get_mut(self.ptr, self.len) } + } +} + +// linear1d +#[derive(Debug, Copy, Clone)] +pub struct Linear1D; +unsafe impl PartitioningStrategy for Linear1D { + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + fn index() -> usize { + global_thread_dim().x + } + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &*ptr.add(idx) }) + } else { + None + } + } + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &mut *ptr.add(idx) }) + } else { + None + } + } +} + +// linear2d +#[derive(Debug, Copy, Clone)] +pub struct Linear2D; +unsafe impl PartitioningStrategy for Linear2D { + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + fn index() -> usize { + let tid = global_thread_dim(); + tid.y * W + tid.x + } + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &*ptr.add(idx) }) + } else { + None + } + } + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &mut *ptr.add(idx) }) + } else { + None + } + } +} + +// stride1d +#[derive(Debug, Copy, Clone)] +pub struct Stride1D; +unsafe impl PartitioningStrategy for Stride1D { + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + fn index() -> usize { + let bidx = block_idx().x; + let tidx = thread_idx().x; + bidx * STRIDE + tidx + } + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &*ptr.add(idx) }) + } else { + None + } + } + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); + if idx < len { + Some(unsafe { &mut *ptr.add(idx) }) + } else { + None + } + } +} + +// stride2d +#[derive(Debug, Copy, Clone)] +pub struct StrideViewMut<'a, T> { + block_ptr: *mut T, + stride: usize, + _marker: core::marker::PhantomData<&'a mut T>, +} +impl<'a, T> StrideViewMut<'a, T> { + pub fn set(&mut self, x: usize, y: usize, val: T) { + unsafe { + *self.block_ptr.add(y * self.stride + x) = val; + } + } +} + +#[derive(Debug, Copy, Clone)] +pub struct Stride2D< + const W: usize, + const H: usize, + const SX: usize, + const SY: usize, + const STRIDE: usize, +>; +unsafe impl + PartitioningStrategy for Stride2D +{ + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = StrideViewMut<'a, T>; + + fn index() -> usize { + let tid = global_thread_dim(); + tid.y * SY * STRIDE + tid.x * SX + } + unsafe fn get<'a, T>(_: *const T, _: usize) -> Option> { + unimplemented!() + } + unsafe fn get_mut<'a, T>(ptr: *mut T, _: usize) -> Option> { + let idx = Self::index(); + Some(StrideViewMut { + block_ptr: unsafe { ptr.add(idx) }, + stride: STRIDE, + _marker: core::marker::PhantomData, + }) + } +} + +// some custom patterns needed for `rust_perf` + +// for vol3d +#[derive(Debug, Copy, Clone)] +pub struct OffsetStrideViewMut<'a, T> { + base_ptr: *mut T, + idx: usize, + len: usize, + _marker: core::marker::PhantomData<&'a mut T>, +} + +impl<'a, T> OffsetStrideViewMut<'a, T> { + pub fn set(&mut self, offset: usize, val: T) { + if let Some(final_idx) = self.idx.checked_add(offset) { + if final_idx < self.len { + unsafe { + *self.base_ptr.add(final_idx) = val; + } + } + } + } +} + +#[derive(Debug, Copy, Clone)] +pub struct OffsetStride1D; + +unsafe impl PartitioningStrategy for OffsetStride1D { + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = OffsetStrideViewMut<'a, T>; + + fn index() -> usize { + let bidx = block_idx().x; + let tidx = thread_idx().x; + bidx * STRIDE + tidx + } + + unsafe fn get<'a, T>(_: *const T, _: usize) -> Option> { + unimplemented!("write only") + } + + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let idx = Self::index(); + + if idx < len { + Some(OffsetStrideViewMut { + base_ptr: ptr, + idx, + len, + _marker: core::marker::PhantomData, + }) + } else { + None + } + } +} + +// for ltimes +#[derive(Debug, Copy, Clone)] +pub struct Stride3D< + const BX: usize, + const BY: usize, + const BZ: usize, + const MAX_X: usize, + const MAX_Y: usize, +>; + +unsafe impl< + const BX: usize, + const BY: usize, + const BZ: usize, + const MAX_X: usize, + const MAX_Y: usize, +> PartitioningStrategy for Stride3D +{ + type View<'a, T: 'a> = &'a T; + type ViewMut<'a, T: 'a> = &'a mut T; + + fn index() -> usize { + let mx = (block_idx().x * BX) + thread_idx().x; + let gy = (block_idx().y * BY) + thread_idx().y; + let zz = (block_idx().z * BZ) + thread_idx().z; + + mx + MAX_X * (gy + MAX_Y * zz) + } + + unsafe fn get<'a, T>(ptr: *const T, len: usize) -> Option> { + let mx = (block_idx().x * BX) + thread_idx().x; + let gy = (block_idx().y * BY) + thread_idx().y; + let zz = (block_idx().z * BZ) + thread_idx().z; + + if mx < MAX_X && gy < MAX_Y { + let idx = mx + MAX_X * (gy + MAX_Y * zz); + if idx < len { + return Some(unsafe { &*ptr.add(idx) }); + } + } + None + } + + unsafe fn get_mut<'a, T>(ptr: *mut T, len: usize) -> Option> { + let mx = (block_idx().x * BX) + thread_idx().x; + let gy = (block_idx().y * BY) + thread_idx().y; + let zz = (block_idx().z * BZ) + thread_idx().z; + + if mx < MAX_X && gy < MAX_Y { + let idx = mx + MAX_X * (gy + MAX_Y * zz); + if idx < len { + return Some(unsafe { &mut *ptr.add(idx) }); + } + } + None + } +} diff --git a/src/apps/del_dot_vec_2d.rs b/src/apps/del_dot_vec_2d.rs index 6add513..c232c38 100644 --- a/src/apps/del_dot_vec_2d.rs +++ b/src/apps/del_dot_vec_2d.rs @@ -17,6 +17,15 @@ const N_REAL_ZONES: usize = (IMAX - IMIN) * (JMAX - JMIN); const THREADS_PER_BLOCK: u32 = 256; const BLOCKS: u32 = (N_REAL_ZONES as u32).div_ceil(THREADS_PER_BLOCK); +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "amdgpu")] use core::arch::amdgpu::{workgroup_id_x as block_idx_x, workitem_id_x as thread_idx_x}; #[cfg(target_arch = "nvptx64")] @@ -148,11 +157,11 @@ impl KernelBase for DelDotVec2D { let fy4 = unsafe { self.ydot.add(JP) as *const Real }; unsafe { - core::intrinsics::offload::<_, _, ()>( - _del_dot_vec_2d, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - ( + offload! { + kernel = del_dot_vec_2d, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( self.div as *mut [Real; NNALLS], &*x1, x2, @@ -175,7 +184,7 @@ impl KernelBase for DelDotVec2D { ptiny, N_REAL_ZONES, ), - ); + }; } } @@ -201,40 +210,11 @@ impl KernelBase for DelDotVec2D { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _del_dot_vec_2d( - div: *mut [Real; NNALLS], - x1: &[Real; NNALLS], - x2: *const Real, - x3: *const Real, - x4: *const Real, - y1: &[Real; NNALLS], - y2: *const Real, - y3: *const Real, - y4: *const Real, - fx1: &[Real; NNALLS], - fx2: *const Real, - fx3: *const Real, - fx4: *const Real, - fy1: &[Real; NNALLS], - fy2: *const Real, - fy3: *const Real, - fy4: *const Real, - real_zones: &[usize; N_REAL_ZONES], - half: Real, - ptiny: Real, - iend: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::{Real, RealExt}; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _del_dot_vec_2d( +#[offload_kernel] +fn del_dot_vec_2d( div: *mut [Real; NNALLS], x1: &[Real; NNALLS], x2: *const Real, @@ -261,27 +241,29 @@ pub unsafe extern "gpu-kernel" fn _del_dot_vec_2d( if ii < iend { let i = real_zones[ii]; - let xi = half * (x1[i] + *x2.add(i) - *x3.add(i) - *x4.add(i)); - let xj = half * (*x2.add(i) + *x3.add(i) - *x4.add(i) - x1[i]); + unsafe { + let xi = half * (x1[i] + *x2.add(i) - *x3.add(i) - *x4.add(i)); + let xj = half * (*x2.add(i) + *x3.add(i) - *x4.add(i) - x1[i]); - let yi = half * (y1[i] + *y2.add(i) - *y3.add(i) - *y4.add(i)); - let yj = half * (*y2.add(i) + *y3.add(i) - *y4.add(i) - y1[i]); + let yi = half * (y1[i] + *y2.add(i) - *y3.add(i) - *y4.add(i)); + let yj = half * (*y2.add(i) + *y3.add(i) - *y4.add(i) - y1[i]); - let fxi = half * (fx1[i] + *fx2.add(i) - *fx3.add(i) - *fx4.add(i)); - let fxj = half * (*fx2.add(i) + *fx3.add(i) - *fx4.add(i) - fx1[i]); + let fxi = half * (fx1[i] + *fx2.add(i) - *fx3.add(i) - *fx4.add(i)); + let fxj = half * (*fx2.add(i) + *fx3.add(i) - *fx4.add(i) - fx1[i]); - let fyi = half * (fy1[i] + *fy2.add(i) - *fy3.add(i) - *fy4.add(i)); - let fyj = half * (*fy2.add(i) + *fy3.add(i) - *fy4.add(i) - fy1[i]); + let fyi = half * (fy1[i] + *fy2.add(i) - *fy3.add(i) - *fy4.add(i)); + let fyj = half * (*fy2.add(i) + *fy3.add(i) - *fy4.add(i) - fy1[i]); - let rarea = Real::from(1.0) / (xi * yj - xj * yi + ptiny); + let rarea = Real::from(1.0) / (xi * yj - xj * yi + ptiny); - let dfxdx = rarea * (fxi * yj - fxj * yi); + let dfxdx = rarea * (fxi * yj - fxj * yi); - let dfydy = rarea * (fyj * xi - fyi * xj); + let dfydy = rarea * (fyj * xi - fyi * xj); - let affine = (fy1[i] + *fy2.add(i) + *fy3.add(i) + *fy4.add(i)) - / (y1[i] + *y2.add(i) + *y3.add(i) + *y4.add(i)); + let affine = (fy1[i] + *fy2.add(i) + *fy3.add(i) + *fy4.add(i)) + / (y1[i] + *y2.add(i) + *y3.add(i) + *y4.add(i)); - (*div)[i] = dfxdx + dfydy + affine; + (*div)[i] = dfxdx + dfydy + affine; + } } } diff --git a/src/apps/energy.rs b/src/apps/energy.rs index 8fa9d51..071b869 100644 --- a/src/apps/energy.rs +++ b/src/apps/energy.rs @@ -8,6 +8,15 @@ const IEND: usize = DEFAULT_PROBLEM_SIZE; const THREADS_PER_BLOCK: u32 = 256; const BLOCKS: u32 = (IEND as u32).div_ceil(THREADS_PER_BLOCK); +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{Linear1D, PartitioningStrategy, Region}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{ _block_dim_x as block_dim_x, _block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x, @@ -126,77 +135,115 @@ impl KernelBase for Energy { } fn run_kernel(&mut self) { + let mut e_new = unsafe { &mut *(self.e_new as *mut [Real; IEND]) }; + let mut q_new = unsafe { &mut *(self.q_new as *mut [Real; IEND]) }; + + let p1: PreloadMut<[Real; IEND]> = preload_mut(&mut e_new); + let p2: PreloadMut<[Real; IEND]> = preload_mut(&mut q_new); + + let mut e_new_reg = Region::<'_, _, Linear1D>::from(&p1); + let mut q_new_reg = Region::<'_, _, Linear1D>::from(&p2); unsafe { - energycalc1( - self.e_new as *mut [Real; IEND], - &*(self.e_old as *const [Real; IEND]), - &*(self.delvc as *const [Real; IEND]), - &*(self.p_old as *const [Real; IEND]), - &*(self.q_old as *const [Real; IEND]), - &*(self.work as *const [Real; IEND]), - IEND, - ); - energycalc2( - &*(self.delvc as *const [Real; IEND]), - self.q_new as *mut [Real; IEND], - &*(self.comp_half_step as *const [Real; IEND]), - &*(self.p_half_step as *const [Real; IEND]), - self.e_new as *mut [Real; IEND], - &*(self.bvc as *const [Real; IEND]), - &*(self.pbvc as *const [Real; IEND]), - &*(self.ql_old as *const [Real; IEND]), - &*(self.qq_old as *const [Real; IEND]), - self.rho0, - IEND, - ); - energycalc3( - self.e_new as *mut [Real; IEND], - &*(self.delvc as *const [Real; IEND]), - &*(self.p_old as *const [Real; IEND]), - &*(self.q_old as *const [Real; IEND]), - &*(self.p_half_step as *const [Real; IEND]), - &*(self.q_new as *const [Real; IEND]), - IEND, - ); - energycalc4( - self.e_new as *mut [Real; IEND], - &*(self.work as *const [Real; IEND]), - self.e_cut, - self.emin, - IEND, - ); - energycalc5( - &*(self.delvc as *const [Real; IEND]), - &*(self.pbvc as *const [Real; IEND]), - self.e_new as *mut [Real; IEND], - &*(self.vnewc as *const [Real; IEND]), - &*(self.bvc as *const [Real; IEND]), - &*(self.p_new as *const [Real; IEND]), - &*(self.ql_old as *const [Real; IEND]), - &*(self.qq_old as *const [Real; IEND]), - &*(self.p_old as *const [Real; IEND]), - &*(self.q_old as *const [Real; IEND]), - &*(self.p_half_step as *const [Real; IEND]), - &*(self.q_new as *const [Real; IEND]), - self.rho0, - self.e_cut, - self.emin, - IEND, - ); - energycalc6( - &*(self.delvc as *const [Real; IEND]), - &*(self.pbvc as *const [Real; IEND]), - self.e_new as *mut [Real; IEND], - &*(self.vnewc as *const [Real; IEND]), - &*(self.bvc as *const [Real; IEND]), - &*(self.p_new as *const [Real; IEND]), - self.q_new as *mut [Real; IEND], - &*(self.ql_old as *const [Real; IEND]), - &*(self.qq_old as *const [Real; IEND]), - self.rho0, - self.q_cut, - IEND, - ); + offload! { + kernel = energycalc1, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, + &*(self.e_old as *const [Real; IEND]), + &*(self.delvc as *const [Real; IEND]), + &*(self.p_old as *const [Real; IEND]), + &*(self.q_old as *const [Real; IEND]), + &*(self.work as *const [Real; IEND]), + IEND, + ), + }; + offload! { + kernel = energycalc2, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + &*(self.delvc as *const [Real; IEND]), + q_new_reg, + &*(self.comp_half_step as *const [Real; IEND]), + &*(self.p_half_step as *const [Real; IEND]), + e_new_reg, + &*(self.bvc as *const [Real; IEND]), + &*(self.pbvc as *const [Real; IEND]), + &*(self.ql_old as *const [Real; IEND]), + &*(self.qq_old as *const [Real; IEND]), + self.rho0, + IEND, + ), + }; + offload! { + kernel = energycalc3, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, + &*(self.delvc as *const [Real; IEND]), + &*(self.p_old as *const [Real; IEND]), + &*(self.q_old as *const [Real; IEND]), + &*(self.p_half_step as *const [Real; IEND]), + &*(self.q_new as *const [Real; IEND]), + IEND, + ), + }; + offload! { + kernel = energycalc4, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + e_new_reg, + &*(self.work as *const [Real; IEND]), + self.e_cut, + self.emin, + IEND, + ), + }; + offload! { + kernel = energycalc5, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + &*(self.delvc as *const [Real; IEND]), + &*(self.pbvc as *const [Real; IEND]), + e_new_reg, + &*(self.vnewc as *const [Real; IEND]), + &*(self.bvc as *const [Real; IEND]), + &*(self.p_new as *const [Real; IEND]), + &*(self.ql_old as *const [Real; IEND]), + &*(self.qq_old as *const [Real; IEND]), + &*(self.p_old as *const [Real; IEND]), + &*(self.q_old as *const [Real; IEND]), + &*(self.p_half_step as *const [Real; IEND]), + &*(self.q_new as *const [Real; IEND]), + self.rho0, + self.e_cut, + self.emin, + IEND, + ), + }; + offload! { + kernel = energycalc6, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + &*(self.delvc as *const [Real; IEND]), + &*(self.pbvc as *const [Real; IEND]), + e_new_reg, + &*(self.vnewc as *const [Real; IEND]), + &*(self.bvc as *const [Real; IEND]), + &*(self.p_new as *const [Real; IEND]), + q_new_reg, + &*(self.ql_old as *const [Real; IEND]), + &*(self.qq_old as *const [Real; IEND]), + self.rho0, + self.q_cut, + IEND, + ), + }; } } @@ -243,239 +290,12 @@ impl KernelBase for Energy { } } -#[cfg(target_os = "linux")] -unsafe fn energycalc1( - e_new: *mut [Real; IEND], - e_old: &[Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - work: &[Real; IEND], - iend: usize, -) { - core::intrinsics::offload( - _energycalc1, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - (e_new, e_old, delvc, p_old, q_old, work, iend), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc2( - delvc: &[Real; IEND], - q_new: *mut [Real; IEND], - comp_half_step: &[Real; IEND], - p_half_step: &[Real; IEND], - e_new: *mut [Real; IEND], - bvc: &[Real; IEND], - pbvc: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc2, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - ( - delvc, - q_new, - comp_half_step, - p_half_step, - e_new, - bvc, - pbvc, - ql_old, - qq_old, - rho0, - iend, - ), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc3( - e_new: *mut [Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - iend: usize, -) { - core::intrinsics::offload( - _energycalc3, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - (e_new, delvc, p_old, q_old, p_half_step, q_new, iend), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc4( - e_new: *mut [Real; IEND], - work: &[Real; IEND], - e_cut: Real, - emin: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc4, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - (e_new, work, e_cut, emin, iend), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc5( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - rho0: Real, - e_cut: Real, - emin: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc5, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - ( - delvc, - pbvc, - e_new, - vnewc, - bvc, - p_new, - ql_old, - qq_old, - p_old, - q_old, - p_half_step, - q_new, - rho0, - e_cut, - emin, - iend, - ), - ) -} -#[cfg(target_os = "linux")] -unsafe fn energycalc6( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - q_new: *mut [Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - q_cut: Real, - iend: usize, -) { - core::intrinsics::offload( - _energycalc6, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - ( - delvc, pbvc, e_new, vnewc, bvc, p_new, q_new, ql_old, qq_old, rho0, q_cut, iend, - ), - ) -} - -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _energycalc1( - e_new: *mut [Real; IEND], - e_old: &[Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - work: &[Real; IEND], - iend: usize, - ); - pub fn _energycalc2( - delvc: &[Real; IEND], - q_new: *mut [Real; IEND], - comp_half_step: &[Real; IEND], - p_half_step: &[Real; IEND], - e_new: *mut [Real; IEND], - bvc: &[Real; IEND], - pbvc: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - iend: usize, - ); - pub fn _energycalc3( - e_new: *mut [Real; IEND], - delvc: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - iend: usize, - ); - pub fn _energycalc4( - e_new: *mut [Real; IEND], - work: &[Real; IEND], - e_cut: Real, - emin: Real, - iend: usize, - ); - pub fn _energycalc5( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - p_old: &[Real; IEND], - q_old: &[Real; IEND], - p_half_step: &[Real; IEND], - q_new: &[Real; IEND], - rho0: Real, - e_cut: Real, - emin: Real, - iend: usize, - ); - pub fn _energycalc6( - delvc: &[Real; IEND], - pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], - vnewc: &[Real; IEND], - bvc: &[Real; IEND], - p_new: &[Real; IEND], - q_new: *mut [Real; IEND], - ql_old: &[Real; IEND], - qq_old: &[Real; IEND], - rho0: Real, - q_cut: Real, - iend: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::{Real, RealExt}; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc1( - e_new: *mut [Real; IEND], +#[offload_kernel] +fn energycalc1( + mut e_new: Region, e_old: &[Real; IEND], delvc: &[Real; IEND], p_old: &[Real; IEND], @@ -483,25 +303,20 @@ pub extern "gpu-kernel" fn _energycalc1( work: &[Real; IEND], iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - (*e_new)[i] = (*e_old)[i] - Real::from(0.5) * (*delvc)[i] * ((*p_old)[i] + (*q_old)[i]) - + Real::from(0.5) * (*work)[i]; - } + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + *v = (*e_old)[i] - Real::from(0.5) * (*delvc)[i] * ((*p_old)[i] + (*q_old)[i]) + + Real::from(0.5) * (*work)[i]; } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc2( +#[offload_kernel] +fn energycalc2( delvc: &[Real; IEND], - q_new: *mut [Real; IEND], + mut q_new: Region, comp_half_step: &[Real; IEND], p_half_step: &[Real; IEND], - e_new: *mut [Real; IEND], + mut e_new: Region, bvc: &[Real; IEND], pbvc: &[Real; IEND], ql_old: &[Real; IEND], @@ -509,33 +324,29 @@ pub extern "gpu-kernel" fn _energycalc2( rho0: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - if ((*delvc)[i]).to_f64() > 0.0 { - (*q_new)[i] = Real::from(0.0); + let i = Linear1D::index(); + if let Some(v1) = q_new.get_mut() + && let Some(v2) = e_new.get_mut() + { + if ((*delvc)[i]).to_f64() > 0.0 { + *v1 = Real::from(0.0); + } else { + let vhalf = Real::from(1.0) / (Real::from(1.0) + (*comp_half_step)[i]); + let mut ssc = + ((*pbvc)[i] * (*v2) + vhalf * vhalf * (*bvc)[i] * (*p_half_step)[i]) / rho0; + if ssc.to_f64() <= 0.1111111e-36 { + ssc = Real::from(0.3333333e-18); } else { - let vhalf = Real::from(1.0) / (Real::from(1.0) + (*comp_half_step)[i]); - let mut ssc = ((*pbvc)[i] * (*e_new)[i] - + vhalf * vhalf * (*bvc)[i] * (*p_half_step)[i]) - / rho0; - if ssc.to_f64() <= 0.1111111e-36 { - ssc = Real::from(0.3333333e-18); - } else { - ssc = ssc.sqrt(); - } - (*q_new)[i] = ssc * (*ql_old)[i] + (*qq_old)[i]; + ssc = ssc.sqrt(); } + *v1 = ssc * (*ql_old)[i] + (*qq_old)[i]; } } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc3( - e_new: *mut [Real; IEND], +#[offload_kernel] +fn energycalc3( + mut e_new: Region, delvc: &[Real; IEND], p_old: &[Real; IEND], q_old: &[Real; IEND], @@ -543,50 +354,40 @@ pub extern "gpu-kernel" fn _energycalc3( q_new: &[Real; IEND], iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - (*e_new)[i] += Real::from(0.5) - * (*delvc)[i] - * (Real::from(3.0) * ((*p_old)[i] + (*q_old)[i]) - - Real::from(4.0) * ((*p_half_step)[i] + (*q_new)[i])); - } + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + *v += Real::from(0.5) + * (*delvc)[i] + * (Real::from(3.0) * ((*p_old)[i] + (*q_old)[i]) + - Real::from(4.0) * ((*p_half_step)[i] + (*q_new)[i])); } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc4( - e_new: *mut [Real; IEND], +#[offload_kernel] +fn energycalc4( + mut e_new: Region, work: &[Real; IEND], e_cut: Real, emin: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - (*e_new)[i] += Real::from(0.5) * (*work)[i]; - if ((*e_new)[i]).abs() < e_cut { - (*e_new)[i] = Real::from(0.0); - } - if (*e_new)[i] < emin { - (*e_new)[i] = emin; - } + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + *v += Real::from(0.5) * (*work)[i]; + if (*v).abs() < e_cut { + *v = Real::from(0.0); + } + if *v < emin { + *v = emin; } } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc5( +#[offload_kernel] +fn energycalc5( delvc: &[Real; IEND], pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], + mut e_new: Region, vnewc: &[Real; IEND], bvc: &[Real; IEND], p_new: &[Real; IEND], @@ -601,70 +402,64 @@ pub extern "gpu-kernel" fn _energycalc5( emin: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend { - let q_tilde = if ((*delvc)[i]).to_f64() > 0.0 { - Real::from(0.0) + let i = Linear1D::index(); + if let Some(v) = e_new.get_mut() { + let q_tilde = if ((*delvc)[i]).to_f64() > 0.0 { + Real::from(0.0) + } else { + let mut ssc = + ((*pbvc)[i] * (*v) + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) / rho0; + if ssc.to_f64() <= 0.1111111e-36 { + ssc = Real::from(0.3333333e-18); } else { - let mut ssc = ((*pbvc)[i] * (*e_new)[i] - + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) - / rho0; - if ssc.to_f64() <= 0.1111111e-36 { - ssc = Real::from(0.3333333e-18); - } else { - ssc = ssc.sqrt(); - } - ssc * (*ql_old)[i] + (*qq_old)[i] - }; - (*e_new)[i] -= (Real::from(7.0) * ((*p_old)[i] + (*q_old)[i]) - - Real::from(8.0) * ((*p_half_step)[i] + (*q_new)[i]) - + ((*p_new)[i] + q_tilde)) - * (*delvc)[i] - / Real::from(6.0); - if ((*e_new)[i]).abs() < e_cut { - (*e_new)[i] = Real::from(0.0); - } - if (*e_new)[i] < emin { - (*e_new)[i] = emin; + ssc = ssc.sqrt(); } + ssc * (*ql_old)[i] + (*qq_old)[i] + }; + *v -= (Real::from(7.0) * ((*p_old)[i] + (*q_old)[i]) + - Real::from(8.0) * ((*p_half_step)[i] + (*q_new)[i]) + + ((*p_new)[i] + q_tilde)) + * (*delvc)[i] + / Real::from(6.0); + if (*v).abs() < e_cut { + *v = Real::from(0.0); + } + if *v < emin { + *v = emin; } } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[inline(never)] -#[rustc_offload_kernel] -pub extern "gpu-kernel" fn _energycalc6( +#[offload_kernel] +fn energycalc6( delvc: &[Real; IEND], pbvc: &[Real; IEND], - e_new: *mut [Real; IEND], + mut e_new: Region, vnewc: &[Real; IEND], bvc: &[Real; IEND], p_new: &[Real; IEND], - q_new: *mut [Real; IEND], + mut q_new: Region, ql_old: &[Real; IEND], qq_old: &[Real; IEND], rho0: Real, q_cut: Real, iend: usize, ) { - unsafe { - let i = (block_idx_x() * block_dim_x() + thread_idx_x()) as usize; - if i < iend && ((*delvc)[i]).to_f64() <= 0.0 { - let mut ssc = ((*pbvc)[i] * (*e_new)[i] - + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) - / rho0; - if ssc.to_f64() <= 0.1111111e-36 { - ssc = Real::from(0.3333333e-18); - } else { - ssc = ssc.sqrt(); - } - (*q_new)[i] = ssc * (*ql_old)[i] + (*qq_old)[i]; - if ((*q_new)[i]).abs() < q_cut { - (*q_new)[i] = Real::from(0.0); - } + let i = Linear1D::index(); + if let Some(v1) = e_new.get_mut() + && let Some(v2) = q_new.get_mut() + && ((*delvc)[i]).to_f64() <= 0.0 + { + let mut ssc = + ((*pbvc)[i] * (*v1) + (*vnewc)[i] * (*vnewc)[i] * (*bvc)[i] * (*p_new)[i]) / rho0; + if ssc.to_f64() <= 0.1111111e-36 { + ssc = Real::from(0.3333333e-18); + } else { + ssc = ssc.sqrt(); + } + *v2 = ssc * (*ql_old)[i] + (*qq_old)[i]; + if (*v2).abs() < q_cut { + *v2 = Real::from(0.0); } } } diff --git a/src/apps/fir.rs b/src/apps/fir.rs index 1b267b6..ad40553 100644 --- a/src/apps/fir.rs +++ b/src/apps/fir.rs @@ -6,6 +6,15 @@ pub const COEFFLEN: usize = 16; const THREADS_PER_BLOCK: u32 = 256; const BLOCKS: u32 = (IEND as u32).div_ceil(THREADS_PER_BLOCK); +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{Linear1D, PartitioningStrategy, Region}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{ _block_dim_x as block_dim_x, _block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x, @@ -88,19 +97,21 @@ impl KernelBase for Fir { } fn run_kernel(&mut self) { - unsafe { - core::intrinsics::offload::<_, _, ()>( - _fir, - [BLOCKS, 1, 1], - [THREADS_PER_BLOCK, 1, 1], - ( - self.m_out as *mut [Real; IEND], - &*(self.m_in as *const [Real; IEND + COEFFLEN]), - &self.coeff as &[Real; COEFFLEN], - IEND, - ), - ); - } + let mut m_out = unsafe { &mut *(self.m_out as *mut [Real; IEND]) }; + let p: PreloadMut<[Real; IEND]> = preload_mut(&mut m_out); + let mut m_out_reg = Region::<'_, _, Linear1D>::from(&p); + offload! { + kernel = fir, + grid_dim = [BLOCKS, 1, 1], + block_dim = [THREADS_PER_BLOCK, 1, 1], + args = ( + m_out_reg, + unsafe { &*(self.m_in as *const [Real; IEND + COEFFLEN]) }, + unsafe { &self.coeff as &[Real; COEFFLEN] }, + IEND, + ), + }; + drop(p); } fn update_checksum(&self) -> f64 { @@ -117,30 +128,18 @@ impl KernelBase for Fir { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _fir( - m_out: *mut [Real; IEND], - m_in: &[Real; IEND + COEFFLEN], - coeff: &[Real; COEFFLEN], - iend: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _fir( - m_out: *mut [Real; IEND], +#[offload_kernel] +fn fir( + mut m_out: Region, m_in: &[Real; IEND + COEFFLEN], coeff: &[Real; COEFFLEN], iend: usize, ) { - let i = unsafe { (block_idx_x() * block_dim_x() + thread_idx_x()) as usize }; - if i < iend { + let i = Linear1D::index(); + if let Some(v) = m_out.get_mut() { let mut sum: Real = Real::from(0.0); let mut j = 0; while j < COEFFLEN { @@ -149,8 +148,6 @@ pub unsafe extern "gpu-kernel" fn _fir( } j += 1; } - unsafe { - (*m_out)[i] = sum; - } + *v = sum; } } diff --git a/src/apps/ltimes.rs b/src/apps/ltimes.rs index deedc70..568e89b 100644 --- a/src/apps/ltimes.rs +++ b/src/apps/ltimes.rs @@ -3,6 +3,15 @@ pub const NUM_G: usize = 32; pub const NUM_M: usize = 25; const DEFAULT_REPS: u32 = 50; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride3D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{ _block_idx_x as block_idx_x, _block_idx_y as block_idx_y, _block_idx_z as block_idx_z, @@ -101,12 +110,16 @@ impl KernelBase for LTimes { let grid_y = NUM_G.div_ceil(g_block); let grid_z = num_z.div_ceil(z_block); - core::intrinsics::offload::<_, _, ()>( - _ltimes, - [grid_x as u32, grid_y as u32, grid_z as u32], - [m_block as u32, g_block as u32, z_block as u32], - ( - self.phidat as *mut [Real; 390400], + let mut phidat = unsafe { &mut *(self.phidat as *mut [Real; 390400]) }; + let p: PreloadMut<[Real; 390400]> = preload_mut(&mut phidat); + let mut phidat_reg = Region::<'_, _, Stride3D<32, 8, 1, 25, 32>>::from(&p); + + offload! { + kernel = ltimes, + grid_dim = [grid_x as u32, grid_y as u32, grid_z as u32], + block_dim = [m_block as u32, g_block as u32, z_block as u32], + args = ( + phidat_reg, self.elldat as *const [Real; 1600], self.psidat as *const [Real; 999424], NUM_D, @@ -114,7 +127,7 @@ impl KernelBase for LTimes { NUM_G, num_z, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -135,27 +148,12 @@ impl KernelBase for LTimes { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _ltimes( - phi: *mut [Real; 390400], - ell: &[Real; 1600], - psi: &[Real; 999424], - num_d: usize, - num_m: usize, - num_g: usize, - num_z: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _ltimes( - phi: *mut [Real; 390400], +#[offload_kernel] +fn ltimes( + mut phi: Region>, ell: &[Real; 1600], psi: &[Real; 999424], num_d: usize, @@ -163,22 +161,19 @@ pub unsafe extern "gpu-kernel" fn _ltimes( num_g: usize, num_z: usize, ) { - let num_m = NUM_M; - let num_g = NUM_G; let num_d = NUM_D; + let num_g = NUM_G; - let m = (block_idx_x() * 32 + thread_idx_x()) as usize; - let g = (block_idx_y() * 8 + thread_idx_y()) as usize; - let z = (block_idx_z() * 1 + thread_idx_z()) as usize; - - if m < num_m && g < num_g && z < num_z { - let phi_idx = m + num_m * (g + num_g * z); + let m = unsafe { (block_idx_x() * 32 + thread_idx_x()) as usize }; + let g = unsafe { (block_idx_y() * 8 + thread_idx_y()) as usize }; + let z = unsafe { (block_idx_z() * 1 + thread_idx_z()) as usize }; + if let Some(v) = phi.get_mut() { for d in 0..num_d { let ell_idx = d + num_d * m; let psi_idx = d + num_d * (g + num_g * z); - (*phi)[phi_idx] += (*ell)[ell_idx] * (*psi)[psi_idx]; + *v += ell[ell_idx] * psi[psi_idx]; } } } diff --git a/src/apps/matvec_3d_stencil.rs b/src/apps/matvec_3d_stencil.rs index 35b826c..a6f5e8f 100644 --- a/src/apps/matvec_3d_stencil.rs +++ b/src/apps/matvec_3d_stencil.rs @@ -1,6 +1,15 @@ pub const N_DEFAULT: usize = 1000000; const DEFAULT_REPS: u32 = 100; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{_block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x}; @@ -126,11 +135,11 @@ impl KernelBase for Matvec3DStencil { let jp = self.jp; let kp = self.kp; - core::intrinsics::offload::<_, _, ()>( - _matvec3dstencil, - [n.div_ceil(256) as u32, 1, 1], - [256, 1, 1], - ( + offload! { + kernel = matvec3dstencil, + grid_dim = [n.div_ceil(256) as u32, 1, 1], + block_dim = [256, 1, 1], + args = ( self.x as *const [Real; 1124864], self.b as *mut [Real; 1124864], self.matrix[0] as *const [Real; 1124864], @@ -152,7 +161,7 @@ impl KernelBase for Matvec3DStencil { kp, n, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -176,39 +185,11 @@ impl KernelBase for Matvec3DStencil { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _matvec3dstencil( - x: *const [Real; 1124864], - b: *mut [Real; 1124864], - m0: *const [Real; 1124864], - m1: *const [Real; 1124864], - m2: *const [Real; 1124864], - m3: *const [Real; 1124864], - m4: *const [Real; 1124864], - m5: *const [Real; 1124864], - m6: *const [Real; 1124864], - m7: *const [Real; 1124864], - m8: *const [Real; 1124864], - m9: *const [Real; 1124864], - m10: *const [Real; 1124864], - m11: *const [Real; 1124864], - m12: *const [Real; 1124864], - m13: *const [Real; 1124864], - real_zones: *const [u64; 1000000], - jp: usize, - kp: usize, - n: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _matvec3dstencil( +#[offload_kernel] +fn matvec3dstencil( x: *const [Real; 1124864], b: *mut [Real; 1124864], m0: *const [Real; 1124864], diff --git a/src/apps/pressure.rs b/src/apps/pressure.rs index 582f039..da4bbc4 100644 --- a/src/apps/pressure.rs +++ b/src/apps/pressure.rs @@ -1,6 +1,15 @@ pub const N_DEFAULT: usize = 1000000; const DEFAULT_REPS: u32 = 700; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{PartitioningStrategy, Region, Stride1D}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{_block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x}; @@ -91,24 +100,33 @@ impl KernelBase for Pressure { let grid = [n.div_ceil(256) as u32, 1, 1]; let block = [256, 1, 1]; - core::intrinsics::offload::<_, _, ()>( - _pressure_calc1, - grid, - block, - ( - self.bvc as *mut [Real; N_DEFAULT], + let mut bvc = unsafe { &mut *(self.bvc as *mut [Real; N_DEFAULT]) }; + let mut p_new = unsafe { &mut *(self.p_new as *mut [Real; N_DEFAULT]) }; + + let p1: PreloadMut<[Real; N_DEFAULT]> = preload_mut(&mut bvc); + let p2: PreloadMut<[Real; N_DEFAULT]> = preload_mut(&mut p_new); + + let mut bvc_reg = Region::<'_, _, Stride1D<256>>::from(&p1); + let mut p_new_reg = Region::<'_, _, Stride1D<256>>::from(&p2); + + offload! { + kernel = pressure_calc1, + grid_dim = grid, + block_dim = block, + args = ( + bvc_reg, self.compression as *const [Real; N_DEFAULT], self.cls, n, ), - ); - - core::intrinsics::offload::<_, _, ()>( - _pressure_calc2, - grid, - block, - ( - self.p_new as *mut [Real; N_DEFAULT], + }; + + offload! { + kernel = pressure_calc2, + grid_dim = grid, + block_dim = block, + args = ( + p_new_reg, self.bvc as *const [Real; N_DEFAULT], self.e_old as *const [Real; N_DEFAULT], self.vnewc as *const [Real; N_DEFAULT], @@ -117,7 +135,7 @@ impl KernelBase for Pressure { self.pmin, n, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -141,52 +159,25 @@ impl KernelBase for Pressure { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _pressure_calc1( - bvc: *mut [Real; N_DEFAULT], - compression: *const [Real; N_DEFAULT], - cls: Real, - n: usize, - ); - - pub fn _pressure_calc2( - p_new: *mut [Real; N_DEFAULT], - bvc: *const [Real; N_DEFAULT], - e_old: *const [Real; N_DEFAULT], - vnewc: *const [Real; N_DEFAULT], - p_cut: Real, - eosvmax: Real, - pmin: Real, - n: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _pressure_calc1( - bvc: *mut [Real; N_DEFAULT], +#[offload_kernel] +fn pressure_calc1( + mut bvc: Region>, compression: *const [Real; N_DEFAULT], cls: Real, n: usize, ) { - let i = unsafe { (block_idx_x() * 256 + thread_idx_x()) as usize }; - if i < n { - unsafe { - (*bvc)[i] = cls * ((*compression)[i] + Real::from(1.0)); - } + let i = Stride1D::<256>::index(); + if let Some(v) = bvc.get_mut() { + *v = cls * ((*compression)[i] + Real::from(1.0)); } } -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _pressure_calc2( - p_new: *mut [Real; N_DEFAULT], +#[offload_kernel] +fn pressure_calc2( + mut p_new: Region>, bvc: *const [Real; N_DEFAULT], e_old: *const [Real; N_DEFAULT], vnewc: *const [Real; N_DEFAULT], @@ -195,8 +186,8 @@ pub unsafe extern "gpu-kernel" fn _pressure_calc2( pmin: Real, n: usize, ) { - let i = unsafe { (block_idx_x() * 256 + thread_idx_x()) as usize }; - if i < n { + let i = Stride1D::<256>::index(); + if let Some(v) = p_new.get_mut() { unsafe { let mut p = (*bvc)[i] * (*e_old)[i]; @@ -210,7 +201,7 @@ pub unsafe extern "gpu-kernel" fn _pressure_calc2( p = pmin; } - (*p_new)[i] = p; + *v = p; } } } diff --git a/src/apps/vol3d.rs b/src/apps/vol3d.rs index 7cf5877..8df12da 100644 --- a/src/apps/vol3d.rs +++ b/src/apps/vol3d.rs @@ -1,6 +1,15 @@ pub const N_DEFAULT: usize = 1000000; const DEFAULT_REPS: u32 = 100; +use core::offload::offload_kernel; +use rustc_offload_frontend::partition::{OffsetStride1D, PartitioningStrategy, Region}; + +#[cfg(target_os = "linux")] +use rustc_offload_frontend::offload; + +#[cfg(target_os = "linux")] +use core::offload::offload::{PreloadMut, preload_mut}; + #[cfg(target_arch = "nvptx64")] use core::arch::nvptx::{_block_idx_x as block_idx_x, _thread_idx_x as thread_idx_x}; @@ -121,22 +130,26 @@ impl KernelBase for Vol3D { let kp = self.kp; let count = lpz + 1 - fpz; - core::intrinsics::offload::<_, _, ()>( - _vol3d, - [((count + 255) / 256) as u32, 1, 1], - [256, 1, 1], - ( + let mut vol = unsafe { &mut *(self.vol as *mut [Real; 1124864]) }; + let p: PreloadMut<[Real; 1124864]> = preload_mut(&mut vol); + let mut vol_reg = Region::<'_, _, OffsetStride1D<256>>::from(&p); + + offload! { + kernel = vol3d, + grid_dim = [((count + 255) / 256) as u32, 1, 1], + block_dim = [256, 1, 1], + args = ( self.x as *const [Real; 1124864], self.y as *const [Real; 1124864], self.z as *const [Real; 1124864], - self.vol as *mut [Real; 1124864], + vol_reg, self.vnormq, jp, kp, fpz, lpz, ), - ); + }; } fn update_checksum(&self) -> f64 { @@ -158,102 +171,87 @@ impl KernelBase for Vol3D { } } -#[cfg(target_os = "linux")] -unsafe extern "C" { - pub fn _vol3d( - x: *const [Real; 1124864], - y: *const [Real; 1124864], - z: *const [Real; 1124864], - vol: *mut [Real; 1124864], - vnormq: Real, - jp: usize, - kp: usize, - fpz: usize, - lpz: usize, - ); -} - #[cfg(not(target_os = "linux"))] use crate::common::types::Real; -#[cfg(not(target_os = "linux"))] -#[unsafe(no_mangle)] -#[rustc_offload_kernel] -pub unsafe extern "gpu-kernel" fn _vol3d( +#[offload_kernel] +fn vol3d( x: *const [Real; 1124864], y: *const [Real; 1124864], z: *const [Real; 1124864], - vol: *mut [Real; 1124864], + mut vol: Region>, vnormq: Real, jp: usize, kp: usize, fpz: usize, lpz: usize, ) { - let idx = unsafe { (block_idx_x() * 256 + thread_idx_x()) as usize }; + let idx = OffsetStride1D::<256>::index(); let i = fpz + idx; if i > lpz { return; } - unsafe { - let i0 = i; - let i1 = i + 1; - let i2 = i + jp; - let i3 = i + 1 + jp; - let i4 = i + kp; - let i5 = i + 1 + kp; - let i6 = i + jp + kp; - let i7 = i + 1 + jp + kp; - - let x71 = (*x)[i7] - (*x)[i1]; - let x72 = (*x)[i7] - (*x)[i2]; - let x74 = (*x)[i7] - (*x)[i4]; - let x30 = (*x)[i3] - (*x)[i0]; - let x50 = (*x)[i5] - (*x)[i0]; - let x60 = (*x)[i6] - (*x)[i0]; - - let y71 = (*y)[i7] - (*y)[i1]; - let y72 = (*y)[i7] - (*y)[i2]; - let y74 = (*y)[i7] - (*y)[i4]; - let y30 = (*y)[i3] - (*y)[i0]; - let y50 = (*y)[i5] - (*y)[i0]; - let y60 = (*y)[i6] - (*y)[i0]; - - let z71 = (*z)[i7] - (*z)[i1]; - let z72 = (*z)[i7] - (*z)[i2]; - let z74 = (*z)[i7] - (*z)[i4]; - let z30 = (*z)[i3] - (*z)[i0]; - let z50 = (*z)[i5] - (*z)[i0]; - let z60 = (*z)[i6] - (*z)[i0]; - - let mut xps = x71 + x60; - let mut yps = y71 + y60; - let mut zps = z71 + z60; - - let mut cyz = y72 * z30 - z72 * y30; - let mut czx = z72 * x30 - x72 * z30; - let mut cxy = x72 * y30 - y72 * x30; - let mut v = xps * cyz + yps * czx + zps * cxy; - - xps = x72 + x50; - yps = y72 + y50; - zps = z72 + z50; - - cyz = y74 * z60 - z74 * y60; - czx = z74 * x60 - x74 * z60; - cxy = x74 * y60 - y74 * x60; - v += xps * cyz + yps * czx + zps * cxy; - - xps = x74 + x30; - yps = y74 + y30; - zps = z74 + z30; - - cyz = y71 * z50 - z71 * y50; - czx = z71 * x50 - x71 * z50; - cxy = x71 * y50 - y71 * x50; - v += xps * cyz + yps * czx + zps * cxy; - - (*vol)[i] = v * vnormq; + if let Some(mut vvol) = vol.get_mut() { + unsafe { + let i0 = i; + let i1 = i + 1; + let i2 = i + jp; + let i3 = i + 1 + jp; + let i4 = i + kp; + let i5 = i + 1 + kp; + let i6 = i + jp + kp; + let i7 = i + 1 + jp + kp; + + let x71 = (*x)[i7] - (*x)[i1]; + let x72 = (*x)[i7] - (*x)[i2]; + let x74 = (*x)[i7] - (*x)[i4]; + let x30 = (*x)[i3] - (*x)[i0]; + let x50 = (*x)[i5] - (*x)[i0]; + let x60 = (*x)[i6] - (*x)[i0]; + + let y71 = (*y)[i7] - (*y)[i1]; + let y72 = (*y)[i7] - (*y)[i2]; + let y74 = (*y)[i7] - (*y)[i4]; + let y30 = (*y)[i3] - (*y)[i0]; + let y50 = (*y)[i5] - (*y)[i0]; + let y60 = (*y)[i6] - (*y)[i0]; + + let z71 = (*z)[i7] - (*z)[i1]; + let z72 = (*z)[i7] - (*z)[i2]; + let z74 = (*z)[i7] - (*z)[i4]; + let z30 = (*z)[i3] - (*z)[i0]; + let z50 = (*z)[i5] - (*z)[i0]; + let z60 = (*z)[i6] - (*z)[i0]; + + let mut xps = x71 + x60; + let mut yps = y71 + y60; + let mut zps = z71 + z60; + + let mut cyz = y72 * z30 - z72 * y30; + let mut czx = z72 * x30 - x72 * z30; + let mut cxy = x72 * y30 - y72 * x30; + let mut v = xps * cyz + yps * czx + zps * cxy; + + xps = x72 + x50; + yps = y72 + y50; + zps = z72 + z50; + + cyz = y74 * z60 - z74 * y60; + czx = z74 * x60 - x74 * z60; + cxy = x74 * y60 - y74 * x60; + v += xps * cyz + yps * czx + zps * cxy; + + xps = x74 + x30; + yps = y74 + y30; + zps = z74 + z30; + + cyz = y71 * z50 - z71 * y50; + czx = z71 * x50 - x71 * z50; + cxy = x71 * y50 - y71 * x50; + v += xps * cyz + yps * czx + zps * cxy; + + vvol.set(fpz, v * vnormq); + } } } diff --git a/src/lib.rs b/src/lib.rs index 40f6055..b4dcbc7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,129 +1,13 @@ #![allow(internal_features)] -#![allow(non_snake_case)] -#![allow(clippy::deref_addrof)] -#![allow(clippy::too_many_arguments)] -#![allow(clippy::missing_safety_doc)] -#![allow(unused_features)] -#![feature(abi_gpu_kernel)] -#![feature(core_float_math)] -#![feature(core_intrinsics)] -#![feature(float_algebraic)] -#![feature(rustc_attrs)] -#![cfg_attr(target_arch = "amdgpu", feature(stdarch_amdgpu))] -#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))] -#![no_std] - -#[cfg(target_os = "linux")] -extern crate libc; - -#[panic_handler] -fn panic(_: &core::panic::PanicInfo) -> ! { - loop {} -} +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload, offload)] +#![feature(float_algebraic, core_float_math)] +#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx, abi_gpu_kernel))] +#![cfg_attr(target_arch = "nvptx64", no_std)] +#![feature(rustc_attrs, core_intrinsics)] pub mod apps; pub mod common; - -#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] -use apps::del_dot_vec_2d::DelDotVec2D; -#[cfg(all(target_os = "linux", feature = "energy"))] -use apps::energy::Energy; -#[cfg(all(target_os = "linux", feature = "fir"))] -use apps::fir::Fir; - -#[cfg(all(target_os = "linux", feature = "ltimes"))] -use apps::ltimes::LTimes; - -#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] -use apps::matvec_3d_stencil::Matvec3DStencil; - -#[cfg(all(target_os = "linux", feature = "pressure"))] -use apps::pressure::Pressure; - -#[cfg(all(target_os = "linux", feature = "vol3d"))] -use apps::vol3d::Vol3D; - -#[cfg(all(target_os = "linux", feature = "energy"))] -static mut K_ENERGY: Energy = Energy::INIT; -#[cfg(all(target_os = "linux", feature = "fir"))] -static mut K_FIR: Fir = Fir::INIT; -#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] -static mut K_DEL: DelDotVec2D = DelDotVec2D::INIT; -#[cfg(all(target_os = "linux", feature = "ltimes"))] -static mut K_LTIMES: LTimes = LTimes::INIT; -#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] -static mut K_MATVEC3DSTENCIL: Matvec3DStencil = Matvec3DStencil::INIT; -#[cfg(all(target_os = "linux", feature = "pressure"))] -static mut K_PRESSURE: Pressure = Pressure::INIT; -#[cfg(all(target_os = "linux", feature = "vol3d"))] -static mut K_VOL3D: Vol3D = Vol3D::INIT; - -#[cfg(target_os = "linux")] -#[unsafe(no_mangle)] -fn main() { - use crate::common::executor::{Executor, KernelResult, MAX_KERNELS}; - use crate::common::kernel_base::KernelBase; - use core::mem::MaybeUninit; - - let mut k_links: [Option<&mut dyn KernelBase>; MAX_KERNELS] = [const { None }; MAX_KERNELS]; - let mut count = 0; - - #[cfg(feature = "energy")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_ENERGY) }); - count += 1; - } - #[cfg(feature = "fir")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_FIR) }); - count += 1; - } - #[cfg(feature = "del_dot_vec_2d")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_DEL) }); - count += 1; - } - #[cfg(feature = "ltimes")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_LTIMES) }); - count += 1; - } - #[cfg(feature = "matvec_3d_stencil")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_MATVEC3DSTENCIL) }); - count += 1; - } - - #[cfg(feature = "pressure")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_PRESSURE) }); - count += 1; - } - - #[cfg(feature = "vol3d")] - { - k_links[count] = Some(unsafe { &mut *(&raw mut K_VOL3D) }); - count += 1; - } - - let mut kernel_refs: [MaybeUninit<&mut dyn KernelBase>; MAX_KERNELS] = - [const { MaybeUninit::uninit() }; MAX_KERNELS]; - - for i in 0..count { - kernel_refs[i] = MaybeUninit::new(k_links[i].take().unwrap()); - } - - let kernels_slice = unsafe { - core::slice::from_raw_parts_mut(kernel_refs.as_mut_ptr() as *mut &mut dyn KernelBase, count) - }; - - let mut suite = Executor::new(kernels_slice); - - static mut RESULT_BUF: [MaybeUninit; MAX_KERNELS] = - [const { MaybeUninit::uninit() }; MAX_KERNELS]; - - let results = suite.run_suite(unsafe { &mut *(&raw mut RESULT_BUF) }); - - Executor::print_report(results); - Executor::export_csv(results, c"results.csv".as_ptr()); -} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..aa51d09 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,118 @@ +#![allow(internal_features)] +#![allow(linker_messages)] +#![allow(improper_ctypes)] +#![allow(improper_gpu_kernel_arg)] +#![allow(improper_ctypes_definitions)] +#![feature(gpu_offload)] +#![cfg_attr(target_os = "linux", feature(core_intrinsics, offload))] +#![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))] +#![cfg_attr(target_arch = "nvptx64", no_std)] +#![cfg_attr(target_arch = "nvptx64", no_main)] + +use rust_perf; + +#[cfg(target_os = "linux")] +extern crate libc; + +#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] +use rust_perf::apps::del_dot_vec_2d::DelDotVec2D; +#[cfg(all(target_os = "linux", feature = "energy"))] +use rust_perf::apps::energy::Energy; +#[cfg(all(target_os = "linux", feature = "fir"))] +use rust_perf::apps::fir::Fir; + +#[cfg(all(target_os = "linux", feature = "ltimes"))] +use rust_perf::apps::ltimes::LTimes; + +#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] +use rust_perf::apps::matvec_3d_stencil::Matvec3DStencil; + +#[cfg(all(target_os = "linux", feature = "pressure"))] +use rust_perf::apps::pressure::Pressure; + +#[cfg(all(target_os = "linux", feature = "vol3d"))] +use rust_perf::apps::vol3d::Vol3D; + +#[cfg(all(target_os = "linux", feature = "energy"))] +static mut K_ENERGY: Energy = Energy::INIT; +#[cfg(all(target_os = "linux", feature = "fir"))] +static mut K_FIR: Fir = Fir::INIT; +#[cfg(all(target_os = "linux", feature = "del_dot_vec_2d"))] +static mut K_DEL: DelDotVec2D = DelDotVec2D::INIT; +#[cfg(all(target_os = "linux", feature = "ltimes"))] +static mut K_LTIMES: LTimes = LTimes::INIT; +#[cfg(all(target_os = "linux", feature = "matvec_3d_stencil"))] +static mut K_MATVEC3DSTENCIL: Matvec3DStencil = Matvec3DStencil::INIT; +#[cfg(all(target_os = "linux", feature = "pressure"))] +static mut K_PRESSURE: Pressure = Pressure::INIT; +#[cfg(all(target_os = "linux", feature = "vol3d"))] +static mut K_VOL3D: Vol3D = Vol3D::INIT; + +#[cfg(target_os = "linux")] +fn main() { + use core::mem::MaybeUninit; + use rust_perf::common::executor::{Executor, KernelResult, MAX_KERNELS}; + use rust_perf::common::kernel_base::KernelBase; + + let mut k_links: [Option<&mut dyn KernelBase>; MAX_KERNELS] = [const { None }; MAX_KERNELS]; + let mut count = 0; + + #[cfg(feature = "energy")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_ENERGY) }); + count += 1; + } + #[cfg(feature = "fir")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_FIR) }); + count += 1; + } + #[cfg(feature = "del_dot_vec_2d")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_DEL) }); + count += 1; + } + #[cfg(feature = "ltimes")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_LTIMES) }); + count += 1; + } + #[cfg(feature = "matvec_3d_stencil")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_MATVEC3DSTENCIL) }); + count += 1; + } + + #[cfg(feature = "pressure")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_PRESSURE) }); + count += 1; + } + + #[cfg(feature = "vol3d")] + { + k_links[count] = Some(unsafe { &mut *(&raw mut K_VOL3D) }); + count += 1; + } + + let mut kernel_refs: [MaybeUninit<&mut dyn KernelBase>; MAX_KERNELS] = + [const { MaybeUninit::uninit() }; MAX_KERNELS]; + + for i in 0..count { + kernel_refs[i] = MaybeUninit::new(k_links[i].take().unwrap()); + } + + let kernels_slice = unsafe { + core::slice::from_raw_parts_mut(kernel_refs.as_mut_ptr() as *mut &mut dyn KernelBase, count) + }; + + let mut suite = Executor::new(kernels_slice); + + static mut RESULT_BUF: [MaybeUninit; MAX_KERNELS] = + [const { MaybeUninit::uninit() }; MAX_KERNELS]; + + let results = suite.run_suite(unsafe { &mut *(&raw mut RESULT_BUF) }); + + Executor::print_report(results); + Executor::export_csv(results, c"results.csv".as_ptr()); +}