Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[submodule "crates/rustc_offload_frontend"]
path = crates/rustc_offload_frontend
url = https://github.com/sa4dus/rustc_offload_frontend
branch = main
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@ version = "0.1.0"
edition = "2024"

[lib]
crate-type = ["cdylib"]
crate-type = ["rlib", "cdylib"]
path = "src/lib.rs"

[[bin]]
name = "main"
path = "src/main.rs"

[dependencies]
libc = { version = "0.2.175", default-features = false }
rustc_offload_frontend = { path = "crates/rustc_offload_frontend" }

[features]
default = ["all", "f64"]
Expand Down
1 change: 1 addition & 0 deletions crates/rustc_offload_frontend/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/target
16 changes: 16 additions & 0 deletions crates/rustc_offload_frontend/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions crates/rustc_offload_frontend/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Manifest for the `rustc_offload_frontend` crate: one library target plus a
# `main` binary built from src/main.rs.
[package]
name = "rustc_offload_frontend"
version = "0.1.0"
edition = "2024"
license = "MIT"
description = "rustc offload feature frontend draft"
repository = "https://github.com/sa4dus/rustc_offload_frontend"
readme = "README.md"

# The library is emitted both as an rlib (for Rust dependents) and as a cdylib.
[lib]
crate-type = ["rlib", "cdylib"]
path = "src/lib.rs"

# Example/test driver binary.
[[bin]]
name = "main"
path = "src/main.rs"

[dependencies]
# libc without its default features.
libc = { version = "0.2.175", default-features = false }

# No optional features are defined yet.
[features]

# NOTE(review): fat LTO and panic=abort are enabled for BOTH profiles below —
# presumably a requirement of the GPU offload build; confirm before changing.
[profile.release]
lto = "fat"
panic = "abort"

[profile.dev]
lto = "fat"
panic = "abort"
1 change: 1 addition & 0 deletions crates/rustc_offload_frontend/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# rustc_offload_frontend
62 changes: 62 additions & 0 deletions crates/rustc_offload_frontend/src/gpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/// A three-component index or extent (`x`, `y`, `z`).
///
/// Used as the return type of the thread/block query helpers in this module.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
pub struct Dim3 {
    /// Component along the x axis.
    pub x: usize,
    /// Component along the y axis.
    pub y: usize,
    /// Component along the z axis.
    pub z: usize,
}

/// Returns the calling thread's global index along each axis.
///
/// On `nvptx64` every component is `block_idx * block_dim + thread_idx`. The
/// intrinsics return `i32`, so each operand is widened to `usize` *before* the
/// arithmetic to keep the product from overflowing on large launches.
///
/// On every other target there is no device context and all components are `0`.
pub(crate) fn global_thread_dim() -> Dim3 {
    // SAFETY: the `core::arch::nvptx` index intrinsics only read the
    // thread/block special registers of the executing thread.
    #[cfg(target_arch = "nvptx64")]
    unsafe {
        use core::arch::nvptx::*;
        Dim3 {
            x: _block_idx_x() as usize * _block_dim_x() as usize + _thread_idx_x() as usize,
            y: _block_idx_y() as usize * _block_dim_y() as usize + _thread_idx_y() as usize,
            z: _block_idx_z() as usize * _block_dim_z() as usize + _thread_idx_z() as usize,
        }
    }
    // Host fallback: gate on "not the device" rather than on one specific host
    // OS, so the function still has a body on non-Linux hosts.
    #[cfg(not(target_arch = "nvptx64"))]
    Dim3 { x: 0, y: 0, z: 0 }
}

/// Returns the index of the calling thread's block within the grid.
///
/// On `nvptx64` the components come from the `_block_idx_{x,y,z}` intrinsics.
/// On every other target all components are `0`.
pub(crate) fn block_idx() -> Dim3 {
    // SAFETY: the `core::arch::nvptx` index intrinsics only read the
    // block special registers of the executing thread.
    #[cfg(target_arch = "nvptx64")]
    unsafe {
        use core::arch::nvptx::*;
        Dim3 {
            x: _block_idx_x() as usize,
            y: _block_idx_y() as usize,
            z: _block_idx_z() as usize,
        }
    }
    // Host fallback: gate on "not the device" rather than on one specific host
    // OS, so the function still has a body on non-Linux hosts.
    #[cfg(not(target_arch = "nvptx64"))]
    Dim3 { x: 0, y: 0, z: 0 }
}

/// Returns the extent of a block (threads per block) along each axis.
///
/// On `nvptx64` the components come from the `_block_dim_{x,y,z}` intrinsics.
/// On every other target all components are `0`.
pub(crate) fn block_dim() -> Dim3 {
    // SAFETY: the `core::arch::nvptx` dimension intrinsics only read the
    // block-dimension special registers of the executing thread.
    #[cfg(target_arch = "nvptx64")]
    unsafe {
        use core::arch::nvptx::*;
        Dim3 {
            x: _block_dim_x() as usize,
            y: _block_dim_y() as usize,
            z: _block_dim_z() as usize,
        }
    }
    // Host fallback: gate on "not the device" rather than on one specific host
    // OS, so the function still has a body on non-Linux hosts.
    #[cfg(not(target_arch = "nvptx64"))]
    Dim3 { x: 0, y: 0, z: 0 }
}

/// Returns the index of the calling thread within its block.
///
/// On `nvptx64` the components come from the `_thread_idx_{x,y,z}` intrinsics.
/// On every other target all components are `0`.
pub(crate) fn thread_idx() -> Dim3 {
    // SAFETY: the `core::arch::nvptx` index intrinsics only read the
    // thread special registers of the executing thread.
    #[cfg(target_arch = "nvptx64")]
    unsafe {
        use core::arch::nvptx::*;
        Dim3 {
            x: _thread_idx_x() as usize,
            y: _thread_idx_y() as usize,
            z: _thread_idx_z() as usize,
        }
    }
    // Host fallback: gate on "not the device" rather than on one specific host
    // OS, so the function still has a body on non-Linux hosts.
    #[cfg(not(target_arch = "nvptx64"))]
    Dim3 { x: 0, y: 0, z: 0 }
}
69 changes: 69 additions & 0 deletions crates/rustc_offload_frontend/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#![allow(internal_features)]
#![allow(linker_messages)]
#![allow(improper_ctypes)]
#![allow(improper_gpu_kernel_arg)]
#![allow(improper_ctypes_definitions)]
#![feature(gpu_offload, offload)]
#![cfg_attr(target_arch = "nvptx64", feature(stdarch_nvptx))]
#![cfg_attr(target_arch = "nvptx64", no_std)]

pub use core::offload::offload_kernel;

pub mod gpu;
pub mod partition;

/// Launches an offload kernel from named, order-independent fields.
///
/// Fields are written `name = expr`, separated by commas; a trailing comma is
/// accepted.
///
/// * `kernel` — required.
/// * `args` — required.
/// * `grid_dim` — optional, defaults to `[1, 1, 1]`.
/// * `block_dim` — optional, defaults to `[1, 1, 1]`.
/// * `dyn_cache` — optional, defaults to `0`.
///
/// If a field is repeated, the last occurrence wins. An unknown field name, a
/// missing `kernel`, or a missing `args` is reported with `compile_error!`.
///
/// The expansion is a call to `::core::intrinsics::offload::<_, _, ()>` with the
/// five values in the order kernel, grid_dim, block_dim, dyn_cache, args.
/// NOTE(review): the invoking crate presumably has to enable the
/// `core_intrinsics` feature for the expansion to compile — confirm.
///
/// ```ignore
/// offload! {
///     kernel = my_kernel,
///     grid_dim = [256, 1, 1],
///     args = (region,),
/// };
/// ```
#[macro_export]
macro_rules! offload {
    // Public entry point: seed the accumulator with the defaults, then consume
    // the user's fields one at a time via the internal `@munch` rules.
    ( $($field:ident = $val:expr),* $(,)? ) => {
        $crate::offload!(@munch
            [ $($field = $val),* ];
            kernel = NONE;
            grid_dim = ([1, 1, 1]);
            block_dim = ([1, 1, 1]);
            dyn_cache = (0);
            args = NONE
        );
    };

    // One rule per known field: store the value in its slot and recurse on the
    // remaining fields.
    (@munch [kernel = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => {
        $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = (SOME $val); grid_dim = $g; block_dim = $b; dyn_cache = $d; args = $a);
    };
    (@munch [grid_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => {
        $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = ($val); block_dim = $b; dyn_cache = $d; args = $a);
    };
    (@munch [block_dim = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => {
        $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = ($val); dyn_cache = $d; args = $a);
    };
    (@munch [dyn_cache = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => {
        $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; dyn_cache = ($val); args = $a);
    };
    (@munch [args = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => {
        $crate::offload!(@munch [$($rest_f = $rest_v),*]; kernel = $k; grid_dim = $g; block_dim = $b; dyn_cache = $d; args = (SOME $val));
    };

    // Any other identifier is not a recognised field. This rule must stay after
    // the per-field rules above, since it would match them too.
    (@munch [$invalid:ident = $val:expr $(, $rest_f:ident = $rest_v:expr)*]; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => {
        ::core::compile_error!(::core::concat!("unknown field ", ::core::stringify!($invalid)));
    };

    // All fields consumed: reject a missing required field, otherwise launch.
    (@munch []; kernel = NONE; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = $a:tt) => {
        ::core::compile_error!("missing `kernel`");
    };
    (@munch []; kernel = $k:tt; grid_dim = $g:tt; block_dim = $b:tt; dyn_cache = $d:tt; args = NONE) => {
        ::core::compile_error!("missing `args`");
    };
    (@munch []; kernel = (SOME $kernel:expr); grid_dim = ($grid_dim:expr); block_dim = ($block_dim:expr); dyn_cache = ($dyn_cache:expr); args = (SOME $args:expr)) => {
        // Absolute `::core` path so a local item named `core` at the call site
        // cannot shadow the intrinsic.
        ::core::intrinsics::offload::<_, _, ()>(
            $kernel,
            $grid_dim,
            $block_dim,
            $dyn_cache,
            $args,
        )
    };
}

/// Panic handler used when the crate is compiled for `nvptx64`.
///
/// The panic information is ignored and the function never returns: it spins
/// in an empty loop.
#[cfg(target_arch = "nvptx64")]
#[panic_handler]
fn panic(_info: &core::panic::PanicInfo) -> ! {
    // Nothing is reported; the panicking thread just loops forever.
    loop {}
}
145 changes: 145 additions & 0 deletions crates/rustc_offload_frontend/src/main.rs
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doing the tests here now, just not to mix the perf suite with random frontend examples

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lmk when it's ready for review.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be ready for another pass, quick summary of what i did

  • fix stencil case with separate input and output args so it's not UB
  • array example (saxpy)
  • RawRegion so the user doesn't need to worry abt different cases (&mut [T], &mut [T; N], etc)
  • i'm using Region even for not mutable elements, it doesn't introduce extra complexity to the implementation and it's something the users are gonna need to write kernels (we can also just expose indexes as cuda-oxide, but idk, to be consistent with our setup)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

before leaving, i'll use our safe frontend in rajaperf test cases, with that we should have enough variety of examples.

also, when writing those kernels, in case the design has any other flaws (user interface wise) i expect them to arise there

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can also just expose indexes

More the other way around: I don't think we can avoid that. People can always call the underlying intrinsics themselves. But I also don't think we should try to hide it, since I don't think we'll cover all cases. If 80% are expressible with our abstractions, that's (more than) good enough; people can (and should) add their own.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant inside the pattern, e.g. Linear1D::index() returning thread_x + block_x * block_dim_x, but yes, I agree.

Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#![allow(internal_features)]
#![allow(linker_messages)]
#![allow(improper_ctypes)]
#![allow(improper_gpu_kernel_arg)]
#![allow(improper_ctypes_definitions)]
#![feature(gpu_offload)]
#![cfg_attr(target_os = "linux", feature(core_intrinsics, offload))]
#![cfg_attr(target_arch = "nvptx64", feature(abi_gpu_kernel))]
#![cfg_attr(target_arch = "nvptx64", no_std)]
#![cfg_attr(target_arch = "nvptx64", no_main)]

#[cfg(target_os = "linux")]
extern crate libc;

use rustc_offload_frontend::offload_kernel;
use rustc_offload_frontend::partition::{Linear1D, Linear2D, Region, Stride2D};

#[cfg(target_os = "linux")]
use core::offload::offload::{PreloadMut, preload_mut};

#[cfg(target_arch = "nvptx64")]
use rustc_offload_frontend::partition::PartitioningStrategy;

// Kernel: stores 42.0 into the element handed out by `Region::get_mut`, when
// there is one. Presumably that is the element `Linear1D` maps to the calling
// thread; verify against the partition module.
#[offload_kernel]
fn linear1d(mut x: Region<f64, Linear1D>) {
    match x.get_mut() {
        Some(slot) => *slot = 42.0,
        // No element for this invocation: nothing to write.
        None => {}
    }
}

// Kernel: when `Region::get_mut` yields a view, writes 42.0 at positions
// (0, 0) and (1, 1) of that view, i.e. its diagonal.
#[offload_kernel]
fn stride2d(mut grid: Region<f64, Stride2D<2, 2, 4, 4, 8>>) {
    match grid.get_mut() {
        Some(mut tile) => {
            tile.set(0, 0, 42.0);
            tile.set(1, 1, 42.0);
        }
        // No view for this invocation: nothing to write.
        None => {}
    }
}

// Kernel: 3x3 box blur over a row-major grid that is 4 elements wide.
//
// Each output cell receives the sum of its in-bounds neighbours (itself
// included) divided by 9, so cells outside the grid count as zero.
//
// Neighbours are addressed by (row, column) and bounds-checked per axis.
// Adding `dy * 4 + dx` straight onto the linear index would wrap horizontally:
// column 0 with `dx = -1` would read the last column of the previous row, and
// the last column with `dx = +1` would read the first column of the next row.
#[offload_kernel]
fn conv_blur2d(input: &[f64], mut output: Region<f64, Linear2D<4>>) {
    if let Some(out_cell) = output.get_mut() {
        // Row width; matches the const parameter of `Linear2D<4>`.
        let width: isize = 4;
        let center = Linear2D::<4>::index() as isize;
        let row = center / width;
        let col = center % width;

        let mut sum = 0.0;

        for dy in -1..=1 {
            for dx in -1..=1 {
                let ny = row + dy;
                let nx = col + dx;
                // Reject rows above the grid and columns outside the row; rows
                // below the grid are rejected by the checked `get` that follows.
                if ny < 0 || nx < 0 || nx >= width {
                    continue;
                }
                if let Some(v) = input.get((ny * width + nx) as usize) {
                    sum += v;
                }
            }
        }

        *out_cell = sum / 9.0;
    }
}

// Kernel: single-precision `y = alpha * x + y` for one element.
//
// The `x` operand is read at `Linear1D::index()` and the `y` operand comes
// from `Region::get_mut`; nothing is written unless both are available.
#[offload_kernel]
fn saxpy_kernel(alpha: f32, x: &[f32], mut y: Region<f32, Linear1D>) {
    if let Some(&xi) = x.get(Linear1D::index()) {
        if let Some(yi) = y.get_mut() {
            *yi = alpha * xi + *yi;
        }
    }
}

// Host-side driver: launches each example kernel through `offload!` and checks
// the data it wrote. Every section follows the same steps:
//   1. wrap the host buffer with `preload_mut`,
//   2. build a `Region` over it and launch the kernel,
//   3. drop the `PreloadMut` (this ends the mutable borrow of the buffer;
//      presumably it is also where results are synchronised back to the
//      host, verify against `core::offload`),
//   4. assert on the host buffer.
#[cfg(target_os = "linux")]
fn main() {
    use rustc_offload_frontend::offload;

    // linear1d
    let mut x = [0.0f64; 256];
    let p: PreloadMut<[f64; 256]> = preload_mut(&mut x);
    let reg = Region::<'_, _, Linear1D>::from(&p);
    offload! {
        kernel = linear1d,
        grid_dim = [256, 1, 1],
        args = (reg,),
    };
    drop(p);
    for (i, v) in x.iter().enumerate() {
        assert_eq!(*v, 42.0, "linear1d: x[{i}]");
    }
    println!("::passed:: linear1d");

    // stride2d
    let mut blocks = [0.0; 64];
    let p: PreloadMut<[f64; 64]> = preload_mut(&mut blocks);
    let reg_stride = Region::<_, Stride2D<2, 2, 4, 4, 8>>::from(&p);
    offload! {
        kernel = stride2d,
        block_dim = [2, 2, 1],
        args = (reg_stride,),
    };
    drop(p);
    // thread (0, 0, 0) takes a 2x2 block and writes on the diagonal elements
    assert_eq!(blocks[0], 42.0);
    assert_eq!(blocks[9], 42.0);
    println!("::passed:: stride2d");

    // conv_blur2d
    let input = [
        0.0, 0.0, 0.0, 0.0, //
        0.0, 9.0, 9.0, 0.0, //
        0.0, 9.0, 9.0, 0.0, //
        0.0, 0.0, 0.0, 0.0, //
    ];
    let mut output = [0.0f64; 16];
    let p: PreloadMut<[f64; 16]> = preload_mut(&mut output);
    let reg_output = Region::<_, Linear2D<4>>::from(&p);
    offload! {
        kernel = conv_blur2d,
        block_dim = [4, 4, 1],
        args = (&input as &[f64], reg_output,),
    };
    drop(p);

    let expected = [
        1.0, 2.0, 2.0, 1.0, //
        2.0, 4.0, 4.0, 2.0, //
        2.0, 4.0, 4.0, 2.0, //
        1.0, 2.0, 2.0, 1.0, //
    ];
    assert_eq!(output, expected);
    println!("::passed:: conv_blur2d");

    // saxpy
    const N: usize = 512;
    let alpha: f32 = 2.5;
    let x: [f32; N] = [2.0; N];
    let mut y: [f32; N] = [1.0; N];
    let p: PreloadMut<[f32; N]> = preload_mut(&mut y);
    let reg_y = Region::<_, Linear1D>::from(&p);

    offload! {
        kernel = saxpy_kernel,
        // N is 512, so the narrowing cast to u32 is lossless.
        grid_dim = [N as u32, 1, 1],
        args = (alpha, &x as &[f32], reg_y,),
    };
    drop(p);

    // 2.5 * 2.0 + 1.0 == 6.0 exactly in f32.
    for (i, v) in y.iter().enumerate() {
        assert_eq!(*v, 6.0f32, "saxpy: y[{i}]");
    }
    println!("::passed:: saxpy");

    println!("all checks passed!");
}
Loading