Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion compiler/rustc_codegen_llvm/src/back/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::back::profiling::{
LlvmSelfProfiler, selfprofile_after_pass_callback, selfprofile_before_pass_callback,
};
use crate::builder::SBuilder;
use crate::builder::gpu_offload::scalar_width;
use crate::builder::gpu_helper::scalar_width;
use crate::common::AsCCharPtr;
use crate::errors::{
CopyBitcode, FromLlvmDiag, FromLlvmOptimizationDiag, LlvmError, ParseTargetMachineConfig,
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_codegen_llvm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::ops::Deref;

use rustc_ast::expand::typetree::FncTree;
pub(crate) mod autodiff;
pub(crate) mod gpu_helper;
pub(crate) mod gpu_offload;

use libc::{c_char, c_uint};
Expand Down
178 changes: 178 additions & 0 deletions compiler/rustc_codegen_llvm/src/builder/gpu_helper.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
use crate::SimpleCx;
use crate::builder::Builder;
use crate::llvm;
use crate::llvm::{Type, Value};
use rustc_abi::Align;
use rustc_codegen_ssa::MemFlags;
use rustc_codegen_ssa::common::TypeKind;
use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
use rustc_middle::bug;
use rustc_middle::ty::offload_meta::{OffloadMetadata, OffloadSize};

/// Returns the width, in bits, of the scalar LLVM type `ty`.
///
/// Integer types are measured with `int_width`; the floating-point kinds listed
/// below are measured with `float_width`. Calling this on any other type kind is
/// treated as a compiler bug.
pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
    let kind = cx.type_kind(ty);
    match kind {
        TypeKind::Integer => cx.int_width(ty),
        TypeKind::Half
        | TypeKind::Float
        | TypeKind::Double
        | TypeKind::X86_FP80
        | TypeKind::FP128
        | TypeKind::PPC_FP128 => {
            let float_bits = cx.float_width(ty);
            float_bits as u64
        }
        other => bug!("scalar_width was called on a non scalar type {other:?}"),
    }
}

/// Emits the computation of a payload size that is only known at runtime.
///
/// Only `OffloadSize::Slice` is handled: the value at `args[index + 1]` is taken as
/// the slice length, cast to `i64` with `intcast` (signed flag `false`) and
/// multiplied by `element_size`. Any other size kind is treated as a compiler bug.
///
/// Panics if `index + 1` is out of bounds for `args`.
fn get_runtime_size<'ll, 'tcx>(
    builder: &mut Builder<'_, 'll, 'tcx>,
    args: &[&'ll Value],
    index: usize,
    meta: &OffloadMetadata,
) -> &'ll Value {
    let OffloadSize::Slice { element_size } = meta.payload_size else {
        bug!("unexpected offload size {:?}", meta.payload_size)
    };

    // The length is read from the argument slot directly after `index`.
    let len = args[index + 1];
    let i64_ty = builder.cx.type_i64();
    let len_i64 = builder.intcast(len, i64_ty, false);
    let elem_size = builder.cx.get_const_i64(element_size);
    builder.mul(len_i64, elem_size)
}

// The indexing scheme into offload_{baseptrs,ptrs,sizes} is deliberately simplistic
// for now. It will probably be improved together with the GPU frontend PR.
/// Builds in-bounds GEPs with indices `[0, 0]` for `a1` and `a2` (both typed as `ty`).
///
/// When `is_dynamic` is set, a GEP of the same shape is built for `a4` (typed as
/// `ty2`); otherwise `a4` is passed through unchanged. The three results are
/// returned in the order `a1`, `a2`, `a4`.
pub(crate) fn get_geps<'ll, 'tcx>(
    builder: &mut Builder<'_, 'll, 'tcx>,
    ty: &'ll Type,
    ty2: &'ll Type,
    a1: &'ll Value,
    a2: &'ll Value,
    a4: &'ll Value,
    is_dynamic: bool,
) -> [&'ll Value; 3] {
    let zero = builder.cx.get_const_i32(0);
    let first_elem = [zero, zero];

    let first = builder.inbounds_gep(ty, a1, &first_elem);
    let second = builder.inbounds_gep(ty, a2, &first_elem);
    let third = match is_dynamic {
        true => builder.inbounds_gep(ty2, a4, &first_elem),
        false => a4,
    };

    [first, second, third]
}

/// Emits a call to `fn_to_call` (of function type `fn_ty`) with nine arguments.
///
/// In order, the arguments are: `s_ident_t`, the `i64` constant built from
/// `u64::MAX`, `num_args` as an `i32` constant, the three entries of `geps`,
/// `o_type`, and two null pointers. The value returned by the call is discarded.
pub(crate) fn generate_mapper_call<'ll, 'tcx>(
    builder: &mut Builder<'_, 'll, 'tcx>,
    geps: [&'ll Value; 3],
    o_type: &'ll Value,
    fn_to_call: &'ll Value,
    fn_ty: &'ll Type,
    num_args: u64,
    s_ident_t: &'ll Value,
) {
    let cx = builder.cx;
    let [gep0, gep1, gep2] = geps;
    let null = cx.const_null(cx.type_ptr());

    let call_args = [
        s_ident_t,
        cx.get_const_i64(u64::MAX),
        cx.get_const_i32(num_args),
        gep0,
        gep1,
        gep2,
        o_type,
        null,
        null,
    ];
    builder.call(fn_ty, None, None, fn_to_call, &call_args, None, None);
}

/// Emits the stack arrays that describe the arguments of an offloaded call and fills
/// them in.
///
/// Two `[ptr x N]` allocas (`.offload_baseptrs`, `.offload_ptrs`) are created past the
/// function's existing allocas, where `N = types.len()`. When `has_dynamic` is set, an
/// additional `[i64 x N]` alloca (`.offload_sizes`) is created and initialised by a
/// memcpy from `offload_sizes`; otherwise `offload_sizes` itself is used as the sizes
/// array. The builder is then moved back to the end of the block it was in on entry,
/// and one entry per argument is stored into each array.
///
/// Pointer arguments are used as-is. `Half`/`Float`/`Double`/`Integer` arguments are
/// widened to `i64` and spilled into a dedicated `addr` alloca. Any other type kind is
/// treated as a compiler bug.
///
/// Returns `(ty, ty2, a1, a2, a4)`: the pointer-array type, the size-array type, the
/// base-pointer array, the pointer array, and the sizes array (or `offload_sizes`).
///
/// Panics if `args` or `metadata` has fewer than `types.len()` entries.
pub(crate) fn preper_datatransfers<'ll, 'tcx>(
builder: &mut Builder<'_, 'll, 'tcx>,
args: &[&'ll Value],
types: &[&Type],
offload_sizes: &'ll Value,
metadata: &[OffloadMetadata],
has_dynamic: bool,
) -> (&'ll Type, &'ll Type, &'ll Value, &'ll Value, &'ll Value) {
let cx = builder.cx;
let num_args = types.len() as u64;
// Remember the current block so the builder can be moved back after the allocas.
let bb = builder.llbb();

// Step 0) Position the builder past the function's allocas so the arrays below are
// emitted there.
unsafe {
llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, builder.llfn());
}

let ty = cx.type_array(cx.type_ptr(), num_args);
// The base pointers are just the input pointers to the kernel, stored in a local alloca.
let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs");
// Ptrs are the result of a gep into the baseptr, at least for our trivial types.
let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
// These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
let ty2 = cx.type_array(cx.type_i64(), num_args);

// With dynamic sizes a local, writable copy of `offload_sizes` is made; otherwise the
// incoming value is used directly.
let a4 = if has_dynamic {
let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");

// NOTE(review): the copy length is derived from `args.len()` while `ty2` is sized by
// `types.len()`; this presumably relies on both slices having the same length -- confirm.
builder.memcpy(
alloc,
Align::EIGHT,
offload_sizes,
Align::EIGHT,
cx.get_const_i64(8 * args.len() as u64),
MemFlags::empty(),
None,
);

alloc
} else {
offload_sizes
};

// Step 1) Move the builder back to the end of the block it was in on entry.
unsafe {
llvm::LLVMPositionBuilderAtEnd(&builder.llbuilder, bb);
}

// Now we allocate once per function param, a copy to be passed to one of our maps.
let mut vals = vec![];
let mut geps = vec![];
let i32_0 = cx.get_const_i32(0);
for &v in args {
let ty = cx.val_ty(v);
let ty_kind = cx.type_kind(ty);
// `base_val` is stored into the base-pointer array; `gep_base` is the pointer the
// per-argument gep below is built from.
let (base_val, gep_base) = match ty_kind {
TypeKind::Pointer => (v, v),
TypeKind::Half | TypeKind::Float | TypeKind::Double | TypeKind::Integer => {
// FIXME(Sa4dUs): check for `f128` support, latest NVIDIA cards support it
let num_bits = scalar_width(cx, ty);

// Emit the spill slot past the allocas, then return to the current block.
let bb = builder.llbb();
unsafe {
llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, builder.llfn());
}
let addr = builder.direct_alloca(cx.type_i64(), Align::EIGHT, "addr");
unsafe {
llvm::LLVMPositionBuilderAtEnd(builder.llbuilder, bb);
}

// Reinterpret the scalar as an integer of the same bit width, zero-extend it to
// `i64` and store it into the slot.
let cast = builder.bitcast(v, cx.type_ix(num_bits));
let value = builder.zext(cast, cx.type_i64());
builder.store(value, addr, Align::EIGHT);
(value, addr)
}
other => bug!("offload does not support {other:?}"),
};

// A gep with index 0 over `f32` elements, starting at `gep_base`.
let gep = builder.inbounds_gep(cx.type_f32(), gep_base, &[i32_0]);

vals.push(base_val);
geps.push(gep);
}

// Fill slot `i` of each array with the values collected above.
for i in 0..num_args {
let idx = cx.get_const_i32(i);
let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]);
builder.store(vals[i as usize], gep1, Align::EIGHT);
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
builder.store(geps[i as usize], gep2, Align::EIGHT);

// Entries without a static size get their size computed at runtime and written into
// the sizes array.
if !matches!(metadata[i as usize].payload_size, OffloadSize::Static(_)) {
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
let size_val = get_runtime_size(builder, args, i as usize, &metadata[i as usize]);
builder.store(size_val, gep3, Align::EIGHT);
}
}
(ty, ty2, a1, a2, a4)
}
Loading
Loading