From fae2f07321fef096b77d841df654656f054aecd6 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 May 2026 16:52:05 -0700 Subject: [PATCH 1/5] Enable explicit data transfers to gpu --- compiler/rustc_codegen_llvm/src/back/write.rs | 2 +- compiler/rustc_codegen_llvm/src/builder.rs | 1 + .../src/builder/gpu_helper.rs | 178 ++++++++++++++++++ .../src/builder/gpu_offload.rs | 157 +-------------- compiler/rustc_codegen_llvm/src/intrinsic.rs | 101 +++++++++- compiler/rustc_codegen_ssa/src/mir/block.rs | 18 ++ .../rustc_codegen_ssa/src/traits/intrinsic.rs | 7 + compiler/rustc_hir/src/lang_items.rs | 6 + compiler/rustc_span/src/symbol.rs | 4 + library/core/src/offload/mod.rs | 28 +++ 10 files changed, 346 insertions(+), 156 deletions(-) create mode 100644 compiler/rustc_codegen_llvm/src/builder/gpu_helper.rs diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs index 4426f6ebb3c17..93ed54fd69fba 100644 --- a/compiler/rustc_codegen_llvm/src/back/write.rs +++ b/compiler/rustc_codegen_llvm/src/back/write.rs @@ -32,7 +32,7 @@ use crate::back::profiling::{ LlvmSelfProfiler, selfprofile_after_pass_callback, selfprofile_before_pass_callback, }; use crate::builder::SBuilder; -use crate::builder::gpu_offload::scalar_width; +use crate::builder::gpu_helper::scalar_width; use crate::common::AsCCharPtr; use crate::errors::{ CopyBitcode, FromLlvmDiag, FromLlvmOptimizationDiag, LlvmError, ParseTargetMachineConfig, diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 134bc5006dd00..e3078b2abbe25 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -4,6 +4,7 @@ use std::ops::Deref; use rustc_ast::expand::typetree::FncTree; pub(crate) mod autodiff; +pub(crate) mod gpu_helper; pub(crate) mod gpu_offload; use libc::{c_char, c_uint}; diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_helper.rs 
b/compiler/rustc_codegen_llvm/src/builder/gpu_helper.rs new file mode 100644 index 0000000000000..c18146a670019 --- /dev/null +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_helper.rs @@ -0,0 +1,178 @@ +use crate::SimpleCx; +use crate::builder::Builder; +use crate::llvm; +use crate::llvm::{Type, Value}; +use rustc_abi::Align; +use rustc_codegen_ssa::MemFlags; +use rustc_codegen_ssa::common::TypeKind; +use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods}; +use rustc_middle::bug; +use rustc_middle::ty::offload_meta::{OffloadMetadata, OffloadSize}; + +pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 { + match cx.type_kind(ty) { + TypeKind::Half + | TypeKind::Float + | TypeKind::Double + | TypeKind::X86_FP80 + | TypeKind::FP128 + | TypeKind::PPC_FP128 => cx.float_width(ty) as u64, + TypeKind::Integer => cx.int_width(ty), + other => bug!("scalar_width was called on a non scalar type {other:?}"), + } +} + +fn get_runtime_size<'ll, 'tcx>( + builder: &mut Builder<'_, 'll, 'tcx>, + args: &[&'ll Value], + index: usize, + meta: &OffloadMetadata, +) -> &'ll Value { + match meta.payload_size { + OffloadSize::Slice { element_size } => { + let length_idx = index + 1; + let length = args[length_idx]; + let length_i64 = builder.intcast(length, builder.cx.type_i64(), false); + builder.mul(length_i64, builder.cx.get_const_i64(element_size)) + } + _ => bug!("unexpected offload size {:?}", meta.payload_size), + } +} + +// For now we have a very simplistic indexing scheme into our +// offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our gpu frontend pr. 
+pub(crate) fn get_geps<'ll, 'tcx>( + builder: &mut Builder<'_, 'll, 'tcx>, + ty: &'ll Type, + ty2: &'ll Type, + a1: &'ll Value, + a2: &'ll Value, + a4: &'ll Value, + is_dynamic: bool, +) -> [&'ll Value; 3] { + let cx = builder.cx; + let i32_0 = cx.get_const_i32(0); + + let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]); + let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]); + let gep3 = if is_dynamic { builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]) } else { a4 }; + [gep1, gep2, gep3] +} + +pub(crate) fn generate_mapper_call<'ll, 'tcx>( + builder: &mut Builder<'_, 'll, 'tcx>, + geps: [&'ll Value; 3], + o_type: &'ll Value, + fn_to_call: &'ll Value, + fn_ty: &'ll Type, + num_args: u64, + s_ident_t: &'ll Value, +) { + let cx = builder.cx; + let nullptr = cx.const_null(cx.type_ptr()); + let i64_max = cx.get_const_i64(u64::MAX); + let num_args = cx.get_const_i32(num_args); + let args = + vec![s_ident_t, i64_max, num_args, geps[0], geps[1], geps[2], o_type, nullptr, nullptr]; + builder.call(fn_ty, None, None, fn_to_call, &args, None, None); +} + +pub(crate) fn preper_datatransfers<'ll, 'tcx>( + builder: &mut Builder<'_, 'll, 'tcx>, + args: &[&'ll Value], + types: &[&Type], + offload_sizes: &'ll Value, + metadata: &[OffloadMetadata], + has_dynamic: bool, +) -> (&'ll Type, &'ll Type, &'ll Value, &'ll Value, &'ll Value) { + let cx = builder.cx; + let num_args = types.len() as u64; + let bb = builder.llbb(); + + // Step 0) + unsafe { + llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, builder.llfn()); + } + + let ty = cx.type_array(cx.type_ptr(), num_args); + // Baseptr are just the input pointer to the kernel, stored in a local alloca + let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs"); + // Ptrs are the result of a gep into the baseptr, at least for our trivial types. + let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs"); + // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16. 
+ let ty2 = cx.type_array(cx.type_i64(), num_args); + + let a4 = if has_dynamic { + let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes"); + + builder.memcpy( + alloc, + Align::EIGHT, + offload_sizes, + Align::EIGHT, + cx.get_const_i64(8 * args.len() as u64), + MemFlags::empty(), + None, + ); + + alloc + } else { + offload_sizes + }; + + // Step 1) + unsafe { + llvm::LLVMPositionBuilderAtEnd(&builder.llbuilder, bb); + } + + // Now we allocate once per function param, a copy to be passed to one of our maps. + let mut vals = vec![]; + let mut geps = vec![]; + let i32_0 = cx.get_const_i32(0); + for &v in args { + let ty = cx.val_ty(v); + let ty_kind = cx.type_kind(ty); + let (base_val, gep_base) = match ty_kind { + TypeKind::Pointer => (v, v), + TypeKind::Half | TypeKind::Float | TypeKind::Double | TypeKind::Integer => { + // FIXME(Sa4dUs): check for `f128` support, latest NVIDIA cards support it + let num_bits = scalar_width(cx, ty); + + let bb = builder.llbb(); + unsafe { + llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, builder.llfn()); + } + let addr = builder.direct_alloca(cx.type_i64(), Align::EIGHT, "addr"); + unsafe { + llvm::LLVMPositionBuilderAtEnd(builder.llbuilder, bb); + } + + let cast = builder.bitcast(v, cx.type_ix(num_bits)); + let value = builder.zext(cast, cx.type_i64()); + builder.store(value, addr, Align::EIGHT); + (value, addr) + } + other => bug!("offload does not support {other:?}"), + }; + + let gep = builder.inbounds_gep(cx.type_f32(), gep_base, &[i32_0]); + + vals.push(base_val); + geps.push(gep); + } + + for i in 0..num_args { + let idx = cx.get_const_i32(i); + let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]); + builder.store(vals[i as usize], gep1, Align::EIGHT); + let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]); + builder.store(geps[i as usize], gep2, Align::EIGHT); + + if !matches!(metadata[i as usize].payload_size, OffloadSize::Static(_)) { + let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, 
idx]); + let size_val = get_runtime_size(builder, args, i as usize, &metadata[i as usize]); + builder.store(size_val, gep3, Align::EIGHT); + } + } + (ty, ty2, a1, a2, a4) +} diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index 0b009321802cf..a89040efcf09f 100644 --- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -3,14 +3,13 @@ use std::ffi::CString; use bitflags::Flags; use llvm::Linkage::*; use rustc_abi::Align; -use rustc_codegen_ssa::MemFlags; -use rustc_codegen_ssa::common::TypeKind; use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue}; use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods}; use rustc_middle::bug; use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize}; use crate::builder::Builder; +use crate::builder::gpu_helper::*; use crate::common::CodegenCx; use crate::llvm::AttributePlace::Function; use crate::llvm::{self, Linkage, Type, Value}; @@ -534,36 +533,6 @@ fn declare_offload_fn<'ll>( ) } -pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 { - match cx.type_kind(ty) { - TypeKind::Half - | TypeKind::Float - | TypeKind::Double - | TypeKind::X86_FP80 - | TypeKind::FP128 - | TypeKind::PPC_FP128 => cx.float_width(ty) as u64, - TypeKind::Integer => cx.int_width(ty), - other => bug!("scalar_width was called on a non scalar type {other:?}"), - } -} - -fn get_runtime_size<'ll, 'tcx>( - builder: &mut Builder<'_, 'll, 'tcx>, - args: &[&'ll Value], - index: usize, - meta: &OffloadMetadata, -) -> &'ll Value { - match meta.payload_size { - OffloadSize::Slice { element_size } => { - let length_idx = index + 1; - let length = args[length_idx]; - let length_i64 = builder.intcast(length, builder.cx.type_i64(), false); - builder.mul(length_i64, builder.cx.get_const_i64(element_size)) - } - _ => bug!("unexpected offload size {:?}", 
meta.payload_size), - } -} - // For each kernel *call*, we now use some of our previous declared globals to move data to and from // the gpu. For now, we only handle the data transfer part of it. // If two consecutive kernels use the same memory, we still move it to the host and back to the gpu. @@ -613,136 +582,21 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>( let end_mapper_decl = offload_globals.end_mapper; let fn_ty = offload_globals.mapper_fn_ty; + let (ty, ty2, a1, a2, a4) = + preper_datatransfers(builder, args, types, offload_sizes, metadata, has_dynamic); let num_args = types.len() as u64; - let bb = builder.llbb(); + assert_eq!(num_args as usize, args.len()); - // Step 0) + let bb = builder.llbb(); unsafe { llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, builder.llfn()); } - - let ty = cx.type_array(cx.type_ptr(), num_args); - // Baseptr are just the input pointer to the kernel, stored in a local alloca - let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs"); - // Ptrs are the result of a gep into the baseptr, at least for our trivial types. - let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs"); - // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16. - let ty2 = cx.type_array(cx.type_i64(), num_args); - - let a4 = if has_dynamic { - let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes"); - - builder.memcpy( - alloc, - Align::EIGHT, - offload_sizes, - Align::EIGHT, - cx.get_const_i64(8 * args.len() as u64), - MemFlags::empty(), - None, - ); - - alloc - } else { - offload_sizes - }; - //%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args"); - - // Step 1) unsafe { llvm::LLVMPositionBuilderAtEnd(&builder.llbuilder, bb); } - // Now we allocate once per function param, a copy to be passed to one of our maps. 
- let mut vals = vec![]; - let mut geps = vec![]; - let i32_0 = cx.get_const_i32(0); - for &v in args { - let ty = cx.val_ty(v); - let ty_kind = cx.type_kind(ty); - let (base_val, gep_base) = match ty_kind { - TypeKind::Pointer => (v, v), - TypeKind::Half | TypeKind::Float | TypeKind::Double | TypeKind::Integer => { - // FIXME(Sa4dUs): check for `f128` support, latest NVIDIA cards support it - let num_bits = scalar_width(cx, ty); - - let bb = builder.llbb(); - unsafe { - llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, builder.llfn()); - } - let addr = builder.direct_alloca(cx.type_i64(), Align::EIGHT, "addr"); - unsafe { - llvm::LLVMPositionBuilderAtEnd(builder.llbuilder, bb); - } - - let cast = builder.bitcast(v, cx.type_ix(num_bits)); - let value = builder.zext(cast, cx.type_i64()); - builder.store(value, addr, Align::EIGHT); - (value, addr) - } - other => bug!("offload does not support {other:?}"), - }; - - let gep = builder.inbounds_gep(cx.type_f32(), gep_base, &[i32_0]); - - vals.push(base_val); - geps.push(gep); - } - - for i in 0..num_args { - let idx = cx.get_const_i32(i); - let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]); - builder.store(vals[i as usize], gep1, Align::EIGHT); - let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]); - builder.store(geps[i as usize], gep2, Align::EIGHT); - - if !matches!(metadata[i as usize].payload_size, OffloadSize::Static(_)) { - let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]); - let size_val = get_runtime_size(builder, args, i as usize, &metadata[i as usize]); - builder.store(size_val, gep3, Align::EIGHT); - } - } - - // For now we have a very simplistic indexing scheme into our - // offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our gpu frontend pr. 
- fn get_geps<'ll, 'tcx>( - builder: &mut Builder<'_, 'll, 'tcx>, - ty: &'ll Type, - ty2: &'ll Type, - a1: &'ll Value, - a2: &'ll Value, - a4: &'ll Value, - is_dynamic: bool, - ) -> [&'ll Value; 3] { - let cx = builder.cx; - let i32_0 = cx.get_const_i32(0); - - let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]); - let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]); - let gep3 = if is_dynamic { builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]) } else { a4 }; - [gep1, gep2, gep3] - } - - fn generate_mapper_call<'ll, 'tcx>( - builder: &mut Builder<'_, 'll, 'tcx>, - geps: [&'ll Value; 3], - o_type: &'ll Value, - fn_to_call: &'ll Value, - fn_ty: &'ll Type, - num_args: u64, - s_ident_t: &'ll Value, - ) { - let cx = builder.cx; - let nullptr = cx.const_null(cx.type_ptr()); - let i64_max = cx.get_const_i64(u64::MAX); - let num_args = cx.get_const_i32(num_args); - let args = - vec![s_ident_t, i64_max, num_args, geps[0], geps[1], geps[2], o_type, nullptr, nullptr]; - builder.call(fn_ty, None, None, fn_to_call, &args, None, None); - } - // Step 2) let s_ident_t = offload_globals.ident_t_global; let geps = get_geps(builder, ty, ty2, a1, a2, a4, has_dynamic); @@ -767,6 +621,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>( // Step 3) // Here we fill the KernelArgsTy, see the documentation above + let i32_0 = cx.get_const_i32(0); for (i, value) in values.iter().enumerate() { let ptr = builder.inbounds_gep(tgt_kernel_decl, a5, &[i32_0, cx.get_const_i32(i as u64)]); let name = std::ffi::CString::new(value.1).unwrap(); diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 1c7b415fd04c7..7f405ba461ae6 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -34,15 +34,14 @@ use tracing::debug; use crate::abi::FnAbiLlvmExt; use crate::builder::Builder; use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call}; -use crate::builder::gpu_offload::{ - 
OffloadKernelDims, gen_call_handling, gen_define_handling, register_offload, -}; +use crate::builder::gpu_offload::*; use crate::context::CodegenCx; use crate::declare::declare_raw_fn; use crate::errors::{ AutoDiffWithoutEnable, AutoDiffWithoutLto, IntrinsicSignatureMismatch, IntrinsicWrongArch, OffloadWithoutEnable, OffloadWithoutFatLTO, UnknownIntrinsic, }; +use crate::intrinsic::ty::offload_meta::OffloadSize; use crate::llvm::{self, Type, Value}; use crate::type_of::LayoutLlvmExt; use crate::va_arg::emit_va_arg; @@ -171,6 +170,24 @@ fn call_simple_intrinsic<'ll, 'tcx>( } impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { + fn codegen_offload_preload_call( + &mut self, + instance: ty::Instance<'tcx>, + args: &[OperandRef<'tcx, &'ll llvm::Value>], + is_mut: bool, + ) { + let tcx = self.tcx; + if tcx.sess.opts.unstable_opts.offload.is_empty() { + let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable); + } + + if tcx.sess.lto() != rustc_session::config::Lto::Fat { + let _ = tcx.dcx().emit_warn(OffloadWithoutFatLTO); + } + + codegen_offload_preload(self, tcx, instance, args); + } + fn codegen_intrinsic_call( &mut self, instance: ty::Instance<'tcx>, @@ -1894,6 +1911,82 @@ fn codegen_autodiff<'ll, 'tcx>( ); } +// For each PreLoad *call*, we now use some of our previous declared globals to move data to the gpu. +// For now, we only handle the data transfer part of it. Consecutive calls become a no-op on the +// LLVM side. +// +// Current steps: +// 0. Alloca some variables for the following steps +// 1. set insert point before PreLoad call. +// 2. generate all the GEPS and stores, to be used in 3) +// 3. generate __tgt_target_data_begin calls to move data to the GPU +// +// unchanged: keep kernel call. Later move the kernel to the GPU +// +// 4. set insert point after kernel call. +// 5. generate all the GEPS and stores, to be used in 6) +// 6. 
generate __tgt_target_data_end calls to move data from the GPU +fn codegen_offload_preload<'ll, 'tcx>( + bx: &mut Builder<'_, 'll, 'tcx>, + tcx: TyCtxt<'tcx>, + _instance: ty::Instance<'tcx>, + args: &[OperandRef<'tcx, &'ll Value>], +) { + dbg!("Starting the preload handling!"); + let cx = bx.cx; + register_offload(cx); + + let arg: &OperandRef<'_, &'ll Value> = &args[0]; + let args = match arg.val { + OperandValue::Immediate(val) => vec![val], + _ => bug!("not yet handled"), + }; + + let arg_ty = arg.layout.ty; + + let ty::Ref(_, pointee_ty, _) = *arg_ty.kind() else { + bug!("expected preload argument to be a reference, got {arg_ty:?}"); + }; + + let meta = OffloadMetadata::from_ty(tcx, pointee_ty); + let metadata = &[meta]; + let types = cx.layout_of(pointee_ty).llvm_type(cx); + + let offload_globals_ref = cx.offload_globals.borrow(); + let offload_globals = match offload_globals_ref.as_ref() { + Some(globals) => globals, + None => { + dbg!("Have to initialize offload? This is a bug!"); + // Offload is not initialized, cannot continue + return; + } + }; + dbg!("asdf"); + //let target_symbol = "asdf_I_ll_nameclash".to_owned(); + let target_symbol = cx.generate_local_symbol_name(""); + let offload_data = gen_define_handling(&cx, metadata, target_symbol, offload_globals); + let has_dynamic = metadata.iter().any(|m| !matches!(m.payload_size, OffloadSize::Static(_))); + let (ty, ty2, a1, a2, a4) = crate::builder::gpu_helper::preper_datatransfers( + bx, + &args, + &[types], + offload_data.offload_sizes, + metadata, + has_dynamic, + ); + let geps = crate::builder::gpu_helper::get_geps(bx, ty, ty2, a1, a2, a4, has_dynamic); + + crate::builder::gpu_helper::generate_mapper_call( + bx, + geps, + offload_data.memtransfer_begin, + offload_globals.begin_mapper, + offload_globals.mapper_fn_ty, + 1, + offload_globals.ident_t_global, + ); +} + // Generates the LLVM code to offload a Rust function to a target device (e.g., GPU). 
// For each kernel call, it generates the necessary globals (including metadata such as // size and pass mode), manages memory mapping to and from the device, handles all @@ -1905,6 +1998,7 @@ fn codegen_offload<'ll, 'tcx>( args: &[OperandRef<'tcx, &'ll Value>], ) { let cx = bx.cx; + register_offload(cx); let fn_args = instance.args; let (target_id, target_args) = match fn_args.into_type_list(tcx)[0].kind() { @@ -1960,7 +2054,6 @@ fn codegen_offload<'ll, 'tcx>( return; } }; - register_offload(cx); let offload_data = gen_define_handling(&cx, &metadata, target_symbol, offload_globals); gen_call_handling( bx, diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs index b6b95c5f12aae..8320bbb3efdd1 100644 --- a/compiler/rustc_codegen_ssa/src/mir/block.rs +++ b/compiler/rustc_codegen_ssa/src/mir/block.rs @@ -919,6 +919,24 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { fn_span, ); + if Some(instance.def_id()) == bx.tcx().lang_items().preload_fn() { + let cg_args: Vec<_> = + args.iter().map(|arg| self.codegen_operand(bx, &arg.node)).collect(); + + bx.codegen_offload_preload_call( + instance, &cg_args, false, // immutable preload + ); + } + + if Some(instance.def_id()) == bx.tcx().lang_items().preload_mut_fn() { + let cg_args: Vec<_> = + args.iter().map(|arg| self.codegen_operand(bx, &arg.node)).collect(); + + bx.codegen_offload_preload_call( + instance, &cg_args, true, // mutable preload + ); + } + match instance.def { // We don't need AsyncDropGlueCtorShim here because it is not `noop func`, // it is `func returning noop future` diff --git a/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs b/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs index dcd4e722a27a8..d6d5f43ca952a 100644 --- a/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs @@ -31,6 +31,13 @@ pub trait IntrinsicCallBuilderMethods<'tcx>: BackendTypes { span: Span, ) -> 
IntrinsicResult<'tcx, Self::Value>; + fn codegen_offload_preload_call( + &mut self, + instance: ty::Instance<'tcx>, + args: &[OperandRef<'tcx, Self::Value>], + is_mut: bool, + ); + fn codegen_llvm_intrinsic_call( &mut self, instance: ty::Instance<'tcx>, diff --git a/compiler/rustc_hir/src/lang_items.rs b/compiler/rustc_hir/src/lang_items.rs index 4a3615e5421fe..6780fd4e77e39 100644 --- a/compiler/rustc_hir/src/lang_items.rs +++ b/compiler/rustc_hir/src/lang_items.rs @@ -325,6 +325,12 @@ language_item_table! { DropGlue, sym::drop_glue, drop_glue_fn, Target::Fn, GenericRequirement::Exact(1); AllocLayout, sym::alloc_layout, alloc_layout, Target::Struct, GenericRequirement::None; + // Compiler-generated mapper functions for gpu offloading + PreloadStruct, sym::preload_type, preload_type, Target::Struct, GenericRequirement::None; + PreloadMutStruct, sym::preload_mut_type, preload_mut_type, Target::Struct, GenericRequirement::None; + PreloadFn, sym::preload, preload_fn, Target::Fn, GenericRequirement::None; + PreloadMutFn, sym::preload_mut, preload_mut_fn, Target::Fn, GenericRequirement::None; + /// For all binary crates without `#![no_main]`, Rust will generate a "main" function. /// The exact name and signature are target-dependent. The "main" function will invoke /// this lang item, passing it the `argc` and `argv` (or null, if those don't exist diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 7263680c302f1..cdcac84708ec5 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1563,6 +1563,10 @@ symbols! 
{ prefetch_write_instruction, prefix_nops, preg, + preload, + preload_mut, + preload_type, + preload_mut_type, prelude, prelude_import, preserves_flags, diff --git a/library/core/src/offload/mod.rs b/library/core/src/offload/mod.rs index 164bbb9f0047d..fa2835a5a6c25 100644 --- a/library/core/src/offload/mod.rs +++ b/library/core/src/offload/mod.rs @@ -3,3 +3,31 @@ pub use crate::macros::builtin::offload_kernel; #[unstable(feature = "gpu_offload", issue = "131513")] pub use crate::offload; + +use crate::marker::PhantomData; + +#[lang = "preload_type"] +#[unstable(feature = "offload", issue = "124509")] +pub struct Preload<'a, T: ?Sized> { + cpu_ptr: *const T, + _marker: PhantomData<&'a T>, +} + +#[lang = "preload_mut_type"] +#[unstable(feature = "offload", issue = "124509")] +pub struct PreloadMut<'a, T: ?Sized> { + cpu_ptr: *mut T, + _marker: PhantomData<&'a mut T>, +} + +#[lang = "preload"] +#[unstable(feature = "offload", issue = "124509")] +pub fn preload<'a, T: ?Sized>(x: &'a T) -> Preload<'a, T> { + Preload { cpu_ptr: x as *const T, _marker: PhantomData } +} + +#[lang = "preload_mut"] +#[unstable(feature = "offload", issue = "124509")] +pub fn preload_mut<'a, T: ?Sized>(x: &'a mut T) -> PreloadMut<'a, T> { + PreloadMut { cpu_ptr: x as *mut T, _marker: PhantomData } +} From 6c8bec9c2fe72fa4e83c76b6e26b1354e9bf1e14 Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 May 2026 18:36:24 -0700 Subject: [PATCH 2/5] impl drop for PreloadMut, which must returned the mutated value --- .../src/builder/gpu_offload.rs | 2 + compiler/rustc_codegen_llvm/src/intrinsic.rs | 90 ++++++++++++++++++- compiler/rustc_codegen_ssa/src/mir/block.rs | 16 +++- .../rustc_codegen_ssa/src/traits/intrinsic.rs | 8 +- library/core/src/offload/mod.rs | 10 +++ 5 files changed, 123 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs index a89040efcf09f..d3412b64cdc99 100644 --- 
a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs +++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs @@ -446,6 +446,7 @@ pub(crate) fn gen_define_handling<'ll>( let valid_begin_mappings = MappingFlags::TO | MappingFlags::LITERAL | MappingFlags::IMPLICIT; let transfer_to: Vec = transfer.iter().map(|m| m.intersection(valid_begin_mappings).bits()).collect(); + dbg!(&transfer); let transfer_from: Vec = transfer.iter().map(|m| m.intersection(MappingFlags::FROM).bits()).collect(); let valid_kernel_mappings = MappingFlags::LITERAL | MappingFlags::IMPLICIT; @@ -469,6 +470,7 @@ pub(crate) fn gen_define_handling<'ll>( add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.begin"), &transfer_to); let memtransfer_kernel = add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.kernel"), &transfer_kernel); + dbg!(&transfer_from); let memtransfer_end = add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.end"), &transfer_from); diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 7f405ba461ae6..d6c6d90eafb7e 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -19,7 +19,7 @@ use rustc_hir::def_id::LOCAL_CRATE; use rustc_hir::find_attr; use rustc_middle::mir::BinOp; use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt, HasTypingEnv, LayoutOf}; -use rustc_middle::ty::offload_meta::OffloadMetadata; +use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata}; use rustc_middle::ty::{self, GenericArgsRef, Instance, SimdAlign, Ty, TyCtxt, TypingEnv}; use rustc_middle::{bug, span_bug}; use rustc_session::config::CrateType; @@ -188,6 +188,17 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { codegen_offload_preload(self, tcx, instance, args); } + fn codegen_offload_preload_mut_drop( + &mut self, + preload_ty: Ty<'tcx>, + place: PlaceRef<'tcx, &'ll llvm::Value>, + ) { + let tcx = self.tcx; + 
dbg!("Dropping PreloadMut; emit offload end mapper"); + + codegen_offload_preload_mut_drop(self, tcx, preload_ty, place); + } + fn codegen_intrinsic_call( &mut self, instance: ty::Instance<'tcx>, @@ -1911,6 +1922,83 @@ fn codegen_autodiff<'ll, 'tcx>( ); } +fn codegen_offload_preload_mut_drop<'ll, 'tcx>( + bx: &mut Builder<'_, 'll, 'tcx>, + tcx: TyCtxt<'tcx>, + preload_ty: Ty<'tcx>, + place: PlaceRef<'tcx, &'ll llvm::Value>, +) { + let cx = bx.cx; + dbg!("Starting the PreloadMut drop handling!"); + // PreloadMut<'a, T> -> extract T. + let ty::Adt(_adt_def, generic_args) = preload_ty.kind() else { + bug!("expected PreloadMut ADT, got {preload_ty:?}"); + }; + + // This should be the `T` parameter of PreloadMut<'a, T>. + // If this indexes the lifetime in your tree, use the correct type arg index + // or `generic_args.types().next().unwrap()`. + let pointee_ty: Ty<'tcx> = + generic_args.types().next().unwrap_or_else(|| bug!("PreloadMut without type parameter")); + + // Load field 0: `cpu_ptr: *mut T`. + let cpu_ptr_place = place.project_field(bx, 0); + dbg!(&cpu_ptr_place); + let cpu_ptr_operand = bx.load_operand(cpu_ptr_place); + dbg!(&cpu_ptr_operand); + + let args: Vec<&'ll Value> = match cpu_ptr_operand.val { + OperandValue::Immediate(ptr) => vec![ptr], + OperandValue::Pair(_data, _meta) => { + bug!("unsized PreloadMut drop not handled yet") + } + _ => bug!("unexpected PreloadMut cpu_ptr operand"), + }; + + let mut meta = OffloadMetadata::from_ty(tcx, pointee_ty); + // We end a mut Mapper. Unless the user never mutated a mut variable passed in a mutable way, we + // must return it from the device to update the host version. If they never mutated it, they + // surely got a clippy or rustc warning, so it's up to them for wasting time. 
+ meta.mode |= MappingFlags::FROM; + dbg!(&meta); + let metadata: &[OffloadMetadata; 1] = &[meta]; + + let types: &Type = cx.layout_of(pointee_ty).llvm_type(cx); + + let offload_globals_ref = cx.offload_globals.borrow(); + let offload_globals = match offload_globals_ref.as_ref() { + Some(globals) => globals, + None => { + dbg!("Have to initialize offload? This is a bug!"); + return; + } + }; + + let target_symbol = cx.generate_local_symbol_name(""); + dbg!("done for now"); + let offload_data = gen_define_handling(&cx, metadata, target_symbol, offload_globals); + let has_dynamic = metadata.iter().any(|m| !matches!(m.payload_size, OffloadSize::Static(_))); + let (ty, ty2, a1, a2, a4) = crate::builder::gpu_helper::preper_datatransfers( + bx, + &args, + &[types], + offload_data.offload_sizes, + metadata, + has_dynamic, + ); + let geps = crate::builder::gpu_helper::get_geps(bx, ty, ty2, a1, a2, a4, has_dynamic); + + crate::builder::gpu_helper::generate_mapper_call( + bx, + geps, + offload_data.memtransfer_end, + offload_globals.end_mapper, + offload_globals.mapper_fn_ty, + 1, + offload_globals.ident_t_global, + ); +} + // For each PreLoad *call*, we now use some of our previous declared globals to move data to the gpu. // For now, we only handle the data transfer part of it. Consecutive calls become a no-op on the // LLVM side. 
diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs index 8320bbb3efdd1..9204c87d17466 100644 --- a/compiler/rustc_codegen_ssa/src/mir/block.rs +++ b/compiler/rustc_codegen_ssa/src/mir/block.rs @@ -9,7 +9,7 @@ use rustc_lint_defs::builtin::TAIL_CALL_TRACK_CALLER; use rustc_middle::mir::{self, AssertKind, InlineAsmMacro, SwitchTargets, UnwindTerminateReason}; use rustc_middle::ty::layout::{HasTyCtxt, LayoutOf, ValidityRequirement}; use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths}; -use rustc_middle::ty::{self, Instance, Ty, TypeVisitableExt}; +use rustc_middle::ty::{self, Instance, Ty, TyCtxt, TypeVisitableExt}; use rustc_middle::{bug, span_bug}; use rustc_session::config::OptLevel; use rustc_span::{Span, Spanned}; @@ -27,6 +27,14 @@ use crate::mir::IntrinsicResult; use crate::traits::*; use crate::{MemFlags, meth}; +fn is_preload_mut_type<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> bool { + let ty::Adt(adt_def, _) = ty.kind() else { + return false; + }; + + Some(adt_def.did()) == tcx.lang_items().preload_mut_type() +} + // Indicates if we are in the middle of merging a BB's successor into it. This // can happen when BB jumps directly to its successor and the successor has no // other predecessors. 
@@ -604,6 +612,12 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { ) -> MergingSucc { let ty = location.ty(self.mir, bx.tcx()).ty; let ty = self.monomorphize(ty); + + if is_preload_mut_type(bx.tcx(), ty) { + let place = self.codegen_place(bx, location.as_ref()); + + bx.codegen_offload_preload_mut_drop(ty, place); + } let drop_fn = Instance::resolve_drop_glue(bx.tcx(), ty); if let ty::InstanceKind::DropGlue(_, None) = drop_fn.def { diff --git a/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs b/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs index d6d5f43ca952a..c6e24cae0b000 100644 --- a/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs @@ -5,7 +5,7 @@ use super::BackendTypes; use crate::RetagInfo; use crate::mir::IntrinsicResult; use crate::mir::operand::OperandRef; -use crate::mir::place::PlaceValue; +use crate::mir::place::{PlaceRef, PlaceValue}; pub trait IntrinsicCallBuilderMethods<'tcx>: BackendTypes { /// Higher-level interface to emitting calls to intrinsics @@ -38,6 +38,12 @@ pub trait IntrinsicCallBuilderMethods<'tcx>: BackendTypes { is_mut: bool, ); + fn codegen_offload_preload_mut_drop( + &mut self, + preload_ty: ty::Ty<'tcx>, + place: PlaceRef<'tcx, Self::Value>, + ); + fn codegen_llvm_intrinsic_call( &mut self, instance: ty::Instance<'tcx>, diff --git a/library/core/src/offload/mod.rs b/library/core/src/offload/mod.rs index fa2835a5a6c25..5fc5bc13adc3d 100644 --- a/library/core/src/offload/mod.rs +++ b/library/core/src/offload/mod.rs @@ -31,3 +31,13 @@ pub fn preload<'a, T: ?Sized>(x: &'a T) -> Preload<'a, T> { pub fn preload_mut<'a, T: ?Sized>(x: &'a mut T) -> PreloadMut<'a, T> { PreloadMut { cpu_ptr: x as *mut T, _marker: PhantomData } } + +impl<T: ?Sized> Drop for PreloadMut<'_, T> { + fn drop(&mut self) { + // Intentionally empty. + // + // This exists so MIR creates Drop terminators for PreloadMut. 
+ // rustc codegen intercepts those terminators and emits the + // offload return mapper. + } +} From 198c71eeea364e19e7a6d070e8738692da09c1ef Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 May 2026 18:56:21 -0700 Subject: [PATCH 3/5] Also handle drop for Preload --- compiler/rustc_codegen_llvm/src/intrinsic.rs | 15 +++++++++++---- compiler/rustc_codegen_ssa/src/mir/block.rs | 15 ++++++++++++++- .../rustc_codegen_ssa/src/traits/intrinsic.rs | 3 ++- library/core/src/offload/mod.rs | 10 ++++++++++ 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index d6c6d90eafb7e..b44b3a5fd78d3 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -188,15 +188,16 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { codegen_offload_preload(self, tcx, instance, args); } - fn codegen_offload_preload_mut_drop( + fn codegen_offload_preload_drop( &mut self, preload_ty: Ty<'tcx>, place: PlaceRef<'tcx, &'ll llvm::Value>, + is_mut: bool, ) { let tcx = self.tcx; dbg!("Dropping PreloadMut; emit offload end mapper"); - codegen_offload_preload_mut_drop(self, tcx, preload_ty, place); + codegen_offload_preload_drop(self, tcx, preload_ty, place, is_mut); } fn codegen_intrinsic_call( @@ -1922,11 +1923,12 @@ fn codegen_autodiff<'ll, 'tcx>( ); } -fn codegen_offload_preload_mut_drop<'ll, 'tcx>( +fn codegen_offload_preload_drop<'ll, 'tcx>( bx: &mut Builder<'_, 'll, 'tcx>, tcx: TyCtxt<'tcx>, preload_ty: Ty<'tcx>, place: PlaceRef<'tcx, &'ll llvm::Value>, + is_mut: bool, ) { let cx = bx.cx; dbg!("Starting the PreloadMut drop handling!"); @@ -1959,7 +1961,12 @@ fn codegen_offload_preload_mut_drop<'ll, 'tcx>( // We end a mut Mapper. Unless the user never mutated a mut variable passed in a mutable way, we // must return it from the device to update the host version. 
If they never mutated it, they + // surely got a clippy or rustc warning, so it's up to them for wasting time. - meta.mode |= MappingFlags::FROM; + if is_mut { + meta.mode |= MappingFlags::FROM; + } else { + // We still want the refcounter to go down, so the runtime knows when it can free the data. + meta.mode |= MappingFlags::NONE; + } dbg!(&meta); let metadata: &[OffloadMetadata; 1] = &[meta]; diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs index 9204c87d17466..61ad6bfaf3067 100644 --- a/compiler/rustc_codegen_ssa/src/mir/block.rs +++ b/compiler/rustc_codegen_ssa/src/mir/block.rs @@ -35,6 +35,14 @@ fn is_preload_mut_type<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> bool { Some(adt_def.did()) == tcx.lang_items().preload_mut_type() } +fn is_preload_type<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> bool { + let ty::Adt(adt_def, _) = ty.kind() else { + return false; + }; + + Some(adt_def.did()) == tcx.lang_items().preload_type() +} + // Indicates if we are in the middle of merging a BB's successor into it. This // can happen when BB jumps directly to its successor and the successor has no // other predecessors. 
@@ -616,7 +624,12 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { if is_preload_mut_type(bx.tcx(), ty) { let place = self.codegen_place(bx, location.as_ref()); - bx.codegen_offload_preload_mut_drop(ty, place); + bx.codegen_offload_preload_drop(ty, place, true); + } + if is_preload_type(bx.tcx(), ty) { + let place = self.codegen_place(bx, location.as_ref()); + + bx.codegen_offload_preload_drop(ty, place, false); } let drop_fn = Instance::resolve_drop_glue(bx.tcx(), ty); diff --git a/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs b/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs index c6e24cae0b000..f6e6bb0a49f80 100644 --- a/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/traits/intrinsic.rs @@ -38,10 +38,11 @@ pub trait IntrinsicCallBuilderMethods<'tcx>: BackendTypes { is_mut: bool, ); - fn codegen_offload_preload_mut_drop( + fn codegen_offload_preload_drop( &mut self, preload_ty: ty::Ty<'tcx>, place: PlaceRef<'tcx, Self::Value>, + is_mut: bool, ); fn codegen_llvm_intrinsic_call( diff --git a/library/core/src/offload/mod.rs b/library/core/src/offload/mod.rs index 5fc5bc13adc3d..942bc677c14c3 100644 --- a/library/core/src/offload/mod.rs +++ b/library/core/src/offload/mod.rs @@ -41,3 +41,13 @@ impl Drop for PreloadMut<'_, T> { // offload return mapper. } } + +impl<T: ?Sized> Drop for Preload<'_, T> { + fn drop(&mut self) { + // Intentionally empty. + // + // This exists so MIR creates Drop terminators for Preload. + // rustc codegen intercepts those terminators and emits the + // offload return mapper. 
+ } +} From 2d429ff43c93a509d2ace12a3e73a81c077a72be Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 May 2026 18:59:02 -0700 Subject: [PATCH 4/5] wip test --- .../gpu_offload/explicit_memtransfer.rs | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs diff --git a/tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs b/tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs new file mode 100644 index 0000000000000..d9ef23851fb97 --- /dev/null +++ b/tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs @@ -0,0 +1,23 @@ +#![feature(abi_gpu_kernel, gpu_offload, offload)] +#![no_std] + +use core::offload::offload::*; + +#[cfg(target_os = "linux")] +#[unsafe(no_mangle)] +fn main() { + //println!("Hello, world!"); + let mut x = [1234.0f64; 256]; + let p: PreloadMut<[f64; 256]> = preload_mut(&mut x); + core::hint::black_box(p); + let y = [1234.0f64; 128]; + let q: Preload<[f64; 128]> = preload(&y); + core::hint::black_box(q); +} + +use core::offload::offload_kernel; + +//#[offload_kernel] +//fn foo(a: &[f32], b: &[f32], c: *mut f32) { +// unsafe { *c = a[0] + b[0] }; +//} From 555b339761b7bfe99b67ce6d852f919b46f90a1d Mon Sep 17 00:00:00 2001 From: Manuel Drehwald Date: Sun, 31 May 2026 19:00:55 -0700 Subject: [PATCH 5/5] wip test --- tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs b/tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs index d9ef23851fb97..dd2a85f9e621e 100644 --- a/tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs +++ b/tests/codegen-llvm/gpu_offload/explicit_memtransfer.rs @@ -9,10 +9,15 @@ fn main() { //println!("Hello, world!"); let mut x = [1234.0f64; 256]; let p: PreloadMut<[f64; 256]> = preload_mut(&mut x); + // The next line does not compile + //let p2: PreloadMut<[f64; 256]> = preload_mut(&mut x); 
core::hint::black_box(p); let y = [1234.0f64; 128]; let q: Preload<[f64; 128]> = preload(&y); - core::hint::black_box(q); + let r: Preload<[f64; 128]> = preload(&y); + core::hint::black_box(&q); + core::hint::black_box(&r); + core::hint::black_box(&q); } use core::offload::offload_kernel;