Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
d3e4fa7
start moving gpu program building to the correct earlier step
ilhamv Feb 12, 2026
28bbd25
Add global module parameter in GPU program builder
ilhamv Feb 13, 2026
b76beb2
fix typo
ilhamv Feb 13, 2026
6dda917
minor comments
ilhamv Feb 13, 2026
8604a09
Merge branch 'CEMeNT-PSAAP:dev' into gpu_fix
ilhamv Feb 22, 2026
792ced5
fix data shape and gpu particle signature in program builder
ilhamv Feb 22, 2026
2447d82
fix source isotropy flagging
ilhamv Feb 25, 2026
f8f0951
add new surface - general cylinder
melekderman Feb 17, 2026
bbf1e6b
back in black
melekderman Feb 17, 2026
85cd32d
minor edits
ilhamv Feb 19, 2026
842f88b
define ConeX, ConeY, ConeZ and general cylinder using quadric
melekderman Feb 19, 2026
d7322dc
back in black
melekderman Feb 19, 2026
abbdec7
rename mcdc -> simulation
ilhamv Feb 26, 2026
d27f1cf
Merge branch 'dev' into gpu_fix
ilhamv Mar 7, 2026
ed25662
fix typo on simulationPy
ilhamv Mar 7, 2026
c487803
attempt to focus on alloc_device_bytes
ilhamv Mar 9, 2026
c6a1ccb
organize
ilhamv Mar 9, 2026
79910cc
clean up gpu forward declare
ilhamv Mar 9, 2026
18213f3
clean up prints
ilhamv Mar 9, 2026
cc25d3d
refactor gpu builder
ilhamv Mar 9, 2026
54570ae
implement gpu function adapter
ilhamv Mar 10, 2026
34eac81
minor update. implement local array
ilhamv Mar 10, 2026
6f07c98
refactor find bin to be more explicit
ilhamv Mar 10, 2026
303f728
remove old find_bin call with default values
ilhamv Mar 10, 2026
9634f20
replace array function return with direct array slicing
ilhamv Mar 11, 2026
5ab5342
minor clean up
ilhamv Mar 11, 2026
444a572
replace particles with particle_data in particle banks
ilhamv Mar 11, 2026
efc63a9
fix transport function adapt
ilhamv Mar 11, 2026
60031b5
add gpu_strategy to settings
ilhamv Mar 11, 2026
f788e21
redesign literals to work on gpu mode
ilhamv Mar 11, 2026
2c0825b
make surface move gpu-compatible
ilhamv Mar 11, 2026
bdce631
redesign neutron physics model selection
ilhamv Mar 12, 2026
1c0d1f8
replace mcdc_get*_all and *_vector with explicit slicing for GPU comp…
ilhamv Mar 12, 2026
c59d01e
good progress. identified an issue in find_cell_async
ilhamv Mar 13, 2026
1c99e31
gpu setup done
ilhamv Mar 16, 2026
c205772
rename variables in transport/source
ilhamv Mar 16, 2026
73cc227
reorganize gpu mode parameters
ilhamv Mar 16, 2026
e58f285
update gpu transport source loop
ilhamv Mar 16, 2026
aef48b4
working implementation
ilhamv Mar 17, 2026
f151505
improve literals
ilhamv Mar 17, 2026
054b1db
cleanup debug
ilhamv Mar 17, 2026
b7971d8
in the middle of debugging bank management
ilhamv Mar 18, 2026
06d2a0c
replace simulation with program in particle bank adds
ilhamv Apr 6, 2026
0b79a1b
combine transport module adaptors
ilhamv Apr 8, 2026
7b08ebc
separate bank size increment from bank-particle-adding function. Sour…
ilhamv Apr 8, 2026
f4dfed1
debugging
ilhamv Apr 8, 2026
a30f0d8
fix byte size assignment in code factory
ilhamv Apr 10, 2026
d23292f
remove debug prints
ilhamv Apr 10, 2026
28e171d
Merge branch 'dev' into gpu_fix
ilhamv Apr 10, 2026
0a0c89e
back in black
ilhamv Apr 10, 2026
2ab73cc
Merge branch 'dev' into gpu_fix
ilhamv Apr 10, 2026
2dde778
fix minor bugs
ilhamv Apr 10, 2026
6f17476
fix misplaced scaling
ilhamv Apr 10, 2026
7678a8f
get all rank to generate literals
ilhamv Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
613 changes: 0 additions & 613 deletions mcdc/code_factory/gpu/adapt.py

This file was deleted.

340 changes: 237 additions & 103 deletions mcdc/code_factory/gpu/program_builder.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions mcdc/code_factory/gpu/transport/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import mcdc.code_factory.gpu.transport.geometry as geometry
import mcdc.code_factory.gpu.transport.particle_bank as particle_bank
import mcdc.code_factory.gpu.transport.simulation as simulation
import mcdc.code_factory.gpu.transport.util as util
1 change: 1 addition & 0 deletions mcdc/code_factory/gpu/transport/geometry/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
import mcdc.code_factory.gpu.transport.geometry.interface as interface
2 changes: 1 addition & 1 deletion mcdc/code_factory/gpu/transport/geometry/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@


@njit
def report_lost_particle(particle_container, mcdc):
def report_lost_particle(particle_container, simulation):
particle = particle_container[0]
particle["alive"] = False
25 changes: 19 additions & 6 deletions mcdc/code_factory/gpu/transport/particle_bank.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
from numba import njit

###

import mcdc.numba_types as type_
import mcdc.transport.particle as particle_module
import mcdc.transport.util as util
import mcdc.code_factory.gpu.program_builder as gpu_program

from mcdc.constant import GPU_ASYNC_SIMPLE

# =============================================================================
# Bank and pop particle
# =============================================================================


@njit
def bank_active_particle(P_rec_arr, mcdc):
particle_container = local_array(1, type_.particle)
kernel.recordlike_to_particle(particle_container, P_rec_arr)
if SIMPLE_ASYNC:
step_async(prog, particle_container[0])
def bank_active_particle(particle_container, program):
simulation = util.access_simulation(program)

active_particle_container = util.local_array(1, type_.particle)
particle_module.copy(active_particle_container, particle_container)
if simulation["settings"]["gpu_async_type"] == GPU_ASYNC_SIMPLE:
gpu_program.step_async(program, active_particle_container[0])
"""
else:
find_cell_async(prog, particle_container[0])
gpu_program.find_cell_async(program, active_particle_container[0])
"""


@njit
Expand Down
302 changes: 50 additions & 252 deletions mcdc/code_factory/gpu/transport/simulation.py
Original file line number Diff line number Diff line change
@@ -1,287 +1,85 @@
from mpi4py import MPI
import mcdc.code_factory.gpu.adapt as adapt
import harmonize

caching = config.caching

# =============================================================================
# Functions for GPU Interop
# =============================================================================

# The symbols declared below will be overwritten to reference external code that
# manages GPU execution (if GPU execution is supported and selected)
alloc_state, free_state = [None] * 2

src_alloc_program, src_free_program = [None] * 2
(
src_load_global,
src_load_constant,
src_store_global,
src_store_data,
src_store_pointer_data,
) = [None] * 5
src_init_program, src_exec_program, src_complete, src_clear_flags = [None] * 4

pre_alloc_program, pre_free_program = [None] * 2
pre_load_global, pre_load_data, pre_store_global, pre_store_data = [None] * 4
pre_init_program, pre_exec_program, pre_complete, pre_clear_flags = [None] * 4


# If GPU execution is supported and selected, the functions shown below will
# be redefined to overwrite the above symbols and perform initialization/
# finalization of GPU state
@njit
def setup_gpu(mcdc, data_tally):
pass


@njit
def teardown_gpu(mcdc):
pass


def gpu_sources_spec():
def make_work(prog: nb.uintp) -> nb.boolean:
mcdc = adapt.mcdc_global(prog)

atomic_add(mcdc["mpi_work_iter"], 0, 1)
idx_work = mcdc["mpi_work_iter"][0]

if idx_work >= mcdc["mpi_work_size"]:
return False

generate_source_particle(
mcdc["mpi_work_start"], nb.uint64(idx_work), mcdc["source_seed"], prog
)
return True

def initialize(prog: nb.uintp):
pass
from numba import njit

def finalize(prog: nb.uintp):
pass
###

base_fns = (initialize, finalize, make_work)
import mcdc.code_factory.gpu.program_builder as gpu_module
import mcdc.config as config
import mcdc.transport.particle_bank as particle_bank_module

def step(prog: nb.uintp, P_input: adapt.particle_gpu):
mcdc = adapt.mcdc_global(prog)
data = adapt.mcdc_data(prog)
particle_container = np.zeros(1, type_.particle)
particle_container[0] = P_input
particle = particle_container[0]
if particle["fresh"]:
prep_particle(particle_container, prog)
particle["fresh"] = False
step_particle(particle_container, data, prog)
if particle["alive"]:
adapt.step_async(prog, P)
from mcdc.constant import GPU_STORAGE_SEPARATE, GPU_STRATEGY_ASYNC
from mcdc.transport.simulation import source_closeout

async_fns = [step]
return adapt.harm.RuntimeSpec("mcdc_source", adapt.state_spec, base_fns, async_fns)


BLOCK_COUNT = config.args.gpu_block_count

ASYNC_EXECUTION = config.args.gpu_strategy == "async"
caching = config.caching


@njit(cache=caching)
def gpu_loop_source(seed, data, mcdc):

# Progress bar indicator
N_prog = 0

if mcdc["technique"]["domain_decomposition"]:
particle_bank_module.dd_check_in(mcdc)

# =====================================================================
# GPU Interop
# =====================================================================

def source_loop(seed, simulation, data):
# For async execution
iter_count = 655360000
# For event-based execution
batch_size = 1
batch_size = 64

full_work_size = mcdc["mpi_work_size"]
if ASYNC_EXECUTION:
settings = simulation["settings"]

full_work_size = simulation["mpi_work_size"]

if settings["gpu_strategy"] == GPU_STRATEGY_ASYNC:
phase_size = 1000000000
else:
phase_size = 1000000
phase_count = (full_work_size + phase_size - 1) // phase_size

for phase in range(phase_count):

mcdc["mpi_work_iter"][0] = phase_size * phase
mcdc["mpi_work_size"] = min(phase_size * (phase + 1), full_work_size)
mcdc["source_seed"] = seed
simulation["mpi_work_iter"][0] = phase_size * phase
simulation["mpi_work_size"] = min(phase_size * (phase + 1), full_work_size)
simulation["source_seed"] = seed

# Store the global state to the GPU
src_store_constant(mcdc["gpu_state_pointer"], mcdc)
src_store_data(mcdc["gpu_state_pointer"], data)
if settings["gpu_storage"] == GPU_STORAGE_SEPARATE:
harmonize.memcpy_host_to_device(
simulation["gpu_meta"]["state_pointer"], simulation
)
harmonize.memcpy_host_to_device(
simulation["gpu_meta"]["state_pointer"], data
)

# Execute the program, and continue to do so until it is done
if ASYNC_EXECUTION:
src_exec_program(mcdc["source_program_pointer"], BLOCK_COUNT, iter_count)
while not src_complete(mcdc["source_program_pointer"]):
particle_bank_module.dd_particle_send(mcdc)
src_exec_program(
mcdc["source_program_pointer"], BLOCK_COUNT, iter_count
block_count = gpu_module.BLOCK_COUNT

if settings["gpu_strategy"] == GPU_STRATEGY_ASYNC:
gpu_module.exec_program(
simulation["gpu_meta"]["program_pointer"], block_count, iter_count
)
while not gpu_module.complete(simulation["gpu_meta"]["program_pointer"]):
gpu_module.exec_program(
simulation["gpu_meta"]["program_pointer"], block_count, iter_count
)
else:
src_exec_program(mcdc["source_program_pointer"], BLOCK_COUNT, batch_size)
while not src_complete(mcdc["source_program_pointer"]):
particle_bank_module.dd_particle_send(mcdc)
src_exec_program(
mcdc["source_program_pointer"], BLOCK_COUNT, batch_size
gpu_module.exec_program(
simulation["gpu_meta"]["program_pointer"], block_count, batch_size
)
while not gpu_module.complete(simulation["gpu_meta"]["program_pointer"]):
gpu_module.exec_program(
simulation["gpu_meta"]["program_pointer"], block_count, batch_size
)
gpu_module.clear_flags(simulation["gpu_meta"]["program_pointer"])

# Recover the original program state
src_load_constant(mcdc, mcdc["gpu_state_pointer"])
src_load_data(data, mcdc["gpu_state_pointer"])
src_clear_flags(mcdc["source_program_pointer"])

mcdc["mpi_work_size"] = full_work_size

particle_bank_module.set_bank_size(mcdc["bank_active"], 0)

# =====================================================================
# Closeout (Moved out of the typical particle loop)
# =====================================================================

source_closeout(mcdc, 1, 1, data)

if mcdc["technique"]["domain_decomposition"]:
source_dd_resolution(data, mcdc)


def build_gpu_progs(input_deck, args):

STRAT = args.gpu_strategy

src_spec = gpu_sources_spec()

adapt.harm.RuntimeSpec.bind_specs()

rank = MPI.COMM_WORLD.Get_rank()
device_id = rank % args.gpu_share_stride

if MPI.COMM_WORLD.Get_size() > 1:
MPI.COMM_WORLD.Barrier()

adapt.harm.RuntimeSpec.load_specs()

if STRAT == "async":
args.gpu_arena_size = args.gpu_arena_size // 32
src_fns = src_spec.async_functions()
pre_fns = pre_spec.async_functions()
else:
src_fns = src_spec.event_functions()
pre_fns = pre_spec.event_functions()

ARENA_SIZE = args.gpu_arena_size
BLOCK_COUNT = args.gpu_block_count

global alloc_state, free_state
alloc_state = src_fns["alloc_state"]
free_state = src_fns["free_state"]

global src_alloc_program, src_free_program
global src_load_global, src_store_global, src_load_data, src_store_data, src_store_pointer_data
global src_init_program, src_exec_program, src_complete, src_clear_flags
src_alloc_program = src_fns["alloc_program"]
src_free_program = src_fns["free_program"]
src_load_global = src_fns["load_state_device_global"]
src_store_global = src_fns["store_state_device_global"]
src_store_pointer_global = src_fns["store_pointer_state_device_global"]
src_load_data = src_fns["load_state_device_data"]
src_store_data = src_fns["store_state_device_data"]
src_store_pointer_data = src_fns["store_pointer_state_device_data"]
src_init_program = src_fns["init_program"]
src_exec_program = src_fns["exec_program"]
src_complete = src_fns["complete"]
src_clear_flags = src_fns["clear_flags"]
src_set_device = src_fns["set_device"]

global pre_alloc_program, pre_free_program
global pre_load_global, pre_store_global, pre_load_data, pre_store_data
global pre_init_program, pre_exec_program, pre_complete, pre_clear_flags
pre_alloc_state = pre_fns["alloc_state"]
pre_free_state = pre_fns["free_state"]
pre_alloc_program = pre_fns["alloc_program"]
pre_free_program = pre_fns["free_program"]
pre_load_global = pre_fns["load_state_device_global"]
pre_store_global = pre_fns["store_state_device_global"]
pre_load_data = pre_fns["load_state_device_data"]
pre_store_data = pre_fns["store_state_device_data"]
pre_init_program = pre_fns["init_program"]
pre_exec_program = pre_fns["exec_program"]
pre_complete = pre_fns["complete"]
pre_clear_flags = pre_fns["clear_flags"]

@njit
def real_setup_gpu(mcdc_array, data_tally):
mcdc = mcdc_array[0]
src_set_device(device_id)
arena_size = ARENA_SIZE
mcdc["gpu_meta"]["state_pointer"] = adapt.cast_voidptr_to_uintp(alloc_state())
# src_store_global(mcdc["gpu_meta"]["state_pointer"], mcdc_array[0])
if config.gpu_state_storage == "separate":
src_store_pointer_global(
mcdc["gpu_meta"]["state_pointer"], mcdc["gpu_meta"]["global_pointer"]
harmonize.memcpy_device_to_host(
simulation, simulation["gpu_meta"]["state_pointer"]
)
src_store_pointer_data(
mcdc["gpu_meta"]["state_pointer"], mcdc["gpu_meta"]["tally_pointer"]
harmonize.memcpy_device_to_host(
data, simulation["gpu_meta"]["state_pointer"]
)
else:
src_store_pointer_global(mcdc["gpu_meta"]["state_pointer"], mcdc_array)
src_store_pointer_data(mcdc["gpu_meta"]["state_pointer"], data_tally)

mcdc["gpu_meta"]["source_program_pointer"] = adapt.cast_voidptr_to_uintp(
src_alloc_program(mcdc["gpu_meta"]["state_pointer"], ARENA_SIZE)
)
src_init_program(mcdc["gpu_meta"]["source_program_pointer"], BLOCK_COUNT)
return

@njit
def real_teardown_gpu(mcdc):
src_free_program(
adapt.cast_uintp_to_voidptr(mcdc["gpu_meta"]["source_program_pointer"])
)
free_state(adapt.cast_uintp_to_voidptr(mcdc["gpu_meta"]["state_pointer"]))

global setup_gpu, teardown_gpu
setup_gpu = real_setup_gpu
teardown_gpu = real_teardown_gpu

global loop_source
loop_source = gpu_loop_source


# =============================================================================
# Functions for GPU Interop
# =============================================================================

# The symbols declared below will be overwritten to reference external code that
# manages GPU execution (if GPU execution is supported and selected)
alloc_state, free_state = [None] * 2

src_alloc_program, src_free_program = [None] * 2
src_load_constant, src_load_constant, src_store_constant, src_store_data = [None] * 4
src_init_program, src_exec_program, src_complete, src_clear_flags = [None] * 4

pre_alloc_program, pre_free_program = [None] * 2
pre_load_constant, pre_load_data, pre_store_constant, pre_store_data = [None] * 4
pre_init_program, pre_exec_program, pre_complete, pre_clear_flags = [None] * 4

gpu_module.clear_flags(simulation["gpu_meta"]["program_pointer"])

# If GPU execution is supported and selected, the functions shown below will
# be redefined to overwrite the above symbols and perform initialization/
# finalization of GPU state
@njit
def setup_gpu(mcdc):
pass
simulation["mpi_work_size"] = full_work_size

particle_bank_module.set_bank_size(simulation["bank_active"], 0)

@njit
def teardown_gpu(mcdc):
pass
source_closeout(simulation, 1, 1, data)
Loading
Loading