188 commits
e47ff1a
Add env variable to use different mmu worker locations
tom-kuchler Feb 1, 2024
986ca5a
fix typo
tom-kuchler Feb 2, 2024
4053de8
Temporarily remove cold starts for testing
tom-kuchler Feb 7, 2024
263f6b7
Temporarily remove cold starts for testing
tom-kuchler Feb 7, 2024
0ce5f91
Fixed bug where interface would not add content for empty output sets…
tom-kuchler Feb 8, 2024
66ad455
Fix overcounting of items in composition sets
tom-kuchler Feb 9, 2024
de6c03d
add a function name registrar to the registry
utaal Feb 9, 2024
f183945
wip
utaal-b Feb 9, 2024
f883d39
Merge remote-tracking branch 'origin/integrate-dparser' into dev/expe…
tom-kuchler Feb 9, 2024
8bbe302
removed need for copy on try_send
tom-kuchler Feb 14, 2024
cdcd636
wip promise
tom-kuchler Feb 20, 2024
de49c19
Promise working, mmu and wasm pass machine tests
tom-kuchler Feb 20, 2024
3475f6f
Adapt system driver tests and hyper engine to new interface
tom-kuchler Feb 21, 2024
2242906
Switch away from the actual engine struct as it has become obsolete
tom-kuchler Feb 22, 2024
ac3e5e1
Switched engine argument get to allow promises to be generated outsid…
tom-kuchler Feb 23, 2024
55aef8c
Add new queueing to dispatcher
tom-kuchler Feb 23, 2024
5c2dbb1
Remove unnecessary lock from sender in queue
tom-kuchler Feb 26, 2024
c384b54
Added check for debt to still be valid when picked up
tom-kuchler Feb 27, 2024
6ff41da
Change transfers to also use the workqueues
tom-kuchler Feb 28, 2024
406b736
Merge branch 'main' into dev/experiment_cleanup
ellerre Mar 1, 2024
fcb36c7
Add composition to registry, update related functions, specific types…
tom-kuchler Mar 5, 2024
04985f8
Updated server http interface to allow registering functions at runtime
tom-kuchler Mar 6, 2024
a534887
Merge branch 'dev/experiment_cleanup' of github.com:eth-easl/dandelio…
tom-kuchler Mar 7, 2024
83f0d47
Added automatic adding of system functions
tom-kuchler Mar 7, 2024
cb5d910
bunch of incomplete things to get going
jonathanplsmith Mar 8, 2024
d1e2409
basic toy kernel launching
jonathanplsmith Mar 8, 2024
b986d63
launch basic kernel from module
jonathanplsmith Mar 11, 2024
37890de
Fix composition parsing for system functions
tom-kuchler Mar 11, 2024
1eb4fc8
two kernels; allocation
jonathanplsmith Mar 11, 2024
97dbc05
better Module type
jonathanplsmith Mar 11, 2024
8c2cc17
major overhaul to make code more idiomatic
jonathanplsmith Mar 11, 2024
bffbd5f
small thing
jonathanplsmith Mar 13, 2024
f289116
Merge branch 'dev/experiment_cleanup' into dev/gpu
jonathanplsmith Mar 13, 2024
273bb9f
MVP parse_function and improved EngineLoop.run()
jonathanplsmith Mar 15, 2024
ef312b8
Dev/timestamp recorder (#42)
tom-kuchler Mar 15, 2024
f85b8e8
Function.load() implemented
jonathanplsmith Mar 15, 2024
7fa820b
argument consumption in gpu_run()
jonathanplsmith Mar 15, 2024
4522be7
Clean up timestamping to be feature flag
tom-kuchler Mar 18, 2024
2346f13
moving inputs onto GPU
jonathanplsmith Mar 18, 2024
f14d5d1
hard coded outputs
jonathanplsmith Mar 19, 2024
2b168a6
Change timestamping to accommodate dynamic spans better
tom-kuchler Mar 20, 2024
17eb2f3
Fix summary formatting
tom-kuchler Mar 20, 2024
57b9b98
non hard-coded outputs
jonathanplsmith Mar 20, 2024
9b15740
gpu inputs/outputs tests
jonathanplsmith Mar 20, 2024
d948a9b
Blueprint parsing (still need to improve launch config)
jonathanplsmith Mar 21, 2024
d747a0b
Switched to crossbeam channel for work queue
tom-kuchler Mar 22, 2024
f1b77ca
Moved timestamps to be closer to enqueuing
tom-kuchler Mar 22, 2024
0b0fd76
Fixed clone issue with recorder
tom-kuchler Mar 22, 2024
4145430
launch config sizing and size_sweep test
jonathanplsmith Mar 22, 2024
6a9d35a
Detect physical cores and jump over hyper threads in resource allocation
tom-kuchler Mar 22, 2024
50b9b1d
Update cheri engine for the new queueing
richardlee159 Mar 24, 2024
a0fbcc5
Decrease poll time
tom-kuchler Mar 25, 2024
4c42c61
Merge branch 'dev/experiment_cleanup' of github.com:eth-easl/dandelio…
tom-kuchler Mar 25, 2024
db3096e
feature cleanup
jonathanplsmith Mar 25, 2024
12a36fa
buffer pool
jonathanplsmith Mar 25, 2024
841270d
refactoring I
jonathanplsmith Mar 25, 2024
2e92892
refactoring II
jonathanplsmith Mar 25, 2024
7a07262
Add busy polling on taking work from queue
tom-kuchler Mar 25, 2024
cd1d195
Fix panic on full queue to become an error
tom-kuchler Mar 26, 2024
5d276ff
Add option to limit total CPUs and update recording
tom-kuchler Mar 27, 2024
5895224
buffer pool using contiguous region
jonathanplsmith Mar 27, 2024
2a81e33
simple benchmarks
jonathanplsmith Mar 28, 2024
4332e27
Moved matrix multiplication to receiving data
tom-kuchler Mar 30, 2024
fafb745
Update hyper for dandelion server
tom-kuchler Apr 2, 2024
9a89010
Change core initialization to warn in case of hyperthreading, handlin…
tom-kuchler Apr 2, 2024
1d85061
temporarily disable resource allocation for hyper engine
tom-kuchler Apr 3, 2024
c24dc87
Adding check to remove shared memory files on shutdown
tom-kuchler Apr 4, 2024
30d609b
Move env parsing into separate module file
tom-kuchler Apr 4, 2024
60935fd
Separated parsing to make it queueable on work queue
tom-kuchler Apr 8, 2024
012d95c
Change to send matrix in response, switch to new response layout
tom-kuchler Apr 10, 2024
654ca30
process pool
jonathanplsmith Apr 15, 2024
21e082d
Change hyper io to reqwest on hyper update removing client, Add hyper…
tom-kuchler Apr 16, 2024
dd20076
Make hyper context into bytes context. Add transfer functions bytes t…
tom-kuchler Apr 16, 2024
3d6446d
GPU can be selected everywhere
jonathanplsmith Apr 18, 2024
2217990
Add get_chunk_ref to context interface to make reading easier
tom-kuchler Apr 19, 2024
6dd9285
Try to make serde serialize directly from contexts
tom-kuchler Apr 19, 2024
de89684
Loading module from Context
jonathanplsmith Apr 20, 2024
35b5289
add gpu engines to server
jonathanplsmith Apr 22, 2024
e2a1457
submodule via ssh
jonathanplsmith Apr 25, 2024
98029de
reworked function config and added new features for benchmarking
jonathanplsmith Apr 29, 2024
6c221ef
inference test and updated single thread engine
jonathanplsmith Apr 29, 2024
0308e92
Fix serialization
tom-kuchler Apr 29, 2024
4cc3890
added loops and updated benchmark test
jonathanplsmith Apr 29, 2024
9c9664b
push for benchmarks
jonathanplsmith May 1, 2024
38c887c
adapt server for gpu
jonathanplsmith May 8, 2024
a1fc882
Prepare server for benchmarking, fix multi-GPU bugs
jonathanplsmith May 14, 2024
6f28d04
Merge branch 'dev/experiment_cleanup' into dev/gpu -- all machine int…
jonathanplsmith May 14, 2024
c581680
fix server for non-GPU engines
jonathanplsmith May 14, 2024
c4feeb4
Update ioscale and compute to work with new interface
tom-kuchler May 14, 2024
2eeb043
Updates before merge
jonathanplsmith May 15, 2024
9a54fe9
Merge branch 'dev/experiment_cleanup' into dev/gpu, fix GPU inputs fo…
jonathanplsmith May 15, 2024
676f2b8
update composition with GPU inputs
jonathanplsmith May 15, 2024
2cef032
split GpuDriver into process/thread variants
jonathanplsmith May 17, 2024
1262427
Add independent scaling of frontend
tom-kuchler May 21, 2024
49d1103
server prep for inference benchmark
jonathanplsmith May 22, 2024
89129db
debug out of memory error
jonathanplsmith May 22, 2024
3e3c044
manually enable debugging
jonathanplsmith May 22, 2024
34c0d9e
Added sanity check on resource allocation
tom-kuchler May 22, 2024
18bbe51
fix OutOfMemory bug and add extra test
jonathanplsmith May 22, 2024
1d66820
zero buffers for safety
jonathanplsmith May 22, 2024
c38cc39
add endpoint for matmul with loading / storing matrix via io function
tom-kuchler May 24, 2024
978bd9f
Merge branch 'dev/experiment_cleanup' into dev/gpu
jonathanplsmith May 24, 2024
cfa737f
merge
jonathanplsmith May 24, 2024
566b1a1
typechecker
jonathanplsmith May 24, 2024
66b9e11
remove unnecessary thread switch
jonathanplsmith May 25, 2024
e515bd8
let workers use more cores
jonathanplsmith May 25, 2024
476c9b8
try two cores for gpu_process runtime
jonathanplsmith May 27, 2024
363a206
Revert "try two cores for gpu_process runtime"
jonathanplsmith May 27, 2024
26ebe03
measure serialisation impact
jonathanplsmith May 28, 2024
2b2f186
move blueprint serialisation off hot path
jonathanplsmith May 28, 2024
6314429
Revert last two commits
jonathanplsmith May 29, 2024
0b04b62
Refactor, add comments, improve logic
jonathanplsmith May 29, 2024
84ba4d2
do transfer in spawn_blocking
jonathanplsmith May 29, 2024
e53c2a0
Revert "do transfer in spawn_blocking"
jonathanplsmith May 30, 2024
8607539
do transfers in new task
jonathanplsmith May 30, 2024
1e50d57
yielding?
jonathanplsmith Jun 2, 2024
20da500
loads of tasks and loads of cores
jonathanplsmith Jun 2, 2024
ae53b13
spawn_blocking transfers
jonathanplsmith Jun 2, 2024
b40aedd
experiment with block_in_place
jonathanplsmith Jun 2, 2024
74f02dc
backpressure mitigation
jonathanplsmith Jun 2, 2024
6733a81
busy looping?
jonathanplsmith Jun 2, 2024
4035561
only use two workers
jonathanplsmith Jun 3, 2024
014034d
inference mmu
jonathanplsmith Jun 3, 2024
cbe10d2
repetitions from input
jonathanplsmith Jun 7, 2024
a8fdd5f
different hardcode
jonathanplsmith Jun 12, 2024
a488e8b
4 workers
jonathanplsmith Jun 21, 2024
b9725f3
...with inference inputs
jonathanplsmith Jun 21, 2024
4798009
16x grid
jonathanplsmith Jun 21, 2024
71bcadd
2 workers 16x grid
jonathanplsmith Jun 21, 2024
244858d
Undo 16x grid and add batched inference workload
jonathanplsmith Jun 25, 2024
90db6d3
batching size as input
jonathanplsmith Jun 26, 2024
e066681
save before merge
jonathanplsmith Jul 6, 2024
f377f92
Merge branch 'main' of https://github.com/eth-easl/dandelion into dev…
jonathanplsmith Jul 8, 2024
ad832ed
fix reqwest, QOL changes: update readme, add more descriptive errors,…
jonathanplsmith Jul 8, 2024
f09cfe0
Merge in newest main branch, update to ROCm 6.1.2
jonathanplsmith Aug 26, 2024
d707447
update required feature
jonathanplsmith Aug 26, 2024
f2267cb
Small changes based on comments
jonathanplsmith Aug 26, 2024
6b56c3a
Add import to right place
jonathanplsmith Aug 26, 2024
9c66f1b
Move write_gpu_outputs out of interface
jonathanplsmith Aug 26, 2024
4df5cb5
Give GPU its own mmap based context
jonathanplsmith Aug 26, 2024
fb79a66
Add GPU count and worker count to config
jonathanplsmith Aug 26, 2024
9fc0938
fix small error in tests
jonathanplsmith Aug 26, 2024
0128878
another small mistake
jonathanplsmith Aug 26, 2024
95f8dbf
New JSON format for GPU jobs: multiple .hsaco files supported. double…
Nov 6, 2024
1196455
addition of full model tests
Nov 17, 2024
377a3d9
tests update
Nov 18, 2024
a83c41e
update of old JSON test files to new format
Nov 18, 2024
6478358
BatchNorm now functioning
Nov 19, 2024
b430d4e
ResNet18 newly compiled + LeNet5 client tests working
Nov 28, 2024
f5edb3c
debug env var removed
Nov 28, 2024
a528afc
ResNet34 and ResNet152 work as tests. ResNet18 and ResNet34 to be mea…
CallMeRush Dec 6, 2024
f22379e
multiple batched ResNet18 added
CallMeRush Dec 8, 2024
8ef5479
ResNet34 batched added
CallMeRush Dec 10, 2024
ef8adbb
ResNet50 added
CallMeRush Dec 11, 2024
719bc0f
ResNet50 added
CallMeRush Dec 11, 2024
d954bc6
ResNet18ONNX added
CallMeRush Dec 16, 2024
29852db
merge main into dev/gpu
Feb 27, 2025
c47d501
fixed merge: GpuMemoryDomain on par with the others; basic NVidi…
CallMeRush Feb 28, 2025
d81b1e0
fix: reflect cuda changes to hip
CallMeRush Feb 28, 2025
70c0dd8
TVM compiled models and Rust tests added for CUDA
CallMeRush Mar 13, 2025
4d044d6
gpu_thread works
CallMeRush Mar 17, 2025
ad8f8b4
fix: like in the main
CallMeRush Mar 17, 2025
5b66ba6
GPU process working
CallMeRush Mar 24, 2025
43ec9d8
GpuProcess working
CallMeRush Mar 28, 2025
16d22ba
Llama 3.2 1B added
CallMeRush Apr 1, 2025
069a964
Llama KV compiled, not storing into keys/values
CallMeRush Apr 10, 2025
f1509f6
Llama KV full functional + using flexbuffers to register large functi…
CallMeRush Apr 16, 2025
4088eef
resnet + vit_b_16 models added
CallMeRush Jun 25, 2025
130603d
merging main
CallMeRush Jun 25, 2025
845e9b2
added timestamps for GPU specific operations
CallMeRush Jun 26, 2025
da44964
fixes timestamps
CallMeRush Jun 26, 2025
5e3bcda
added resnet models + synchronization after kernel calls
CallMeRush Jun 26, 2025
9b8a383
reflect dandelionExperiment changes: single ModelInference RequestType
CallMeRush Jul 2, 2025
5eb2795
partial update: all request data is contained in the GpuContext; Syst…
CallMeRush Jul 9, 2025
f12c888
create SubReadOnly, to store references to model weights + refactoring
CallMeRush Jul 14, 2025
1d92ced
weights_from_disk feature added, used to simulate the performance ove…
CallMeRush Jul 15, 2025
144feec
support other EngineQueue types, through the WorkQueue + EnqueueWork …
CallMeRush Jul 17, 2025
e292fa7
Support model weights reuse
CallMeRush Jul 28, 2025
cb1a5a7
Extend recorder to print model reuse
CallMeRush Jul 28, 2025
982caf8
Implement the GPU queue
CallMeRush Jul 28, 2025
9d5bf77
weights_from_disk bug fix
CallMeRush Aug 20, 2025
08ebe3b
implement SubBytes for GpuContext: also inputs are referenced and not…
CallMeRush Aug 20, 2025
61f69fb
feature : auto-batching implemented
CallMeRush Aug 29, 2025
44347d6
recorder : add batch information
CallMeRush Aug 29, 2025
2fb1331
gpu_id added to recorder
CallMeRush Sep 6, 2025
96f210f
gpu_id added to recorder
CallMeRush Sep 6, 2025
1fe61b2
recorder: batch size correctly recorded by all
CallMeRush Sep 11, 2025
bd5f8db
auto_batching queue improvement: actually exploit cached models
CallMeRush Sep 15, 2025
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,6 +1,6 @@
[submodule "unity/source"]
path = machine_interface/c_machine_libraries/unity/source
- url = https://github.com/ThrowTheSwitch/Unity.git
+ url = git@github.com:ThrowTheSwitch/Unity.git
[submodule "http"]
path = net/http
- url = https://github.com/cesanta/mongoose.git
+ url = git@github.com:cesanta/mongoose.git
24 changes: 24 additions & 0 deletions README.md
@@ -114,3 +114,27 @@ If you use Dandelion, please cite our paper:
# C Dependencies

For testing the C code to interact with Cheri we are using unity which is included directly in the project.

## GPU worker build

The `gpu_worker` binary required by the `gpu_process` engine is assumed to be present in the corresponding `target` directory:
```
cargo build --bin gpu_worker --features $(gpu-arch),gpu_process --target $(arch)-unknown-linux-gnu [--release]
```

Where `gpu-arch` is either `cuda` or `hip`, and `arch` is the host architecture (for example `x86_64`).
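
For example, a release build for a CUDA GPU on an x86_64 host (target triple assumed here) would be:
```
cargo build --bin gpu_worker --features cuda,gpu_process --target x86_64-unknown-linux-gnu --release
```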

Also make sure that shared memory objects are executable:
```
sudo mount -o remount,exec /dev/shm
```

### GPU worker path

To use a `gpu_worker` that is not at the original location it was built in, set the `GPU_WORKER_PATH` environment variable to point to the desired binary.
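
For example (path hypothetical):
```
export GPU_WORKER_PATH=/opt/dandelion/bin/gpu_worker
```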

## GPU engine library path
`DANDELION_LIBRARY_PATH` overrides the directory where the GPU engines look for kernel libraries. If the variable is unset, the engines look in `machine_interface/tests/libs/`.
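
For example, to point the engines at a custom kernel library directory (path hypothetical):
```
export DANDELION_LIBRARY_PATH=/opt/dandelion/libs
```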

## GPU Allocations
To prevent memory leaks, GPU kernels are disallowed from calling `malloc()`. All the memory a kernel requires must be specified in the respective config file.
2 changes: 2 additions & 0 deletions dandelion_commons/Cargo.toml
@@ -5,6 +5,8 @@ edition = "2021"

[features]
timestamp = []
reuse_weights = []
auto_batching = []

[dependencies]
hdrhist = "0.5.0"
11 changes: 11 additions & 0 deletions dandelion_commons/src/lib.rs
@@ -114,6 +114,17 @@ pub enum DandelionError {
OtherProctionError,
/// Work queue from the dispatcher to the engines is full
WorkQueueFull,
// GPU engine specific errors
/// error from HIP Runtime
HipError(String),
/// error from HIP Runtime
CudaError(String),
/// identifier used in config file was not declared before
UndeclaredIdentifier(String),
/// argument given to the FromInput sizing was out of bounds
FromInputOutOfBounds,
/// could not deserialise JSON for config
ParsingJSONError(String),
}
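
As a usage sketch, an engine can wrap a raw runtime status code into one of these variants; the `check_hip` helper and the assumption that `0` is the success code are illustrative, not part of this diff:
```rust
use dandelion_commons::DandelionError;

// Hypothetical helper: map a raw HIP status code to a DandelionError.
fn check_hip(status: i32, call: &str) -> Result<(), DandelionError> {
    if status == 0 {
        // Assumed success code of the runtime.
        Ok(())
    } else {
        Err(DandelionError::HipError(format!(
            "{} failed with status {}",
            call, status
        )))
    }
}
```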

// Implement display to be compliant with core::error::Error
196 changes: 195 additions & 1 deletion dandelion_commons/src/records.rs
@@ -1,11 +1,12 @@
use crate::FunctionId;
use core::fmt;
use std::sync::{Arc, Mutex};
use std::time::Instant;

/// Maximum usize to expect when converting a record point to a usize.
/// By setting the last variant to this value explicitly, the compiler will throw an error
/// if more variants are added without updating it, because discriminants enumerate from 0
/// and no value may be assigned twice.
-const LAST_RECORD_POINT: usize = 17;
+const LAST_RECORD_POINT: usize = 25;

#[repr(usize)]
#[derive(Clone, Copy, Debug, PartialEq)]
@@ -44,6 +45,23 @@ pub enum RecordPoint {
EngineStart,
/// End execution of the function on the engine (sync)
EngineEnd,
/// --- GPU ---
/// Start GPU inputs and buffers load (sync)
GPUTransferStart,
/// End GPU inputs and buffers load (sync)
GPUTransferEnd,
/// Start GPU kernel executions (sync)
GPUInferenceStart,
/// End GPU kernel executions (sync)
GPUInferenceEnd,
/// Start GPU output read (sync)
GPUOutputStart,
/// End GPU output read (sync)
GPUOutputEnd,
/// Start of a batched atom execution (sync)
BatchAtomStart,
/// End of a batched atom execution (sync)
BatchAtomEnd,
/// Return from execution engine (async)
FutureReturn = LAST_RECORD_POINT,
}
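
A sketch of how an engine might bracket the three GPU phases with these points, assuming `Recorder::record(&mut self, RecordPoint)` as used later in this file; the phase bodies are elided:
```rust
use dandelion_commons::records::{RecordPoint, Recorder};

// Hypothetical GPU engine snippet: open and close each span explicitly.
fn record_gpu_phases(recorder: &mut Recorder) {
    recorder.record(RecordPoint::GPUTransferStart);
    // ... copy inputs and buffers onto the device ...
    recorder.record(RecordPoint::GPUTransferEnd);

    recorder.record(RecordPoint::GPUInferenceStart);
    // ... launch the kernels ...
    recorder.record(RecordPoint::GPUInferenceEnd);

    recorder.record(RecordPoint::GPUOutputStart);
    // ... read the outputs back ...
    recorder.record(RecordPoint::GPUOutputEnd);
}
```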
@@ -165,24 +183,128 @@ impl TimestampArchive {
}
}

#[cfg(feature = "reuse_weights")]
struct ReuseWeightsArchive {
collected_gpu_cache_hit: std::sync::Mutex<Vec<bool>>,
collected_gpu_id: std::sync::Mutex<Vec<u8>>,
}

#[cfg(feature = "reuse_weights")]
impl ReuseWeightsArchive {
fn init() -> Self {
return Self {
collected_gpu_cache_hit: std::sync::Mutex::new(Vec::new()),
collected_gpu_id: std::sync::Mutex::new(Vec::new()),
};
}

fn insert(&self, new_gpu_cache_hit: bool, new_gpu_id: u8) {
let mut guard_cache = self.collected_gpu_cache_hit.lock().unwrap();
guard_cache.push(new_gpu_cache_hit);

let mut guard_gpu = self.collected_gpu_id.lock().unwrap();
guard_gpu.push(new_gpu_id);
}

fn reset(&self) {
let mut guard_cache = self.collected_gpu_cache_hit.lock().unwrap();
*guard_cache = Vec::new();

let mut guard_gpu = self.collected_gpu_id.lock().unwrap();
*guard_gpu = Vec::new();
}

fn append_gpu_info(&self, gpu_info: (bool, u8), summary: &mut String, indent: usize) {
// push self
summary.push_str(&format!(
"{}gpu_cache_hit:{}, gpu_id:{}",
"-".repeat(indent),
gpu_info.0,
gpu_info.1,
));
}

fn get_summary(&self, summary: &mut String) {
let cache_hits = self.collected_gpu_cache_hit.lock().unwrap();
let gpu_ids = self.collected_gpu_id.lock().unwrap();
for (gpu_cache_hit, gpu_id) in cache_hits.iter().zip(gpu_ids.iter()) {
let gpu_info = (*gpu_cache_hit, *gpu_id);
self.append_gpu_info(gpu_info, summary, 0);
summary.push_str("\n");
}
}
}

#[cfg(feature = "auto_batching")]
struct BatchArchive {
collected_batch_size: std::sync::Mutex<Vec<usize>>,
}

#[cfg(feature = "auto_batching")]
impl BatchArchive {
fn init() -> Self {
return Self {
collected_batch_size: std::sync::Mutex::new(Vec::new()),
};
}

fn insert(&self, new_batch_size: usize) {
let mut guard = self.collected_batch_size.lock().unwrap();
guard.push(new_batch_size);
}

fn reset(&self) {
let mut guard = self.collected_batch_size.lock().unwrap();
*guard = Vec::new();
}

fn append_batch_size(&self, batch_size: usize, summary: &mut String, indent: usize) {
// push self
summary.push_str(&format!("{}batch_size:{}", "-".repeat(indent), batch_size));
}

fn get_summary(&self, summary: &mut String) {
for recorder in self.collected_batch_size.lock().unwrap().iter() {
self.append_batch_size(*recorder, summary, 0);
summary.push_str("\n");
}
}
}

/// General implementation of recorder struct, additional functionality enabled by flags
pub struct Recorder {
#[cfg(feature = "timestamp")]
timestamps: std::sync::Arc<FunctionTimestamp>,
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: Arc<Mutex<bool>>,
#[cfg(feature = "reuse_weights")]
gpu_id: Arc<Mutex<u8>>,
#[cfg(feature = "auto_batching")]
batch_size: Arc<Mutex<usize>>,
}

impl Recorder {
pub fn new(_function_id: FunctionId, _start: Instant) -> Self {
return Self {
#[cfg(feature = "timestamp")]
timestamps: FunctionTimestamp::new(_function_id, _start),
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: Arc::new(Mutex::new(false)),
#[cfg(feature = "reuse_weights")]
gpu_id: Arc::new(Mutex::new(u8::MAX)),
#[cfg(feature = "auto_batching")]
batch_size: Arc::new(Mutex::new(0)),
};
}

pub fn new_from_parent(_function_id: FunctionId, _parent: &Self) -> Self {
return Self {
#[cfg(feature = "timestamp")]
timestamps: FunctionTimestamp::new(_function_id, _parent.timestamps.creation),
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: Arc::new(Mutex::new(false)),
#[cfg(feature = "reuse_weights")]
gpu_id: Arc::new(Mutex::new(u8::MAX)),
#[cfg(feature = "auto_batching")]
batch_size: Arc::new(Mutex::new(0)),
};
}

@@ -191,6 +313,24 @@ impl Recorder {
self.timestamps.record(_current_point);
}

pub fn set_gpu_info(&mut self, _gpu_cache_hit: bool, _gpu_id: u8) {
#[cfg(feature = "reuse_weights")]
{
let mut gpu_cache_hit = self.gpu_cache_hit.lock().unwrap();
*gpu_cache_hit = _gpu_cache_hit;
let mut gpu_id = self.gpu_id.lock().unwrap();
*gpu_id = _gpu_id;
}
}

pub fn set_batch_size(&mut self, _batch_size: usize) {
#[cfg(feature = "auto_batching")]
{
let mut batch_size = self.batch_size.lock().unwrap();
*batch_size = _batch_size;
}
}
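
Usage sketch for these setters, continuing the hypothetical engine snippet above (values illustrative):
```rust
fn report_gpu_metadata(recorder: &mut Recorder) {
    // Weights were already resident on device 0; request served in a batch of 8.
    recorder.set_gpu_info(true, 0);
    recorder.set_batch_size(8);
}
```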

pub fn add_children(&mut self, _new_children: Vec<Recorder>) {
#[cfg(feature = "timestamp")]
for child in _new_children {
@@ -202,6 +342,12 @@ impl fmt::Display for Recorder {
let recorder = Recorder {
#[cfg(feature = "timestamp")]
timestamps: self.timestamps.clone(),
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: self.gpu_cache_hit.clone(),
#[cfg(feature = "reuse_weights")]
gpu_id: self.gpu_id.clone(),
#[cfg(feature = "auto_batching")]
batch_size: self.batch_size.clone(),
};
return recorder;
}
@@ -218,13 +364,39 @@ impl fmt::Display for Recorder {
}
self.timestamps.fmt(_f)?;
}
#[cfg(feature = "reuse_weights")]
{
if std::sync::Arc::strong_count(&self.gpu_cache_hit) != 1
&& std::sync::Arc::weak_count(&self.gpu_cache_hit) != 0
{
panic!("Trying to format recorder that still has more than one reference");
}
#[cfg(feature = "timestamp")]
write!(_f, ",")?;
write!(_f, " gpu_cache_hit: {}, gpu_id: {}", self.gpu_cache_hit.lock().unwrap(), self.gpu_id.lock().unwrap())?;
}
#[cfg(feature = "auto_batching")]
{
if std::sync::Arc::strong_count(&self.batch_size) != 1
&& std::sync::Arc::weak_count(&self.batch_size) != 0
{
panic!("Trying to format recorder that still has more than one reference");
}
#[cfg(feature = "timestamp")]
write!(_f, ",")?;
write!(_f, " batch_size: {}", self.batch_size.lock().unwrap())?;
}
Ok(())
}
}

pub struct Archive {
#[cfg(feature = "timestamp")]
timestamp_archive: TimestampArchive,
#[cfg(feature = "reuse_weights")]
gpu_info_archive: ReuseWeightsArchive,
#[cfg(feature = "auto_batching")]
batch_archive: BatchArchive,
}

pub struct ArchiveInit {
Expand All @@ -237,13 +409,26 @@ impl Archive {
return Archive {
#[cfg(feature = "timestamp")]
timestamp_archive: TimestampArchive::init(),
#[cfg(feature = "reuse_weights")]
gpu_info_archive: ReuseWeightsArchive::init(),
#[cfg(feature = "auto_batching")]
batch_archive: BatchArchive::init(),
};
}

pub fn insert_recorder(&self, _recorder: Recorder) {
#[cfg(feature = "timestamp")]
self.timestamp_archive
.insert(std::sync::Arc::into_inner(_recorder.timestamps).unwrap());
#[cfg(feature = "reuse_weights")]
self.gpu_info_archive.insert(
std::sync::Arc::into_inner((*_recorder.gpu_cache_hit.lock().unwrap()).into()).unwrap(),
std::sync::Arc::into_inner((*_recorder.gpu_id.lock().unwrap()).into()).unwrap(),
);
#[cfg(feature = "auto_batching")]
self.batch_archive.insert(
std::sync::Arc::into_inner((*_recorder.batch_size.lock().unwrap()).into()).unwrap(),
);
}

pub fn get_summary(&self) -> String {
@@ -252,11 +437,20 @@ impl Archive {
let mut summary = String::new();
#[cfg(feature = "timestamp")]
self.timestamp_archive.get_summary(&mut summary);
#[cfg(feature = "reuse_weights")]
self.gpu_info_archive.get_summary(&mut summary);
#[cfg(feature = "auto_batching")]
self.batch_archive.get_summary(&mut summary);
println!("{}", summary);
return summary;
}

pub fn reset(&self) {
#[cfg(feature = "timestamp")]
self.timestamp_archive.reset();
#[cfg(feature = "reuse_weights")]
self.gpu_info_archive.reset();
#[cfg(feature = "auto_batching")]
self.batch_archive.reset();
}
}
5 changes: 4 additions & 1 deletion dispatcher/Cargo.toml
@@ -22,4 +22,7 @@ wasm = ["machine_interface/wasm"]
mmu = ["machine_interface/mmu"]
kvm = ["machine_interface/kvm"]
reqwest_io = ["machine_interface/reqwest_io"]
timestamp = ["dandelion_commons/timestamp"]
timestamp = ["dandelion_commons/timestamp"]
gpu = ["machine_interface/gpu"]
gpu_queue = []
auto_batching = []
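
For example, a dispatcher build exercising the GPU path with batching and timing might enable (feature combination assumed, not a documented target):
```
cargo build -p dispatcher --features gpu,gpu_queue,auto_batching,timestamp
```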
22 changes: 22 additions & 0 deletions dispatcher/src/composition.rs
@@ -400,6 +400,28 @@ impl From<(usize, Vec<Arc<Context>>)> for CompositionSet {
}
}

// TODO : is there a better way?
#[cfg(feature = "auto_batching")]
use machine_interface::function_driver::AtomInputs;
#[cfg(feature = "auto_batching")]
impl From<CompositionSet> for AtomInputs {
fn from(set: CompositionSet) -> AtomInputs {
AtomInputs {
item_list: set.item_list,
set_index: set.set_index,
}
}
}
#[cfg(feature = "auto_batching")]
impl From<AtomInputs> for CompositionSet {
fn from(atom_inputs: AtomInputs) -> CompositionSet {
CompositionSet {
item_list: atom_inputs.item_list,
set_index: atom_inputs.set_index,
}
}
}
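
A minimal sketch of the round trip these conversions enable when the batching queue hands a set to an engine and later reassembles it:
```rust
#[cfg(feature = "auto_batching")]
fn roundtrip(set: CompositionSet) -> CompositionSet {
    // CompositionSet -> AtomInputs (engine side) and back again, unchanged.
    let atom: AtomInputs = set.into();
    CompositionSet::from(atom)
}
```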

pub struct CompositionSetTransferIterator<'origin> {
/// set for which this iterator is implemented
set_iterator: std::slice::Iter<'origin, (u32, usize, Arc<Context>)>,
Expand Down