188 commits
e47ff1a
Add env variable to use different mmu worker locations
tom-kuchler Feb 1, 2024
986ca5a
fix typo
tom-kuchler Feb 2, 2024
4053de8
Temporarily remove cold starts for testing
tom-kuchler Feb 7, 2024
263f6b7
Temporarily remove cold starts for testing
tom-kuchler Feb 7, 2024
0ce5f91
Fixed bug where interface would not add content for empty output sets…
tom-kuchler Feb 8, 2024
66ad455
Fix overcounting of items in composition sets
tom-kuchler Feb 9, 2024
de6c03d
add a function name registrar to the registry
utaal Feb 9, 2024
f183945
wip
utaal-b Feb 9, 2024
f883d39
Merge remote-tracking branch 'origin/integrate-dparser' into dev/expe…
tom-kuchler Feb 9, 2024
8bbe302
removed need for copy on try_send
tom-kuchler Feb 14, 2024
cdcd636
wip promise
tom-kuchler Feb 20, 2024
de49c19
Promise working, mmu and wasm pass machine tests
tom-kuchler Feb 20, 2024
3475f6f
Adapt system driver tests and hyper engine to new interface
tom-kuchler Feb 21, 2024
2242906
Switch away from the actual engine struct as it has become obsolete
tom-kuchler Feb 22, 2024
ac3e5e1
Switched engine argument get to allow promises to be generated outsid…
tom-kuchler Feb 23, 2024
55aef8c
Add new queueing to dispatcher
tom-kuchler Feb 23, 2024
5c2dbb1
Remove unnecessary lock from sender in queue
tom-kuchler Feb 26, 2024
c384b54
Added check for debt to still be valid when picked up
tom-kuchler Feb 27, 2024
6ff41da
Change transfers to also use the workqueues
tom-kuchler Feb 28, 2024
406b736
Merge branch 'main' into dev/experiment_cleanup
ellerre Mar 1, 2024
fcb36c7
Add composition to registry, update related functions, specific types…
tom-kuchler Mar 5, 2024
04985f8
Updated server http interface to allow registering functions at runtime
tom-kuchler Mar 6, 2024
a534887
Merge branch 'dev/experiment_cleanup' of github.com:eth-easl/dandelio…
tom-kuchler Mar 7, 2024
83f0d47
Added automatic adding of system functions
tom-kuchler Mar 7, 2024
cb5d910
bunch of incomplete things to get going
jonathanplsmith Mar 8, 2024
d1e2409
basic toy kernel launching
jonathanplsmith Mar 8, 2024
b986d63
launch basic kernel from module
jonathanplsmith Mar 11, 2024
37890de
Fix composition parsing for system functions
tom-kuchler Mar 11, 2024
1eb4fc8
two kernels; allocation
jonathanplsmith Mar 11, 2024
97dbc05
better Module type
jonathanplsmith Mar 11, 2024
8c2cc17
major overhaul to make code more idiomatic
jonathanplsmith Mar 11, 2024
bffbd5f
small thing
jonathanplsmith Mar 13, 2024
f289116
Merge branch 'dev/experiment_cleanup' into dev/gpu
jonathanplsmith Mar 13, 2024
273bb9f
MVP parse_function and improved EngineLoop.run()
jonathanplsmith Mar 15, 2024
ef312b8
Dev/timestamp recorder (#42)
tom-kuchler Mar 15, 2024
f85b8e8
Function.load() implemented
jonathanplsmith Mar 15, 2024
7fa820b
argument consumption in gpu_run()
jonathanplsmith Mar 15, 2024
4522be7
Clean up timestamping to be feature flag
tom-kuchler Mar 18, 2024
2346f13
moving inputs onto GPU
jonathanplsmith Mar 18, 2024
f14d5d1
hard coded outputs
jonathanplsmith Mar 19, 2024
2b168a6
Change timestamping to accommodate dynamic spans better
tom-kuchler Mar 20, 2024
17eb2f3
Fix summary formatting
tom-kuchler Mar 20, 2024
57b9b98
non hard-coded outputs
jonathanplsmith Mar 20, 2024
9b15740
gpu inputs/outputs tests
jonathanplsmith Mar 20, 2024
d948a9b
Blueprint parsing (still need to improve launch config)
jonathanplsmith Mar 21, 2024
d747a0b
Switched to crossbeam channel for work queue
tom-kuchler Mar 22, 2024
f1b77ca
Moved timestamps to be closer to enqueuing
tom-kuchler Mar 22, 2024
0b0fd76
Fixed clone issue with recorder
tom-kuchler Mar 22, 2024
4145430
launch config sizing and size_sweep test
jonathanplsmith Mar 22, 2024
6a9d35a
Detect physical cores and jump over hyper threads in resource allocation
tom-kuchler Mar 22, 2024
50b9b1d
Update cheri engine for the new queueing
richardlee159 Mar 24, 2024
a0fbcc5
Decrease poll time
tom-kuchler Mar 25, 2024
4c42c61
Merge branch 'dev/experiment_cleanup' of github.com:eth-easl/dandelio…
tom-kuchler Mar 25, 2024
db3096e
feature cleanup
jonathanplsmith Mar 25, 2024
12a36fa
buffer pool
jonathanplsmith Mar 25, 2024
841270d
refactoring I
jonathanplsmith Mar 25, 2024
2e92892
refactoring II
jonathanplsmith Mar 25, 2024
7a07262
Add busy polling on taking work from queue
tom-kuchler Mar 25, 2024
cd1d195
Fix panic on full queue to become an error
tom-kuchler Mar 26, 2024
5d276ff
Add option to limit total CPUs and update recording
tom-kuchler Mar 27, 2024
5895224
buffer pool using contiguous region
jonathanplsmith Mar 27, 2024
2a81e33
simple benchmarks
jonathanplsmith Mar 28, 2024
4332e27
Moved matrix multiplication to receiving data
tom-kuchler Mar 30, 2024
fafb745
Update hyper for dandelion server
tom-kuchler Apr 2, 2024
9a89010
Change core initialization to warn in case of hyperthreading, handlin…
tom-kuchler Apr 2, 2024
1d85061
temporarily disable resource allocation for hyper engine
tom-kuchler Apr 3, 2024
c24dc87
Adding check to remove shared memory files on shutdown
tom-kuchler Apr 4, 2024
30d609b
Move env parsing into separate module file
tom-kuchler Apr 4, 2024
60935fd
Separated parsing to make it queueable on work queue
tom-kuchler Apr 8, 2024
012d95c
Change to send matrix in response, switch to new response layout
tom-kuchler Apr 10, 2024
654ca30
process pool
jonathanplsmith Apr 15, 2024
21e082d
Change hyper io to reqwest on hyper update removing client, Add hyper…
tom-kuchler Apr 16, 2024
dd20076
Make hyper context into bytes context. Add transfer functions bytes t…
tom-kuchler Apr 16, 2024
3d6446d
GPU can be selected everywhere
jonathanplsmith Apr 18, 2024
2217990
Add get_chunk_ref to context interface to make reading easier
tom-kuchler Apr 19, 2024
6dd9285
Try to make serde serialize directly from contexts
tom-kuchler Apr 19, 2024
de89684
Loading module from Context
jonathanplsmith Apr 20, 2024
35b5289
add gpu engines to server
jonathanplsmith Apr 22, 2024
e2a1457
submodule via ssh
jonathanplsmith Apr 25, 2024
98029de
reworked function config and added new features for benchmarking
jonathanplsmith Apr 29, 2024
6c221ef
inference test and updated single thread engine
jonathanplsmith Apr 29, 2024
0308e92
Fix serialization
tom-kuchler Apr 29, 2024
4cc3890
added loops and updated benchmark test
jonathanplsmith Apr 29, 2024
9c9664b
push for benchmarks
jonathanplsmith May 1, 2024
38c887c
adapt server for gpu
jonathanplsmith May 8, 2024
a1fc882
Prepare server for benchmarking, fix multi-GPU bugs
jonathanplsmith May 14, 2024
6f28d04
Merge branch 'dev/experiment_cleanup' into dev/gpu -- all machine int…
jonathanplsmith May 14, 2024
c581680
fix server for non-GPU engines
jonathanplsmith May 14, 2024
c4feeb4
Update ioscale and compute to work with new interface
tom-kuchler May 14, 2024
2eeb043
Updates before merge
jonathanplsmith May 15, 2024
9a54fe9
Merge branch 'dev/experiment_cleanup' into dev/gpu, fix GPU inputs fo…
jonathanplsmith May 15, 2024
676f2b8
update composition with GPU inputs
jonathanplsmith May 15, 2024
2cef032
split GpuDriver into process/thread variants
jonathanplsmith May 17, 2024
1262427
Add independent scaling of frontend
tom-kuchler May 21, 2024
49d1103
server prep for inference benchmark
jonathanplsmith May 22, 2024
89129db
debug out of memory error
jonathanplsmith May 22, 2024
3e3c044
manually enable debugging
jonathanplsmith May 22, 2024
34c0d9e
Added sanity check on resource allocation
tom-kuchler May 22, 2024
18bbe51
fix OutOfMemory bug and add extra test
jonathanplsmith May 22, 2024
1d66820
zero buffers for safety
jonathanplsmith May 22, 2024
c38cc39
add endpoint for matmul with loading / storing matrix via io function
tom-kuchler May 24, 2024
978bd9f
Merge branch 'dev/experiment_cleanup' into dev/gpu
jonathanplsmith May 24, 2024
cfa737f
merge
jonathanplsmith May 24, 2024
566b1a1
typechecker
jonathanplsmith May 24, 2024
66b9e11
remove unnecessary thread switch
jonathanplsmith May 25, 2024
e515bd8
let workers use more cores
jonathanplsmith May 25, 2024
476c9b8
try two cores for gpu_process runtime
jonathanplsmith May 27, 2024
363a206
Revert "try two cores for gpu_process runtime"
jonathanplsmith May 27, 2024
26ebe03
measure serialisation impact
jonathanplsmith May 28, 2024
2b2f186
move blueprint serialisation off hot path
jonathanplsmith May 28, 2024
6314429
Revert last two commits
jonathanplsmith May 29, 2024
0b04b62
Refactor, add comments, improve logic
jonathanplsmith May 29, 2024
84ba4d2
do transfer in spawn_blocking
jonathanplsmith May 29, 2024
e53c2a0
Revert "do transfer in spawn_blocking"
jonathanplsmith May 30, 2024
8607539
do transfers in new task
jonathanplsmith May 30, 2024
1e50d57
yielding?
jonathanplsmith Jun 2, 2024
20da500
loads of tasks and loads of cores
jonathanplsmith Jun 2, 2024
ae53b13
spawn_blocking transfers
jonathanplsmith Jun 2, 2024
b40aedd
experiment with block_in_place
jonathanplsmith Jun 2, 2024
74f02dc
backpressure mitigation
jonathanplsmith Jun 2, 2024
6733a81
busy looping?
jonathanplsmith Jun 2, 2024
4035561
only use two workers
jonathanplsmith Jun 3, 2024
014034d
inference mmu
jonathanplsmith Jun 3, 2024
cbe10d2
repetitions from input
jonathanplsmith Jun 7, 2024
a8fdd5f
different hardcode
jonathanplsmith Jun 12, 2024
a488e8b
4 workers
jonathanplsmith Jun 21, 2024
b9725f3
...with inference inputs
jonathanplsmith Jun 21, 2024
4798009
16x grid
jonathanplsmith Jun 21, 2024
71bcadd
2 workers 16x grid
jonathanplsmith Jun 21, 2024
244858d
Undo 16x grid and add batched inference workload
jonathanplsmith Jun 25, 2024
90db6d3
batching size as input
jonathanplsmith Jun 26, 2024
e066681
save before merge
jonathanplsmith Jul 6, 2024
f377f92
Merge branch 'main' of https://github.com/eth-easl/dandelion into dev…
jonathanplsmith Jul 8, 2024
ad832ed
fix reqwest, QOL changes: update readme, add more descriptive errors,…
jonathanplsmith Jul 8, 2024
f09cfe0
Merge in newest main branch, update to ROCm 6.1.2
jonathanplsmith Aug 26, 2024
d707447
update required feature
jonathanplsmith Aug 26, 2024
f2267cb
Small changes based on comments
jonathanplsmith Aug 26, 2024
6b56c3a
Add import to right place
jonathanplsmith Aug 26, 2024
9c66f1b
Move write_gpu_outputs out of interface
jonathanplsmith Aug 26, 2024
4df5cb5
Give GPU its own mmap based context
jonathanplsmith Aug 26, 2024
fb79a66
Add GPU count and worker count to config
jonathanplsmith Aug 26, 2024
9fc0938
fix small error in tests
jonathanplsmith Aug 26, 2024
0128878
another small mistake
jonathanplsmith Aug 26, 2024
95f8dbf
New JSON format for GPU jobs: multiple .hsaco files supported. double…
Nov 6, 2024
1196455
addition of full model tests
Nov 17, 2024
377a3d9
tests update
Nov 18, 2024
a83c41e
update of old JSON test files to new format
Nov 18, 2024
6478358
BatchNorm now functioning
Nov 19, 2024
b430d4e
ResNet18 newly compiled + LeNet5 client tests working
Nov 28, 2024
f5edb3c
debug env var removed
Nov 28, 2024
a528afc
ResNet34 and ResNet152 work as tests. ResNet18 and ResNet34 to be mea…
CallMeRush Dec 6, 2024
f22379e
multiple batched ResNet18 added
CallMeRush Dec 8, 2024
8ef5479
ResNet34 batched added
CallMeRush Dec 10, 2024
ef8adbb
ResNet50 added
CallMeRush Dec 11, 2024
719bc0f
ResNet50 added
CallMeRush Dec 11, 2024
d954bc6
ResNet18ONNX added
CallMeRush Dec 16, 2024
29852db
merge main into dev/gpu
Feb 27, 2025
c47d501
fixed merge: GpuMemoryDomain on par with the others; basic NVidi…
CallMeRush Feb 28, 2025
d81b1e0
fix: reflect cuda changes to hip
CallMeRush Feb 28, 2025
70c0dd8
TVM compiled models and Rust tests added for CUDA
CallMeRush Mar 13, 2025
4d044d6
gpu_thread works
CallMeRush Mar 17, 2025
ad8f8b4
fix: like in the main
CallMeRush Mar 17, 2025
5b66ba6
GPU process working
CallMeRush Mar 24, 2025
43ec9d8
GpuProcess working
CallMeRush Mar 28, 2025
16d22ba
Llama 3.2 1B added
CallMeRush Apr 1, 2025
069a964
Llama KV compiled, not storing into keys/values
CallMeRush Apr 10, 2025
f1509f6
Llama KV full functional + using flexbuffers to register large functi…
CallMeRush Apr 16, 2025
4088eef
resnet + vit_b_16 models added
CallMeRush Jun 25, 2025
130603d
merging main
CallMeRush Jun 25, 2025
845e9b2
added timestamps for GPU specific operations
CallMeRush Jun 26, 2025
da44964
fixes timestamps
CallMeRush Jun 26, 2025
5e3bcda
added resnet models + synchronization after kernel calls
CallMeRush Jun 26, 2025
9b8a383
reflect dandelionExperiment changes: single ModelInference RequestType
CallMeRush Jul 2, 2025
5eb2795
partial update: all request data is contained in the GpuContext; Syst…
CallMeRush Jul 9, 2025
f12c888
create SubReadOnly, to store references to model weights + refactoring
CallMeRush Jul 14, 2025
1d92ced
weights_from_disk feature added, used to simulate the performance ove…
CallMeRush Jul 15, 2025
144feec
support other EngineQueue types, through the WorkQueue + EnqueueWork …
CallMeRush Jul 17, 2025
e292fa7
Support model weights reuse
CallMeRush Jul 28, 2025
cb1a5a7
Extend recorder to print model reuse
CallMeRush Jul 28, 2025
982caf8
Implement the GPU queue
CallMeRush Jul 28, 2025
9d5bf77
weights_from_disk bug fix
CallMeRush Aug 20, 2025
08ebe3b
implement SubBytes for GpuContext: also inputs are referenced and not…
CallMeRush Aug 20, 2025
61f69fb
feature : auto-batching implemented
CallMeRush Aug 29, 2025
44347d6
recorder : add batch information
CallMeRush Aug 29, 2025
2fb1331
gpu_id added to recorder
CallMeRush Sep 6, 2025
96f210f
gpu_id added to recorder
CallMeRush Sep 6, 2025
1fe61b2
recorder: batch size correctly recorded by all
CallMeRush Sep 11, 2025
bd5f8db
auto_batching queue improvement: actually exploit cached models
CallMeRush Sep 15, 2025
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,6 +1,6 @@
[submodule "unity/source"]
path = machine_interface/c_machine_libraries/unity/source
- url = https://github.com/ThrowTheSwitch/Unity.git
+ url = git@github.com:ThrowTheSwitch/Unity.git
[submodule "http"]
path = net/http
- url = https://github.com/cesanta/mongoose.git
+ url = git@github.com:cesanta/mongoose.git
24 changes: 24 additions & 0 deletions README.md
@@ -114,3 +114,27 @@ If you use Dandelion, please cite our paper:
# C Dependencies

For testing the C code to interact with Cheri we are using unity which is included directly in the project.

## GPU worker build

The `gpu_worker` binary required by the `gpu_process` engine is assumed to be present in the corresponding `target` directory:
```
cargo build --bin gpu_worker --features $(gpu-arch),gpu_process --target $(arch)-unknown-linux-gnu [--release]
```

Where `gpu-arch` is either `cuda` or `hip`, and `arch` is the host architecture (for example `x86_64`).
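
For example, a release build for a CUDA GPU on an x86_64 host (target triple assumed here) would be:
```
cargo build --bin gpu_worker --features cuda,gpu_process --target x86_64-unknown-linux-gnu --release
```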

Also make sure that shared memory objects are executable:
```
sudo mount -o remount,exec /dev/shm
```

### GPU worker path

To use a `gpu_worker` that is not at the original location it was built in, set the `GPU_WORKER_PATH` environment variable to point to the desired binary.
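
For example (path hypothetical):
```
export GPU_WORKER_PATH=/opt/dandelion/bin/gpu_worker
```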

## GPU engine library path
`DANDELION_LIBRARY_PATH` overrides the directory where the GPU engines look for kernel libraries. If the variable is unset, the engines look in `machine_interface/tests/libs/`.
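
For example, to point the engines at a custom kernel library directory (path hypothetical):
```
export DANDELION_LIBRARY_PATH=/opt/dandelion/libs
```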

## GPU Allocations
To prevent memory leaks, GPU kernels are disallowed from calling `malloc()`. All the memory a kernel requires must be specified in the respective config file.
2 changes: 2 additions & 0 deletions dandelion_commons/Cargo.toml
@@ -5,6 +5,8 @@ edition = "2021"

[features]
timestamp = []
reuse_weights = []
auto_batching = []

[dependencies]
hdrhist = "0.5.0"
11 changes: 11 additions & 0 deletions dandelion_commons/src/lib.rs
@@ -114,6 +114,17 @@ pub enum DandelionError {
OtherProctionError,
/// Work queue from the dispatcher to the engines is full
WorkQueueFull,
// GPU engine specific errors
/// error from HIP Runtime
HipError(String),
/// error from HIP Runtime
CudaError(String),
/// identifier used in config file was not declared before
UndeclaredIdentifier(String),
/// argument given to the FromInput sizing was out of bounds
FromInputOutOfBounds,
/// could not deserialise JSON for config
ParsingJSONError(String),
}
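
As a usage sketch, an engine can wrap a raw runtime status code into one of these variants; the `check_hip` helper and the assumption that `0` is the success code are illustrative, not part of this diff:
```rust
use dandelion_commons::DandelionError;

// Hypothetical helper: map a raw HIP status code to a DandelionError.
fn check_hip(status: i32, call: &str) -> Result<(), DandelionError> {
    if status == 0 {
        // Assumed success code of the runtime.
        Ok(())
    } else {
        Err(DandelionError::HipError(format!(
            "{} failed with status {}",
            call, status
        )))
    }
}
```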

// Implement display to be compliant with core::error::Error
196 changes: 195 additions & 1 deletion dandelion_commons/src/records.rs
@@ -1,11 +1,12 @@
use crate::FunctionId;
use core::fmt;
use std::sync::{Arc, Mutex};
use std::time::Instant;

/// Maximum usize to expect when converting a record point to a usize.
/// By setting the last variant to this value explicitly, the compiler will throw an error
/// if more variants are added without updating it, because discriminants enumerate from 0
/// and no value may be assigned twice.
-const LAST_RECORD_POINT: usize = 17;
+const LAST_RECORD_POINT: usize = 25;

#[repr(usize)]
#[derive(Clone, Copy, Debug, PartialEq)]
@@ -44,6 +45,23 @@ pub enum RecordPoint {
EngineStart,
/// End execution of the function on the engine (sync)
EngineEnd,
/// --- GPU ---
/// Start GPU inputs and buffers load (sync)
GPUTransferStart,
/// End GPU inputs and buffers load (sync)
GPUTransferEnd,
/// Start GPU kernel executions (sync)
GPUInferenceStart,
/// End GPU kernel executions (sync)
GPUInferenceEnd,
/// Start GPU output read (sync)
GPUOutputStart,
/// End GPU output read (sync)
GPUOutputEnd,
/// Start of a batched atom execution (sync)
BatchAtomStart,
/// End of a batched atom execution (sync)
BatchAtomEnd,
/// Return from execution engine (async)
FutureReturn = LAST_RECORD_POINT,
}
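
A sketch of how an engine might bracket the three GPU phases with these points, assuming `Recorder::record(&mut self, RecordPoint)` as used later in this file; the phase bodies are elided:
```rust
use dandelion_commons::records::{RecordPoint, Recorder};

// Hypothetical GPU engine snippet: open and close each span explicitly.
fn record_gpu_phases(recorder: &mut Recorder) {
    recorder.record(RecordPoint::GPUTransferStart);
    // ... copy inputs and buffers onto the device ...
    recorder.record(RecordPoint::GPUTransferEnd);

    recorder.record(RecordPoint::GPUInferenceStart);
    // ... launch the kernels ...
    recorder.record(RecordPoint::GPUInferenceEnd);

    recorder.record(RecordPoint::GPUOutputStart);
    // ... read the outputs back ...
    recorder.record(RecordPoint::GPUOutputEnd);
}
```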
@@ -165,24 +183,128 @@ impl TimestampArchive {
}
}

#[cfg(feature = "reuse_weights")]
struct ReuseWeightsArchive {
collected_gpu_cache_hit: std::sync::Mutex<Vec<bool>>,
collected_gpu_id: std::sync::Mutex<Vec<u8>>,
}

#[cfg(feature = "reuse_weights")]
impl ReuseWeightsArchive {
fn init() -> Self {
return Self {
collected_gpu_cache_hit: std::sync::Mutex::new(Vec::new()),
collected_gpu_id: std::sync::Mutex::new(Vec::new()),
};
}

fn insert(&self, new_gpu_cache_hit: bool, new_gpu_id: u8) {
let mut guard_cache = self.collected_gpu_cache_hit.lock().unwrap();
guard_cache.push(new_gpu_cache_hit);

let mut guard_gpu = self.collected_gpu_id.lock().unwrap();
guard_gpu.push(new_gpu_id);
}

fn reset(&self) {
let mut guard_cache = self.collected_gpu_cache_hit.lock().unwrap();
*guard_cache = Vec::new();

let mut guard_gpu = self.collected_gpu_id.lock().unwrap();
*guard_gpu = Vec::new();
}

fn append_gpu_info(&self, gpu_info: (bool, u8), summary: &mut String, indent: usize) {
// push self
summary.push_str(&format!(
"{}gpu_cache_hit:{}, gpu_id:{}",
"-".repeat(indent),
gpu_info.0,
gpu_info.1,
));
}

fn get_summary(&self, summary: &mut String) {
let cache_hits = self.collected_gpu_cache_hit.lock().unwrap();
let gpu_ids = self.collected_gpu_id.lock().unwrap();
for (gpu_cache_hit, gpu_id) in cache_hits.iter().zip(gpu_ids.iter()) {
let gpu_info = (*gpu_cache_hit, *gpu_id);
self.append_gpu_info(gpu_info, summary, 0);
summary.push_str("\n");
}
}
}

#[cfg(feature = "auto_batching")]
struct BatchArchive {
collected_batch_size: std::sync::Mutex<Vec<usize>>,
}

#[cfg(feature = "auto_batching")]
impl BatchArchive {
fn init() -> Self {
return Self {
collected_batch_size: std::sync::Mutex::new(Vec::new()),
};
}

fn insert(&self, new_batch_size: usize) {
let mut guard = self.collected_batch_size.lock().unwrap();
guard.push(new_batch_size);
}

fn reset(&self) {
let mut guard = self.collected_batch_size.lock().unwrap();
*guard = Vec::new();
}

fn append_batch_size(&self, batch_size: usize, summary: &mut String, indent: usize) {
// push self
summary.push_str(&format!("{}batch_size:{}", "-".repeat(indent), batch_size));
}

fn get_summary(&self, summary: &mut String) {
for recorder in self.collected_batch_size.lock().unwrap().iter() {
self.append_batch_size(*recorder, summary, 0);
summary.push_str("\n");
}
}
}

/// General implementation of recorder struct, additional functionality enabled by flags
pub struct Recorder {
#[cfg(feature = "timestamp")]
timestamps: std::sync::Arc<FunctionTimestamp>,
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: Arc<Mutex<bool>>,
#[cfg(feature = "reuse_weights")]
gpu_id: Arc<Mutex<u8>>,
#[cfg(feature = "auto_batching")]
batch_size: Arc<Mutex<usize>>,
}

impl Recorder {
pub fn new(_function_id: FunctionId, _start: Instant) -> Self {
return Self {
#[cfg(feature = "timestamp")]
timestamps: FunctionTimestamp::new(_function_id, _start),
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: Arc::new(Mutex::new(false)),
#[cfg(feature = "reuse_weights")]
gpu_id: Arc::new(Mutex::new(u8::MAX)),
#[cfg(feature = "auto_batching")]
batch_size: Arc::new(Mutex::new(0)),
};
}

pub fn new_from_parent(_function_id: FunctionId, _parent: &Self) -> Self {
return Self {
#[cfg(feature = "timestamp")]
timestamps: FunctionTimestamp::new(_function_id, _parent.timestamps.creation),
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: Arc::new(Mutex::new(false)),
#[cfg(feature = "reuse_weights")]
gpu_id: Arc::new(Mutex::new(u8::MAX)),
#[cfg(feature = "auto_batching")]
batch_size: Arc::new(Mutex::new(0)),
};
}

@@ -191,6 +313,24 @@ impl Recorder {
self.timestamps.record(_current_point);
}

pub fn set_gpu_info(&mut self, _gpu_cache_hit: bool, _gpu_id: u8) {
#[cfg(feature = "reuse_weights")]
{
let mut gpu_cache_hit = self.gpu_cache_hit.lock().unwrap();
*gpu_cache_hit = _gpu_cache_hit;
let mut gpu_id = self.gpu_id.lock().unwrap();
*gpu_id = _gpu_id;
}
}

pub fn set_batch_size(&mut self, _batch_size: usize) {
#[cfg(feature = "auto_batching")]
{
let mut batch_size = self.batch_size.lock().unwrap();
*batch_size = _batch_size;
}
}
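
Usage sketch for these setters, continuing the hypothetical engine snippet above (values illustrative):
```rust
fn report_gpu_metadata(recorder: &mut Recorder) {
    // Weights were already resident on device 0; request served in a batch of 8.
    recorder.set_gpu_info(true, 0);
    recorder.set_batch_size(8);
}
```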

pub fn add_children(&mut self, _new_children: Vec<Recorder>) {
#[cfg(feature = "timestamp")]
for child in _new_children {
@@ -202,6 +342,12 @@ impl fmt::Display for Recorder {
let recorder = Recorder {
#[cfg(feature = "timestamp")]
timestamps: self.timestamps.clone(),
#[cfg(feature = "reuse_weights")]
gpu_cache_hit: self.gpu_cache_hit.clone(),
#[cfg(feature = "reuse_weights")]
gpu_id: self.gpu_id.clone(),
#[cfg(feature = "auto_batching")]
batch_size: self.batch_size.clone(),
};
return recorder;
}
@@ -218,13 +364,39 @@ impl fmt::Display for Recorder {
}
self.timestamps.fmt(_f)?;
}
#[cfg(feature = "reuse_weights")]
{
if std::sync::Arc::strong_count(&self.gpu_cache_hit) != 1
&& std::sync::Arc::weak_count(&self.gpu_cache_hit) != 0
{
panic!("Trying to format recorder that still has more than one reference");
}
#[cfg(feature = "timestamp")]
write!(_f, ",")?;
write!(_f, " gpu_cache_hit: {}, gpu_id: {}", self.gpu_cache_hit.lock().unwrap(), self.gpu_id.lock().unwrap())?;
}
#[cfg(feature = "auto_batching")]
{
if std::sync::Arc::strong_count(&self.batch_size) != 1
&& std::sync::Arc::weak_count(&self.batch_size) != 0
{
panic!("Trying to format recorder that still has more than one reference");
}
#[cfg(feature = "timestamp")]
write!(_f, ",")?;
write!(_f, " batch_size: {}", self.batch_size.lock().unwrap())?;
}
Ok(())
}
}

pub struct Archive {
#[cfg(feature = "timestamp")]
timestamp_archive: TimestampArchive,
#[cfg(feature = "reuse_weights")]
gpu_info_archive: ReuseWeightsArchive,
#[cfg(feature = "auto_batching")]
batch_archive: BatchArchive,
}

pub struct ArchiveInit {
Expand All @@ -237,13 +409,26 @@ impl Archive {
return Archive {
#[cfg(feature = "timestamp")]
timestamp_archive: TimestampArchive::init(),
#[cfg(feature = "reuse_weights")]
gpu_info_archive: ReuseWeightsArchive::init(),
#[cfg(feature = "auto_batching")]
batch_archive: BatchArchive::init(),
};
}

pub fn insert_recorder(&self, _recorder: Recorder) {
#[cfg(feature = "timestamp")]
self.timestamp_archive
.insert(std::sync::Arc::into_inner(_recorder.timestamps).unwrap());
#[cfg(feature = "reuse_weights")]
self.gpu_info_archive.insert(
std::sync::Arc::into_inner((*_recorder.gpu_cache_hit.lock().unwrap()).into()).unwrap(),
std::sync::Arc::into_inner((*_recorder.gpu_id.lock().unwrap()).into()).unwrap(),
);
#[cfg(feature = "auto_batching")]
self.batch_archive.insert(
std::sync::Arc::into_inner((*_recorder.batch_size.lock().unwrap()).into()).unwrap(),
);
}

pub fn get_summary(&self) -> String {
@@ -252,11 +437,20 @@ impl Archive {
let mut summary = String::new();
#[cfg(feature = "timestamp")]
self.timestamp_archive.get_summary(&mut summary);
#[cfg(feature = "reuse_weights")]
self.gpu_info_archive.get_summary(&mut summary);
#[cfg(feature = "auto_batching")]
self.batch_archive.get_summary(&mut summary);
println!("{}", summary);
return summary;
}

pub fn reset(&self) {
#[cfg(feature = "timestamp")]
self.timestamp_archive.reset();
#[cfg(feature = "reuse_weights")]
self.gpu_info_archive.reset();
#[cfg(feature = "auto_batching")]
self.batch_archive.reset();
}
}
5 changes: 4 additions & 1 deletion dispatcher/Cargo.toml
@@ -22,4 +22,7 @@ wasm = ["machine_interface/wasm"]
mmu = ["machine_interface/mmu"]
kvm = ["machine_interface/kvm"]
reqwest_io = ["machine_interface/reqwest_io"]
timestamp = ["dandelion_commons/timestamp"]
timestamp = ["dandelion_commons/timestamp"]
gpu = ["machine_interface/gpu"]
gpu_queue = []
auto_batching = []
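
For example, a dispatcher build exercising the GPU path with batching and timing might enable (feature combination assumed, not a documented target):
```
cargo build -p dispatcher --features gpu,gpu_queue,auto_batching,timestamp
```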
22 changes: 22 additions & 0 deletions dispatcher/src/composition.rs
@@ -400,6 +400,28 @@ impl From<(usize, Vec<Arc<Context>>)> for CompositionSet {
}
}

// TODO : is there a better way?
#[cfg(feature = "auto_batching")]
use machine_interface::function_driver::AtomInputs;
#[cfg(feature = "auto_batching")]
impl From<CompositionSet> for AtomInputs {
fn from(set: CompositionSet) -> AtomInputs {
AtomInputs {
item_list: set.item_list,
set_index: set.set_index,
}
}
}
#[cfg(feature = "auto_batching")]
impl From<AtomInputs> for CompositionSet {
fn from(atom_inputs: AtomInputs) -> CompositionSet {
CompositionSet {
item_list: atom_inputs.item_list,
set_index: atom_inputs.set_index,
}
}
}
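
A minimal sketch of the round trip these conversions enable when the batching queue hands a set to an engine and later reassembles it:
```rust
#[cfg(feature = "auto_batching")]
fn roundtrip(set: CompositionSet) -> CompositionSet {
    // CompositionSet -> AtomInputs (engine side) and back again, unchanged.
    let atom: AtomInputs = set.into();
    CompositionSet::from(atom)
}
```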

pub struct CompositionSetTransferIterator<'origin> {
/// set for which this iterator is implemented
set_iterator: std::slice::Iter<'origin, (u32, usize, Arc<Context>)>,
Expand Down