Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ CPMAddPackage("gh:simdjson/simdjson#v3.9.2")
find_package(RapidJSON REQUIRED)
find_library(TCLAP tclap)

option(GPUEVM_O2_DEBUG "Enable O2 debug for CUDA" ON)
option(GPUEVM_DEBUG_INTERPRETER "Printing from device during interpreter runtime" OFF)
option(GPUEVM_DEBUG_FUZZER "Print debug info from fuzzer during runtime" OFF)
option(GPUEVM_DEBUG_NVTX "Printing nvtx context pushing/popping during runtime" OFF)
Expand Down Expand Up @@ -103,10 +104,13 @@ function(configure_target_gpuevm target)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g>)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -v>)

# TODO make this an option
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-dopt=on>)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O2>)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -O2>)
# Recall that we have this a suboption of the debug build type bc we couldn't figure out how to capture
# all the lineinfo in a -g Release build
if(GPUEVM_O2_DEBUG)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-dopt=on>)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O2>)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas -O2>)
endif()

# And set the GPUEVM_DEBUG symbol
target_compile_definitions(${target} PRIVATE GPUEVM_DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion src/common/fuzz_algos/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ set_target_properties(evm_fuzz_algos PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

target_include_directories(evm_fuzz_algos PUBLIC ${GPU_EVM_SOURCE_DIR}/src/common)

target_link_libraries(evm_fuzz_algos PUBLIC evm_ingest ${CUDA_LIBRARIES} libcuda.so cudart)
target_link_libraries(evm_fuzz_algos PUBLIC evm_ingest ${CUDA_LIBRARIES} libcuda.so cudart termcolor)
63 changes: 30 additions & 33 deletions src/common/fuzz_algos/fuzz_check_invariants.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <iomanip>
#include <iostream>
#include <termcolor/termcolor.hpp>

__global__ void set_test_selectors_kernel(GpuArrayView<uint8_t> test_selectors_out, const __restrict__ uint8_t* invariant_selectors, const ExtFuzzFuncs fuzz_funcs, uint32_t n_tested_storages, uint32_t n_tot_runners) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
Expand Down Expand Up @@ -40,68 +41,64 @@ void set_test_selectors(GpuArrayView<uint8_t> test_selectors, const ExtFuzzFuncs
safeCudaFree(selector_buf);
}

__global__ void set_storage_indices_kernel(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads, size_t index_offset) {
__global__ void set_storage_indices_kernel(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads) {
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
// For tid < n_tested_storages, we're already done with the cyclical assignment
if (tid < n_tested_storages) return;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can set the index of storage_indices with an offset like n_tested_storages. For example, updating storage_indices by using storage_indices[n_tested_storages + tid];

// All evms above n_test_threads just get storage index 0, since it's guaranteed to exist
if (tid >= MAX_N_SEPARATE_CONTEXTS) return;
if (tid >= n_test_threads) {
storage_indices[tid] = 0;
} else {
storage_indices[tid] = (tid % n_tested_storages) + index_offset;
storage_indices[tid] = storage_indices[tid % n_tested_storages];
}
}

void set_storage_indices(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads, size_t index_offset) {
void cyclic_repeat_storage_indices(GpuArrayView<size_t> storage_indices, uint32_t n_tested_storages, uint32_t n_test_threads) {
auto [grid_size, block_size] = get_cuda_max_occupancy(storage_indices.size, (void*)set_storage_indices_kernel);
set_storage_indices_kernel<<<grid_size, block_size>>>(storage_indices, n_tested_storages, n_test_threads, index_offset);
set_storage_indices_kernel<<<grid_size, block_size>>>(storage_indices, n_tested_storages, n_test_threads);
safeStreamSync();
}

// TODO store this in the storage corpus itself
size_t first_unchecked_idx = 0;
void fuzz_check_invariants(FuzzerState& fuzz_state) {
gpuevm_nvtx_push("fuzz_check_invariants");
// TODO: Complete the implementation and remove limitations

// THE AMPERSAND IS NECESSARY WHEN USING AUTO; OTHERWISE YOU GET A BY-VALUE COPY!
auto& api_ctx = fuzz_active_ctx();

assert(fuzz_state.fuzz_funcs.n_invariant_funcs > 0);
assert(fuzz_state.fuzz_funcs.n_invariant_funcs <= MAX_N_SEPARATE_CONTEXTS);
assert(api_ctx.n_runners == MAX_N_SEPARATE_CONTEXTS);
assert(fuzz_state.fuzz_funcs.n_invariant_funcs > 0 && "No invariants to check");
assert(fuzz_state.fuzz_funcs.n_invariant_funcs <= MAX_N_SEPARATE_CONTEXTS && "Too many invariants to check (more than one per thread)");
assert(api_ctx.n_runners == MAX_N_SEPARATE_CONTEXTS && "Wrong number of interpreter threads, must use one per cuda thread");

// The actual number of storages we can test (since each storage needs to be tested against all invariants)
auto n_tested_storages = MAX_N_SEPARATE_CONTEXTS / fuzz_state.fuzz_funcs.n_invariant_funcs;
auto n_test_threads = n_tested_storages * fuzz_state.fuzz_funcs.n_invariant_funcs;

// TODO remove this requirement
// There must be enough storages actually recorded
assert(fuzz_state.storage_corpus.size >= n_tested_storages);

/* Plan for using/setting the checkedness on storages
1. Get the first storage index that is unchecked, (This should and can just be tracked in single counter, until we switch to having separate fuzzing and invariantchecking storage corpori)
2. Doublecheck that we actually have enough storages to run a full batch
3. Use the index as offset when we make the cyclic storage selection array
4. Mark the check storages from that index as checked
/* Storage selection for invariant checking
1. Check that there are n_tested_storages that are unchecked
2. Get the n_tested_storages first such indices
3. Use that array of indices to cyclically repeat
4. Mark those storages as checked
5. Move the storages into the ctx's gpu_map
*/

// 1. Get the first unchecked index
// TODO use cub::argindexinputiterator in the future?

// 2. Check that we have enough storages
assert(fuzz_state.storage_corpus.size > first_unchecked_idx + n_tested_storages);
// 1. Check that there are n_tested_storages that are unchecked
assert(fuzz_state.storage_corpus.num_unchecked_cases() >= n_tested_storages && "Not enough unchecked storages to check invariants on");

// 3. Use the unchecked_idx as (value)offset in storage selection array
// 2. Get the n_tested_storages first such indices
GpuArray<size_t> storage_indices(MAX_N_SEPARATE_CONTEXTS);
set_storage_indices(storage_indices.view(), n_tested_storages, n_test_threads, first_unchecked_idx);
auto small_storage_indices_view = storage_indices.view();
// We need to let the get_unchecked_indices only put n_tested_storages into the array
small_storage_indices_view.size = n_tested_storages;
fuzz_state.storage_corpus.get_unchecked_indices(small_storage_indices_view, n_tested_storages);
// 3. Use that array of indices to cyclically repeat
cyclic_repeat_storage_indices(storage_indices.view(), n_tested_storages, n_test_threads);

// 4. Set the checkedness of the storages to true
// TODO this should also probably be a member of the storage corpus
for (auto i = 0; i < n_tested_storages; i++) {
fuzz_state.storage_corpus.metadatas[i + first_unchecked_idx].checked_invariants = true;
}
fuzz_state.storage_corpus.mark_checked(small_storage_indices_view);

// Set the storages into the ctx
// 5. Set the storages into the ctx's gpu_map
fuzz_state.storage_corpus.scatter_kvs(api_ctx.storage, storage_indices.view());

// Now we group the selectors into contiguous groups to get maximal warp coherence
Expand Down Expand Up @@ -150,17 +147,17 @@ void fuzz_check_invariants(FuzzerState& fuzz_state) {

CUDA_SAFE_CALL(cudaDeviceSynchronize());

size_t n_broken_invariants = fuzz_state.fuzz_funcs.n_invariant_funcs - std::count(invariant_revert_mask.data, invariant_revert_mask.data + fuzz_state.fuzz_funcs.n_invariant_funcs, 0);

// If any reverts are new (i.e. set in this mask, but not previously set in the reverted_invariants_times), set to current time
fuzz_time_diff revert_time = timing::time_diff(fuzz_state.start_time, timing::current_time());
for (auto invariant_idx = 0; invariant_idx < fuzz_state.fuzz_funcs.n_invariant_funcs; invariant_idx++) {
if (invariant_revert_mask[invariant_idx] && fuzz_state.reverted_invariants_times[invariant_idx] == 0) {
std::cout << "[Broke inv " << invariant_idx << "]";
std::cout << termcolor::red << "[Broke inv " << invariant_idx << "| tot broken: " << n_broken_invariants << "]" << termcolor::reset;
fuzz_state.reverted_invariants_times[invariant_idx] = revert_time;
}
}
// TODO check what the actual memory consistency guarantees are for stream syncs vs device syncs and manual (i.e. not via kernels that are in streams) access to unified memory
CUDA_SAFE_CALL(cudaDeviceSynchronize());

first_unchecked_idx += n_tested_storages;
gpuevm_nvtx_pop();
}
104 changes: 99 additions & 5 deletions src/common/fuzz_algos/fuzz_cull.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,108 @@
#include "fuzz_cull.h"

#include <fuzz_dss/gpu_array_utils.h>

#include <iomanip>

/*
Culling the queue of fuzzing inputs.
Culling the inputs/storages that have been accumulated.

Calldata culling ideas:
Initial implementation reconstructs all edge coverage found by every input in the queue, and then finds a subset (not necessarily minimal, this is subset cover which is NP hard) of queue entries
that cover the same edges.

Storage culling ideas:
Remove all storages that have above M visits and below R ratio of votes/visits
TODO think about some sort of probabilistic thing to cull
TODO make it reach a target absolute # passing instead (histogram vote ratios, then cutoff)
*/

namespace fuzz_culling {
// Thresholds for storage culling: a storage must survive MIN_VISITS fuzz visits,
// after which its votes/visits ratio must clear MIN_VOTE_RATIO or it is culled.
constexpr int MIN_VISITS = 1024;
// Fixed: `1 / 1024` was integer division and evaluated to 0, so the ratio
// threshold never culled anything. Use float literals for the intended 1/1024.
constexpr float MIN_VOTE_RATIO = 1.0f / 1024.0f;

// One thread per storage entry: writes into passes_mask whether the storage
// survives culling (true = keep, false = cull). Expects a 1D launch with at
// least metadata.size threads; assumes passes_mask.size >= metadata.size.
// (Fixed: removed the dead `votes` local and corrected comments that claimed
// under-visited storages are unconditionally spared — they are not.)
__global__ void cull_storages_mark_kernel(const GpuArrayView<StorageCaseMetadata> metadata, GpuArrayView<bool> passes_mask, const GpuArrayView<size_t> storage_sizes, size_t max_storage_size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= metadata.size) return;
    auto visits = metadata[tid].fuzz_case_metadata.visits;
    auto ratio = metadata[tid].fuzz_case_metadata.vote_ratio();
    auto storage_size = storage_sizes[tid];
    // TODO come up with good rules here
    if (visits < MIN_VISITS) {
        // Too few visits to judge vote quality yet: keep only storages at (or one
        // below) the corpus-wide maximum kv size; smaller unproven storages are culled.
        passes_mask[tid] = storage_size >= (max_storage_size - 1);
    } else {
        // Enough visits: it must have produced good results (ratio above threshold)
        passes_mask[tid] = (ratio >= MIN_VOTE_RATIO);
    }
}

// Returns the largest kv-size value in the corpus-wide size array.
// NOTE(review): thrust::max_element over raw pointers with thrust::device returns
// a pointer into device memory; dereferencing it on the host assumes the buffer
// is unified/managed memory — TODO confirm against GpuArrayView's allocator.
size_t max_storage_size(const GpuArrayView<size_t>& kv_sizes) {
    const size_t* first = kv_sizes.data;
    const size_t* last = first + kv_sizes.size;
    return *thrust::max_element(thrust::device, first, last);
}

// Fills passes_mask (true = keep) for every storage in the corpus by launching
// cull_storages_mark_kernel. Does not synchronize; callers sync before reading
// the mask.
void cull_storages_mark(FuzzStorageBuffer& storage_corpus, GpuArrayView<bool> passes_mask) {
    assert(storage_corpus.size <= passes_mask.size);
    // The corpus-wide maximum kv size is computed first so that max-size storages
    // can be exempted from culling inside the kernel.
    const auto biggest_kv_size = max_storage_size(storage_corpus.kv_sizes.view());

    auto [n_blocks, n_threads] = get_cuda_max_occupancy(storage_corpus.metadatas.size(), (void*)cull_storages_mark_kernel);
    cull_storages_mark_kernel<<<n_blocks, n_threads>>>(storage_corpus.metadatas.view(), passes_mask, storage_corpus.kv_sizes.view(), biggest_kv_size);
}

// Given a pass mask (true = keep), removes & compacts the storage corpus in place,
// then preserves the culled storages that are ancestors of surviving ones by
// pushing them into the separate ancestor corpus.
// TODO think about whether ancestor handling should be done in a GC-like way instead
void cull_storages_filter(FuzzerState& fuzz_state, GpuArrayView<bool> passes_mask) {
    // filter() compacts storage_corpus and returns the batch of removed storages

    auto culled_storages = fuzz_state.storage_corpus.filter(passes_mask);
    // Mask over the culled batch: which removed storages are ancestors of storages
    // still alive in the (already-compacted) corpus — presumably; verify ordering
    // assumptions of get_ancestor_mask against its implementation.
    GpuArray<bool> ancestor_mask = fuzz_state.storage_corpus.get_ancestor_mask(culled_storages);
    // NB ignoring filter()'s return value here; we only want culled_storages
    // narrowed down to the ancestors worth keeping
    culled_storages.filter(ancestor_mask);
    // TODO make sure push_batch actually checks hashes
    fuzz_state.ancestor_corpus.push_batch(culled_storages);
}

}

// Entry point for one culling pass: marks each storage as keep/cull on the GPU,
// and — only if at least 32 storages would survive — compacts the corpus while
// preserving ancestors, then prints a short stats line ("P" = nothing failed,
// "N" = cull skipped to avoid emptying the corpus).
void fuzz_cull(FuzzerState& fuzz_state) {
    // NOTE(review): leftover debug print — likely meant to be removed once the
    // GPUEVM_DEBUG-gated message below was added; confirm and delete.
    std::cout << "hello world I am culling!" << std::endl;
#ifdef GPUEVM_DEBUG
    std::cout << "Culling from corpus size " << fuzz_state.corpus.size() << std::endl;
#endif
    gpuevm_nvtx_push("fuzz_cull");

    // Corpus size before any compaction, for the stats printout below
    long long num_passes_before = fuzz_state.storage_corpus.size;

    // TODO cache the various tempbuffers here
    GpuArray<bool> passes_mask(fuzz_state.storage_corpus.size);

    // We get back a mask that has true if the storage passed, and false if it should be culled (with no regard for ancestor-keeping)
    fuzz_culling::cull_storages_mark(fuzz_state.storage_corpus, passes_mask.view());
    // NOTE(review): stream sync followed by device sync looks redundant — one
    // should suffice before the host reads the mask; confirm intent.
    safeStreamSync();
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Don't allow removing all states, so check that enough are kept before actually culling
    if (device_array_count<bool>(passes_mask, true) >= 32) {
        // Compacts the corpus; ancestors of survivors are moved to ancestor_corpus
        fuzz_culling::cull_storages_filter(fuzz_state, passes_mask.view());
        safeStreamSync();

        CUDA_SAFE_CALL(cudaDeviceSynchronize());
        // Count and print some stats about the passes_mask (the mask itself is
        // unchanged by the filter, so this recount equals the pre-filter count)
        long long num_passes_after = device_array_count<bool>(passes_mask, true);
        long long failed = num_passes_before - num_passes_after;
        if (failed > 0) {
            std::cout << "[Cull: " << num_passes_after << " pass, " << failed << " failed | ";
            std::cout << "Rate " << std::fixed << std::setprecision(1) << 100 * failed / (float)num_passes_before << "%]";
        } else {
            std::cout << "P";
        }
    } else {
        std::cout << "N";
    }

    gpuevm_nvtx_pop();
}
14 changes: 14 additions & 0 deletions src/common/fuzz_algos/fuzz_dedup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,20 @@ namespace fuzz_dedup {
return marked_inputs;
}

// One thread per fuzz case: if the case is marked interesting, atomically add one
// vote to the metadata of the storage it started from (storage_indices maps
// case index -> source-storage index). Atomics are required because many cases
// can share the same source storage.
// NOTE(review): the cast treats `votes` as a 64-bit unsigned counter — TODO
// confirm the declared type of fuzz_case_metadata.votes.
__global__ void add_votes_to_storages_kernel(const GpuArrayView<bool> interesting_mask, const GpuArrayView<size_t> storage_indices, GpuArrayView<StorageCaseMetadata> metadata) {
    const int case_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (case_idx >= interesting_mask.size) return;
    if (!interesting_mask.data[case_idx]) return;
    auto* vote_counter = &(metadata[storage_indices[case_idx]].fuzz_case_metadata.votes);
    atomicAdd((unsigned long long*)vote_counter, (unsigned long long)1);
}

// Host-side launcher for add_votes_to_storages_kernel: one thread per entry of
// interesting_mask. Does not synchronize after the launch; callers that read
// metadata on the host must sync first.
void add_votes_to_storages(const GpuArrayView<bool> interesting_mask, const GpuArrayView<size_t> storage_indices, GpuArrayView<StorageCaseMetadata> metadata) {
    gpuevm_nvtx_push("add_votes_to_storages");
    auto [n_blocks, n_threads] = get_cuda_max_occupancy(interesting_mask.size, (void*)add_votes_to_storages_kernel);
    add_votes_to_storages_kernel<<<n_blocks, n_threads>>>(interesting_mask, storage_indices, metadata);
    gpuevm_nvtx_pop();
}

__global__ void get_revert_mask_kernel(uint8_t* const bytecode, const GpuArrayView<evm_tracer_t> tracers_in, GpuArrayView<bool> reverting_mask) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= tracers_in.size) return;
Expand Down
3 changes: 2 additions & 1 deletion src/common/fuzz_algos/fuzz_dedup.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ namespace fuzz_dedup {
FuzzCaseInputs filter_inputs_by_mask(const FuzzCaseInputs& run_inputs, const GpuArrayView<bool>& mask);

std::pair<std::optional<GpuArrayView<bool>>, std::optional<FuzzCaseInputs>> get_reverting_data(uint8_t* const bytecode, evm_tracer_t* tracer_buf, const FuzzCaseInputs& run_inputs);
void add_votes_to_storages(const GpuArrayView<bool> interesting_mask, const GpuArrayView<size_t> storage_indices, GpuArrayView<StorageCaseMetadata> metadata);
void get_revert_mask(uint8_t* const bytecode, const GpuArrayView<evm_tracer_t> tracers_in, GpuArrayView<bool> reverting_mask);
void compute_revert_discard_mask(const GpuArrayView<bool> reverting_mask, const GpuArrayView<evm_tracer_t> tracers_in, GpuArrayView<bool> revert_discard_mask);

Expand All @@ -22,4 +23,4 @@ namespace fuzz_dedup {
void inplace_bool_or_gpuarrays(const GpuArrayView<bool> a_inout, const GpuArrayView<bool> b_in);
void inplace_bool_not_gpuarrays(const GpuArrayView<bool> a_inout);
void inplace_bool_and_gpuarrays(const GpuArrayView<bool> a_inout, const GpuArrayView<bool> b_in);
}
}
12 changes: 10 additions & 2 deletions src/common/fuzz_algos/fuzz_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ FuzzerState::FuzzerState(const ExtGpuEvmFuzzSetup& setup, const FuzzStorageCase&
corpus(EvmFlatAbiTypes(fuzz_funcs.stateful_funcs[0].evm_flat_abi).calldata_size()),
storage_corpus(initial_storage),
storage_hashes_seen(max_corpus_size_pow_2 + 5),
ancestor_corpus(),
storage_dep_map(max_corpus_size_pow_2 + 5),
reverting_exit_datas_map(max_corpus_size_pow_2 + 5),
reverting_corpus(EvmFlatAbiTypes(fuzz_funcs.stateful_funcs[0].evm_flat_abi).calldata_size()),
Expand Down Expand Up @@ -120,10 +121,12 @@ std::tuple<FuzzCaseInputs, std::optional<FuzzCaseInputs>, std::optional<FuzzStor
gpuevm_nvtx_push("run_get_interesting");

// Set the storage cases to use
storage_corpus.scatter_kvs(fuzz_active_ctx().storage, storage_indices);
// Also mark these storage scatters as being real visits in the metadata
storage_corpus.scatter_kvs(fuzz_active_ctx().storage, storage_indices, true);

// Setup gpuevm with those calldatas
fuzz_set_calldatas(inputs);

// Put the current distmap into interpreter memory
// This can do resizing under the hood
dist_map.sync_to_ctx(fuzz_active_ctx());
Expand Down Expand Up @@ -182,7 +185,7 @@ std::tuple<FuzzCaseInputs, std::optional<FuzzCaseInputs>, std::optional<FuzzStor
auto n_interesting_storages = device_array_count(GpuArrayView(scratch_storage_hashes_interesting_mask), true);
gpuevm_nvtx_pop();

// Immediately collect the interesting storages
// Immediately collect the interesting storages (NB here interesting storage means that is has new storagehash, and is not discarded)
std::optional<FuzzStorageCaseBatch> interesting_storages;
if (n_interesting_storages > 0) {
interesting_storages = FuzzStorageCaseBatch(run_ctx, scratch_storage_hashes_interesting_mask);
Expand Down Expand Up @@ -261,6 +264,11 @@ std::tuple<FuzzCaseInputs, std::optional<FuzzCaseInputs>, std::optional<FuzzStor
return {fuzz_dedup::concat_fuzz_case_inputs(interesting_inputs, preserve_add_back_inputs), opt_reverting_inputs, interesting_storages};
}
}

// For all the interesting inputs, we want to add votes to the storages they started with
// So we use the general interestingness mask and add votes to those storages
fuzz_dedup::add_votes_to_storages(interesting_mask, input_storage_indices, storage_corpus.metadatas.view());

gpuevm_nvtx_pop();
return {interesting_inputs, opt_reverting_inputs, interesting_storages};
}
Expand Down
Loading