diff --git a/src/Makefile b/src/Makefile index a890d58f7..f3cf86606 100644 --- a/src/Makefile +++ b/src/Makefile @@ -13,6 +13,10 @@ all: pannotia rodinia_2.0-ft proxy-apps microbench rodinia-3.1 ispass-2009 pol endif endif ci: rodinia_2.0-ft rodinia-3.1 GPU_Microbenchmark cutlass_mini cuda_samples +# Keep the repository pinned to the legacy cuda-samples revision for older toolkits, +# but use the upstream CUDA 13.1 samples when building with CUDA 13. +CUDA_SAMPLES_CUDA13_REF := 4f735616ba599fe93cc2c6c85dcb4369260f9643 +CUDA_SAMPLES_PINNED_REF := $(shell git -C .. rev-parse HEAD:src/cuda/cuda-samples 2>/dev/null) accelwattch: accelwattch_validation accelwattch_hw_power accelwattch_ubench accelwattch_validation: rodinia-3.1_accelwattch_validation parboil_accelwattch_validation cutlass cuda_samples-11.0_accelwattch_validation accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw_power @@ -518,8 +522,40 @@ mlperf_training: cuda_samples: mkdir -p $(BINDIR)/$(BINSUBDIR)/ - mkdir -p ./cuda/cuda-samples/build && cd ./cuda/cuda-samples/build && cmake .. 
&& $(MAKE) - find $(GPUAPPS_ROOT)/src/cuda/cuda-samples/build/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ; + set -eu; \ + repo_dir=./cuda/cuda-samples; \ + desired_ref="$(CUDA_SAMPLES_PINNED_REF)"; \ + if [ -z "$$desired_ref" ]; then \ + desired_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \ + fi; \ + source_dir="$$repo_dir"; \ + build_dir="$$repo_dir/build-$(CUDA_VERSION)"; \ + export_dir=; \ + cleanup_cuda_samples() { \ + if [ -n "$$export_dir" ] && [ -d "$$export_dir" ]; then \ + rm -rf "$$export_dir"; \ + fi; \ + }; \ + trap 'cleanup_cuda_samples' EXIT; \ + if [ "$${CUDA_VERSION_MAJOR:-0}" -ge 13 ]; then \ + desired_ref="$(CUDA_SAMPLES_CUDA13_REF)"; \ + fi; \ + original_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \ + if [ "$$original_ref" != "$$desired_ref" ]; then \ + export_dir=./cuda/cuda-samples-export-$(CUDA_VERSION); \ + build_dir="$$repo_dir/build-$(CUDA_VERSION)-compat"; \ + git -C "$$repo_dir" rev-parse --verify "$$desired_ref^{commit}" >/dev/null; \ + rm -rf "$$export_dir"; \ + mkdir -p "$$export_dir"; \ + archive_file="$$export_dir/source.tar"; \ + git -C "$$repo_dir" archive --format=tar "$$desired_ref" > "$$archive_file"; \ + tar -xf "$$archive_file" -C "$$export_dir"; \ + rm -f "$$archive_file"; \ + source_dir="$$export_dir"; \ + fi; \ + cmake -S "$$source_dir" -B "$$build_dir"; \ + cmake --build "$$build_dir"; \ + find "$$build_dir"/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ; pytorch_examples: mkdir -p $(BINDIR)/$(BINSUBDIR)/ @@ -711,10 +747,11 @@ clean_pytorch_examples: rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae clean_cuda_samples: - $(MAKE) clean -C ./cuda/cuda-samples/build + find ./cuda/cuda-samples -maxdepth 1 -type d \( -name 'build' -o -name 'build-*' \) -exec rm -rf {} + + find ./cuda -maxdepth 1 -type d -name 'cuda-samples-export-*' -exec rm -rf {} + clean_huggingface: rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface clean_GPU_Microbenchmark: - $(MAKE) clean -C ./cuda/GPU_Microbenchmark \ No newline 
at end of file + $(MAKE) clean -C ./cuda/GPU_Microbenchmark diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h index 0b0306596..82caee7ea 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h @@ -46,6 +46,15 @@ struct GpuConfig unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs) }; inline GpuConfig config; + +inline int getDeviceAttributeOrZero(cudaDeviceAttr attr, int deviceID) +{ + int value = 0; + if (cudaDeviceGetAttribute(&value, attr, deviceID) != cudaSuccess) + return 0; + return value; +} + // Parses short flags like --sm 80 into a GpuConfig object inline void parseGpuConfigArgs(int argc, char *argv[]) { @@ -280,8 +289,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[]) cudaSetDevice(deviceID); cudaGetDeviceProperties(&deviceProp, deviceID); - int clockRateKHz; - cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID); + int clockRateKHz = + getDeviceAttributeOrZero(cudaDevAttrClockRate, deviceID); + int memoryClockRateKHz = + getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, deviceID); + int memoryBusWidthBits = + getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, deviceID); // core stats config.SM_NUMBER = deviceProp.multiProcessorCount; @@ -310,9 +323,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[]) // memory config.MEM_SIZE = deviceProp.totalGlobalMem; - config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f; - config.MEM_BITWIDTH = deviceProp.memoryBusWidth; - config.CLK_FREQUENCY = clockRateKHz * 1e-3f; + if (memoryClockRateKHz > 0) + config.MEM_CLK_FREQUENCY = memoryClockRateKHz * 1e-3f; + if (memoryBusWidthBits > 0) + config.MEM_BITWIDTH = memoryBusWidthBits; + if (clockRateKHz > 0) + config.CLK_FREQUENCY = clockRateKHz * 1e-3f; // Get FBP_COUNT and L2_BANKS from NVIDIA RM API config.FBP_COUNT = 
queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS); diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu index 088366980..beb5e39f3 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu @@ -12,9 +12,8 @@ int main(int argc, char *argv[]) snprintf(msg, sizeof(msg), "Global memory size = %.0f GB\n", static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f)); std::cout << msg; - std::cout << "Memory Clock rate = " << deviceProp.memoryClockRate * 1e-3f - << " Mhz\n"; - std::cout << "Memory Bus Width = " << deviceProp.memoryBusWidth << " bit\n"; + std::cout << "Memory Clock rate = " << config.MEM_CLK_FREQUENCY << " Mhz\n"; + std::cout << "Memory Bus Width = " << config.MEM_BITWIDTH << " bit\n"; std::cout << "Memory type = " << dram_model_str[DRAM_MODEL] << "\n"; std::cout << "Memory channels = " << config.FBP_COUNT << "\n"; @@ -38,7 +37,7 @@ int main(int argc, char *argv[]) << dram_model_freq_ratio[DRAM_MODEL] << std::endl; // timing - float device_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) / + float device_freq_MHZ = (config.MEM_CLK_FREQUENCY * 2) / dram_model_freq_ratio[DRAM_MODEL]; if (DRAM_MODEL == dram_model::HBM) { diff --git a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu index 6ff204623..a858a8ba7 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu @@ -16,6 +16,14 @@ struct alignas(DATA_SIZE) Data { }; static_assert(sizeof(Data) == DATA_SIZE, "Data struct size mismatch"); +static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id = 0) { + int value = 0; + if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) { + return 0; + } + return value; +} + // sm_80+ required 
__global__ void pipeline_kernel_async(const Data* __restrict__ global, @@ -174,8 +182,11 @@ for (size_t b = 0; b < num_blocks; b++) { } double avg_cycles = static_cast(total_cycles) / num_blocks; +int sm_clock_khz = getDeviceAttributeOrZero(cudaDevAttrClockRate); +int memory_clock_khz = getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate); + // GPU frequency (kHz → Hz) -double gpu_clock_hz = static_cast(prop.clockRate) * 1000.0; +double gpu_clock_hz = static_cast(sm_clock_khz) * 1000.0; // Time in seconds double time_sec = avg_cycles / gpu_clock_hz; @@ -192,8 +203,8 @@ double bw_gbs = bytes_moved / time_sec / 1e9; double bytesclk = bytes_moved / avg_cycles; // Bytes per GPU cycle std::cout << "---------------------------------\n"; -std::cout << "SM Clock = " << prop.clockRate / 1000.0 << " MHz\n"; -std::cout << "Memory Clock = " << prop.memoryClockRate / 1000.0 << " MHz\n"; +std::cout << "SM Clock = " << sm_clock_khz / 1000.0 << " MHz\n"; +std::cout << "Memory Clock = " << memory_clock_khz / 1000.0 << " MHz\n"; std::cout << "Avg cycles (per block) = " << avg_cycles << "\n"; std::cout << "Time (s) = " << time_sec << "\n"; std::cout << "Bytes moved = " << bytes_moved / (1024.0*1024*1024) << " GB\n"; diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile index 167a71704..ef6ca28da 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile @@ -2,7 +2,7 @@ SRC = deviceQuery.cpp EXE = deviceQuery -INCLUDE := -I../../../../cuda-samples/Common/ +INCLUDE := -I../../../../NVIDIA_CUDA-11.0_Samples/common/inc/ NVCC_FLGAS = include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp index 4d8bb318a..6f0743dbc 100644 --- 
a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp +++ b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp @@ -10,6 +10,14 @@ from CUDA SDK #include #include +static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id) { + int value = 0; + if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) { + return 0; + } + return value; +} + int main(int argc, char **argv) { int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); @@ -32,6 +40,11 @@ int main(int argc, char **argv) { cudaSetDevice(dev); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); + int smClockKHz = getDeviceAttributeOrZero(cudaDevAttrClockRate, dev); + int memoryClockKHz = + getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, dev); + int memoryBusWidthBits = + getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, dev); // device printf(" Device : \"%s\"\n\n", deviceProp.name); @@ -40,7 +53,7 @@ int main(int argc, char **argv) { // core printf(" GPU Max Clock rate : %.0f MHz \n", - deviceProp.clockRate * 1e-3f); + smClockKHz * 1e-3f); printf(" Multiprocessors Count : %d\n", deviceProp.multiProcessorCount); printf(" Maximum number of threads per multiprocessor: %d\n", @@ -81,9 +94,10 @@ int main(int argc, char **argv) { static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f)); printf("%s", msg); printf(" Memory Clock rate : %.0f Mhz\n", - deviceProp.memoryClockRate * 1e-3f); + memoryClockKHz * 1e-3f); printf(" Memory Bus Width : %d bit\n", - deviceProp.memoryBusWidth); + memoryBusWidthBits > 0 ? 
memoryBusWidthBits + : deviceProp.memoryBusWidth); printf(" ////////////////////////// \n"); } diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile index 3b7aa310c..b7dd4fb56 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile @@ -2,7 +2,12 @@ SRC = mbarrier.cu EXE = mbarrier -ARCH?=sm_80 sm_90a sm_100a sm_101 sm_120 +ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),) +ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_110 sm_120 +else +ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_101 sm_120 +endif +ARCH ?= $(ARCH_DEFAULT) # Unset the CUDA_CPPFLAGS which is set based on CUDA version CUDA_CPPFLAGS= diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile index f3c8bcd00..d7a7a9c83 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile @@ -3,7 +3,11 @@ SRC = tma_bulk.cu EXE = tma_bulk # TMA is supported on SM_90a and above -ARCH?=sm_90a sm_100a sm_101 sm_120 +ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),) +ARCH?=sm_90a sm_100a sm_110 sm_120 +else +ARCH?=sm_90a sm_100a sm_101 sm_120 +endif # Unset the CUDA_CPPFLAGS which is set based on CUDA version # but TMA is only supported on SM_90a and above CUDA_CPPFLAGS= diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu index fc885b515..7190422c8 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile index b24b4dac9..f0ceefa9d 100644 --- 
a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile @@ -3,7 +3,11 @@ SRC = tma_tensor.cu EXE = tma_tensor # TMA is supported on SM_90a and above -ARCH?=sm_90a sm_100a sm_101 sm_120 +ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),) +ARCH?=sm_90a sm_100a sm_110 sm_120 +else +ARCH?=sm_90a sm_100a sm_101 sm_120 +endif # Unset the CUDA_CPPFLAGS which is set based on CUDA version # but TMA is only supported on SM_90a and above CUDA_CPPFLAGS= diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu index d9f7879d7..7c4668c0c 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include using barrier = cuda::barrier; namespace ptx = cuda::ptx; diff --git a/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu b/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu index bed9aa524..af7d87203 100755 --- a/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu +++ b/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu @@ -129,7 +129,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh) in, hid); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) { diff --git a/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu b/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu index 800c24d08..3900c99d8 100755 --- a/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu +++ b/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu @@ -211,7 +211,7 @@ int main(int argc, char* argv[]) //Add a and b, store in c euclid<<>>(data, x2, y2, z_d, REC_WINDOW, REC_LENGTH, LATITUDE_POS); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //Copy data from device memory to host memory cudaMemcpy( z, z_d, sizeof(float)*REC_WINDOW, cudaMemcpyDeviceToHost ); diff --git 
a/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu b/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu index feda74cda..81a20e7e7 100755 --- a/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu +++ b/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu @@ -232,7 +232,7 @@ runTest( int argc, char** argv) #endif } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #ifdef TIMER CUT_SAFE_CALL( cutStopTimer( timer_1 )); diff --git a/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu b/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu index 0eca1ae67..f4e8510fe 100755 --- a/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu +++ b/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu @@ -219,7 +219,7 @@ float pgain( long x, Points *points, float z, long int *numcenters, int kmax, bo center_table_d, // in: center index table switch_membership_d // out: changes in membership ); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #ifdef PROFILE double t10 = gettime(); diff --git a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu index a89908230..49485f8db 100755 --- a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu +++ b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu @@ -75,7 +75,7 @@ kernel_gpu_cuda_wrapper(record *records, // INITIAL DRIVER OVERHEAD //====================================================================================================100 - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //====================================================================================================100 // EXECUTION PARAMETERS @@ -223,7 +223,7 @@ kernel_gpu_cuda_wrapper(record *records, offsetD, keysD, ansD); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); checkCUDAError("findK"); time4 = get_time(); diff --git a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu 
b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu index a2893b7b7..6ae78068c 100755 --- a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu +++ b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu @@ -77,7 +77,7 @@ kernel_gpu_cuda_wrapper_2( knode *knodes, // INITIAL DRIVER OVERHEAD //====================================================================================================100 - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //====================================================================================================100 // EXECUTION PARAMETERS @@ -272,7 +272,7 @@ kernel_gpu_cuda_wrapper_2( knode *knodes, endD, ansDStart, ansDLength); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); checkCUDAError("findRangeK"); time4 = get_time(); diff --git a/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu b/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu index f90c41f9b..6ec79c49b 100755 --- a/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu +++ b/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu @@ -127,7 +127,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh) in, hid); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) { diff --git a/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu b/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu index 16571017a..54d251cf4 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu @@ -565,7 +565,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(float)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." 
<< std::endl; @@ -595,7 +595,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // CUT_SAFE_CALL( cutStopTimer(timer) ); sdkStopTimer(&timer); diff --git a/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu b/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu index 94107f183..f36dc03e2 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu @@ -530,7 +530,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(double)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; @@ -577,7 +577,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sdkStopTimer(&timer); std::cout << (sdkGetAverageTimerValue(&timer)/1000.0) / iterations << " seconds per iteration" << std::endl; diff --git a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu index 423f6bcbc..616edaa03 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu @@ -610,7 +610,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(float)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." 
<< std::endl; @@ -664,7 +664,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sdkStopTimer(&timer); std::cout << (sdkGetAverageTimerValue(&timer)/1000.0) / iterations << " seconds per iteration" << std::endl; diff --git a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu index b8cbb403e..cc5cc1d78 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu @@ -622,7 +622,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(double)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; @@ -647,7 +647,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sdkStopTimer(&timer); std::cout << (sdkGetAverageTimerValue(&timer)/1000.0) / iterations << " seconds per iteration" << std::endl; diff --git a/src/cuda/rodinia/3.1/cuda/dwt2d/common.h b/src/cuda/rodinia/3.1/cuda/dwt2d/common.h index f64980425..39c15fab9 100755 --- a/src/cuda/rodinia/3.1/cuda/dwt2d/common.h +++ b/src/cuda/rodinia/3.1/cuda/dwt2d/common.h @@ -54,7 +54,7 @@ } } # define cudaCheckAsyncError( msg ) { \ - cudaThreadSynchronize(); \ + cudaDeviceSynchronize(); \ cudaCheckError( msg ); \ } diff --git a/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h b/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h index 273d82476..61a791601 100755 --- a/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h +++ b/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h @@ -168,7 +168,7 @@ namespace dwt_cuda { /// @return true if there was no error, false otherwise static bool checkLastKernelCall(const char * message) { #if defined(GPU_DWT_TESTING) - return testRunning ? 
true : check(cudaThreadSynchronize(), message); + return testRunning ? true : check(cudaDeviceSynchronize(), message); #else // GPU_DWT_TESTING return true; #endif // GPU_DWT_TESTING @@ -251,4 +251,3 @@ namespace dwt_cuda { #endif // DWT_COMMON_CUDA_H - diff --git a/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu b/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu index 0f3b21546..b80da2205 100755 --- a/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu +++ b/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu @@ -209,6 +209,8 @@ void PrintDeviceProperties(){ memset( &deviceProp, 0, sizeof(deviceProp)); if( cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) { + int clockRateKhz = 0; + cudaDeviceGetAttribute(&clockRateKhz, cudaDevAttrClockRate, nDeviceIdx); printf( "\nDevice Name \t\t - %s ", deviceProp.name ); printf( "\n**************************************"); printf( "\nTotal Global Memory\t\t\t - %lu KB", deviceProp.totalGlobalMem/1024 ); @@ -221,9 +223,9 @@ void PrintDeviceProperties(){ printf( "\nMaximum Thread Dimension (grid) \t - %d %d %d", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2] ); printf( "\nTotal constant memory \t\t\t - %zu bytes", deviceProp.totalConstMem ); printf( "\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor ); - printf( "\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate ); + printf( "\nClock rate \t\t\t\t - %d KHz", clockRateKhz ); printf( "\nTexture Alignment \t\t\t - %zu bytes", deviceProp.textureAlignment ); - printf( "\nDevice Overlap \t\t\t\t - %s", deviceProp. 
deviceOverlap?"Allowed":"Not Allowed" ); + printf( "\nDevice Overlap \t\t\t\t - %s", deviceProp.asyncEngineCount > 0?"Allowed":"Not Allowed" ); printf( "\nNumber of Multi processors \t\t - %d\n\n", deviceProp.multiProcessorCount ); } else @@ -363,9 +365,9 @@ void ForwardSub() gettimeofday(&time_start, NULL); for (t=0; t<(Size-1); t++) { Fan1<<>>(m_cuda,a_cuda,Size,t); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); Fan2<<>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); checkCUDAError("Fan2"); } // end timing kernels @@ -467,4 +469,3 @@ void checkCUDAError(const char *msg) exit(EXIT_FAILURE); } } - diff --git a/src/cuda/rodinia/3.1/cuda/huffman/hist.cu b/src/cuda/rodinia/3.1/cuda/huffman/hist.cu index 97e40f71a..4f37ad55f 100644 --- a/src/cuda/rodinia/3.1/cuda/huffman/hist.cu +++ b/src/cuda/rodinia/3.1/cuda/huffman/hist.cu @@ -66,7 +66,7 @@ int runHisto(char* file, unsigned int* freq, unsigned int memSize, unsigned int cudaDeviceProp prop; ( cudaGetDeviceProperties( &prop, 0 ) ); int blocks = prop.multiProcessorCount; - if(!prop.deviceOverlap) + if(prop.asyncEngineCount == 0) { cout << "No overlaps, so no speedup from streams" << endl; return 0; diff --git a/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu b/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu index f14854b3d..11c2ab840 100755 --- a/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu +++ b/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu @@ -45,7 +45,7 @@ int main(int argc, char* argv[]){ for (int i=1; i maxThreads ) @@ -153,7 +153,7 @@ int main(int argc, char* argv[]) * Execute kernel */ euclid<<< gridDim, threadsPerBlock >>>(d_locations,d_distances,numRecords,lat,lng); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //Copy data from device memory to host memory cudaMemcpy( distances, d_distances, sizeof(float)*numRecords, cudaMemcpyDeviceToHost ); diff --git a/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu 
b/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu index 7a2a69eeb..e693ac5d4 100755 --- a/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu +++ b/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu @@ -237,7 +237,7 @@ runTest( int argc, char** argv) #endif } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #ifdef OUTPUT //Printing output diff --git a/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu b/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu index 0a5379d48..1c95400a1 100755 --- a/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu +++ b/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu @@ -254,7 +254,7 @@ float pgain( long x, Points *points, float z, long int *numcenters, int kmax, bo center_table_d, // in: center index table switch_membership_d // out: changes in membership ); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // error check error = cudaGetLastError(); diff --git a/src/setup_environment b/src/setup_environment index 21b42af72..c2e7c1e10 100755 --- a/src/setup_environment +++ b/src/setup_environment @@ -16,6 +16,7 @@ export CUDA_PATH=$CUDA_INSTALL_PATH export CUDA_VERSION=`nvcc --version | grep release | sed -re 's/.*release ([0-9]+\.[0-9]+).*/\1/'`; export CUDA_VERSION_MAJOR=`nvcc --version | grep release | sed -re 's/.*release ([0-9]+)\..*/\1/'`; export CUDA_VERSION_MINOR=$(nvcc --version | grep release | sed -re 's/.*release [0-9]+\.([0-9]+).*/\1/') +export CUDA_SUPPORTED_GPU_ARCHS=$(nvcc --list-gpu-arch 2>/dev/null || true) export CUDAHOME=$CUDA_INSTALL_PATH export BINDIR=$GPUAPPS_ROOT/bin/$CUDA_VERSION @@ -30,6 +31,14 @@ export NVCC_ADDITIONAL_ARGS="" export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +append_gencode_if_supported() { + local arch="$1" + + if echo "$CUDA_SUPPORTED_GPU_ARCHS" | grep -qx "compute_${arch}"; then + export CUDA_CPPFLAGS="$CUDA_CPPFLAGS -gencode=arch=compute_${arch},code=compute_${arch}" + fi +} + if [ $CUDA_VERSION_MAJOR -eq 4 ]; then export 
CUDA_CPPFLAGS="$CUDA_CPPFLAGS -gencode=arch=compute_10,code=compute_10" export CUDA_CPPFLAGS="$CUDA_CPPFLAGS -gencode=arch=compute_13,code=compute_13" @@ -104,6 +113,15 @@ if [ $CUDA_VERSION_MAJOR -eq 12 ]; then fi +if [ $CUDA_VERSION_MAJOR -ge 13 ]; then + append_gencode_if_supported 75 + append_gencode_if_supported 80 + append_gencode_if_supported 86 + append_gencode_if_supported 89 + append_gencode_if_supported 90 + export CUDA_CPPFLAGS="$CUDA_CPPFLAGS -std=c++17" +fi + # Turn off the gencodes for cuda versions. Above 6 - no 10 support. @@ -132,6 +150,13 @@ if [ $CUDA_VERSION_MAJOR -gt 10 ]; then export MAKE_ARGS="$MAKE_ARGS GENCODE_SM50=" fi +if [ $CUDA_VERSION_MAJOR -ge 13 ]; then + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM60=" + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM61=" + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM62=" + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM70=" +fi + if [ $CUDA_VERSION_MAJOR -lt 5 ]; then export MAKE_ARGS="$MAKE_ARGS GENCODE_SM35=" export MAKE_ARGS="$MAKE_ARGS GENCODE_SM50="