diff --git a/src/Makefile b/src/Makefile index a890d58f7..f3cf86606 100644 --- a/src/Makefile +++ b/src/Makefile @@ -13,6 +13,10 @@ all: pannotia rodinia_2.0-ft proxy-apps microbench rodinia-3.1 ispass-2009 pol endif endif ci: rodinia_2.0-ft rodinia-3.1 GPU_Microbenchmark cutlass_mini cuda_samples +# Keep the repository pinned to the legacy cuda-samples revision for older toolkits, +# but use the upstream CUDA 13.1 samples when building with CUDA 13. +CUDA_SAMPLES_CUDA13_REF := 4f735616ba599fe93cc2c6c85dcb4369260f9643 +CUDA_SAMPLES_PINNED_REF := $(shell git -C .. rev-parse HEAD:src/cuda/cuda-samples 2>/dev/null) accelwattch: accelwattch_validation accelwattch_hw_power accelwattch_ubench accelwattch_validation: rodinia-3.1_accelwattch_validation parboil_accelwattch_validation cutlass cuda_samples-11.0_accelwattch_validation accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw_power @@ -518,8 +522,40 @@ mlperf_training: cuda_samples: mkdir -p $(BINDIR)/$(BINSUBDIR)/ - mkdir -p ./cuda/cuda-samples/build && cd ./cuda/cuda-samples/build && cmake .. 
&& $(MAKE) - find $(GPUAPPS_ROOT)/src/cuda/cuda-samples/build/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ; + set -eu; \ + repo_dir=./cuda/cuda-samples; \ + desired_ref="$(CUDA_SAMPLES_PINNED_REF)"; \ + if [ -z "$$desired_ref" ]; then \ + desired_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \ + fi; \ + source_dir="$$repo_dir"; \ + build_dir="$$repo_dir/build-$(CUDA_VERSION)"; \ + export_dir=; \ + cleanup_cuda_samples() { \ + if [ -n "$$export_dir" ] && [ -d "$$export_dir" ]; then \ + rm -rf "$$export_dir"; \ + fi; \ + }; \ + trap 'cleanup_cuda_samples' EXIT; \ + if [ "$${CUDA_VERSION_MAJOR:-0}" -ge 13 ]; then \ + desired_ref="$(CUDA_SAMPLES_CUDA13_REF)"; \ + fi; \ + original_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \ + if [ "$$original_ref" != "$$desired_ref" ]; then \ + export_dir=./cuda/cuda-samples-export-$(CUDA_VERSION); \ + build_dir="$$repo_dir/build-$(CUDA_VERSION)-compat"; \ + git -C "$$repo_dir" rev-parse --verify "$$desired_ref^{commit}" >/dev/null; \ + rm -rf "$$export_dir"; \ + mkdir -p "$$export_dir"; \ + archive_file="$$export_dir/source.tar"; \ + git -C "$$repo_dir" archive --format=tar "$$desired_ref" > "$$archive_file"; \ + tar -xf "$$archive_file" -C "$$export_dir"; \ + rm -f "$$archive_file"; \ + source_dir="$$export_dir"; \ + fi; \ + cmake -S "$$source_dir" -B "$$build_dir"; \ + cmake --build "$$build_dir"; \ + find "$$build_dir"/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ; pytorch_examples: mkdir -p $(BINDIR)/$(BINSUBDIR)/ @@ -711,10 +747,11 @@ clean_pytorch_examples: rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae clean_cuda_samples: - $(MAKE) clean -C ./cuda/cuda-samples/build + find ./cuda/cuda-samples -maxdepth 1 -type d \( -name 'build' -o -name 'build-*' \) -exec rm -rf {} + + find ./cuda -maxdepth 1 -type d -name 'cuda-samples-export-*' -exec rm -rf {} + clean_huggingface: rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface clean_GPU_Microbenchmark: - $(MAKE) clean -C ./cuda/GPU_Microbenchmark \ No newline 
at end of file + $(MAKE) clean -C ./cuda/GPU_Microbenchmark diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h index 0b0306596..82caee7ea 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h @@ -46,6 +46,15 @@ struct GpuConfig unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs) }; inline GpuConfig config; + +inline int getDeviceAttributeOrZero(cudaDeviceAttr attr, int deviceID) +{ + int value = 0; + if (cudaDeviceGetAttribute(&value, attr, deviceID) != cudaSuccess) + return 0; + return value; +} + // Parses short flags like --sm 80 into a GpuConfig object inline void parseGpuConfigArgs(int argc, char *argv[]) { @@ -280,8 +289,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[]) cudaSetDevice(deviceID); cudaGetDeviceProperties(&deviceProp, deviceID); - int clockRateKHz; - cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID); + int clockRateKHz = + getDeviceAttributeOrZero(cudaDevAttrClockRate, deviceID); + int memoryClockRateKHz = + getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, deviceID); + int memoryBusWidthBits = + getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, deviceID); // core stats config.SM_NUMBER = deviceProp.multiProcessorCount; @@ -310,9 +323,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[]) // memory config.MEM_SIZE = deviceProp.totalGlobalMem; - config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f; - config.MEM_BITWIDTH = deviceProp.memoryBusWidth; - config.CLK_FREQUENCY = clockRateKHz * 1e-3f; + if (memoryClockRateKHz > 0) + config.MEM_CLK_FREQUENCY = memoryClockRateKHz * 1e-3f; + if (memoryBusWidthBits > 0) + config.MEM_BITWIDTH = memoryBusWidthBits; + if (clockRateKHz > 0) + config.CLK_FREQUENCY = clockRateKHz * 1e-3f; // Get FBP_COUNT and L2_BANKS from NVIDIA RM API config.FBP_COUNT = 
queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS); diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu index 088366980..beb5e39f3 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu @@ -12,9 +12,8 @@ int main(int argc, char *argv[]) snprintf(msg, sizeof(msg), "Global memory size = %.0f GB\n", static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f)); std::cout << msg; - std::cout << "Memory Clock rate = " << deviceProp.memoryClockRate * 1e-3f - << " Mhz\n"; - std::cout << "Memory Bus Width = " << deviceProp.memoryBusWidth << " bit\n"; + std::cout << "Memory Clock rate = " << config.MEM_CLK_FREQUENCY << " Mhz\n"; + std::cout << "Memory Bus Width = " << config.MEM_BITWIDTH << " bit\n"; std::cout << "Memory type = " << dram_model_str[DRAM_MODEL] << "\n"; std::cout << "Memory channels = " << config.FBP_COUNT << "\n"; @@ -38,7 +37,7 @@ int main(int argc, char *argv[]) << dram_model_freq_ratio[DRAM_MODEL] << std::endl; // timing - float device_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) / + float device_freq_MHZ = (config.MEM_CLK_FREQUENCY * 2) / dram_model_freq_ratio[DRAM_MODEL]; if (DRAM_MODEL == dram_model::HBM) { diff --git a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu index 6ff204623..a858a8ba7 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu @@ -16,6 +16,14 @@ struct alignas(DATA_SIZE) Data { }; static_assert(sizeof(Data) == DATA_SIZE, "Data struct size mismatch"); +static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id = 0) { + int value = 0; + if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) { + return 0; + } + return value; +} + // sm_80+ required 
__global__ void pipeline_kernel_async(const Data* __restrict__ global, @@ -174,8 +182,11 @@ for (size_t b = 0; b < num_blocks; b++) { } double avg_cycles = static_cast(total_cycles) / num_blocks; +int sm_clock_khz = getDeviceAttributeOrZero(cudaDevAttrClockRate); +int memory_clock_khz = getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate); + // GPU frequency (kHz → Hz) -double gpu_clock_hz = static_cast(prop.clockRate) * 1000.0; +double gpu_clock_hz = static_cast(sm_clock_khz) * 1000.0; // Time in seconds double time_sec = avg_cycles / gpu_clock_hz; @@ -192,8 +203,8 @@ double bw_gbs = bytes_moved / time_sec / 1e9; double bytesclk = bytes_moved / avg_cycles; // Bytes per GPU cycle std::cout << "---------------------------------\n"; -std::cout << "SM Clock = " << prop.clockRate / 1000.0 << " MHz\n"; -std::cout << "Memory Clock = " << prop.memoryClockRate / 1000.0 << " MHz\n"; +std::cout << "SM Clock = " << sm_clock_khz / 1000.0 << " MHz\n"; +std::cout << "Memory Clock = " << memory_clock_khz / 1000.0 << " MHz\n"; std::cout << "Avg cycles (per block) = " << avg_cycles << "\n"; std::cout << "Time (s) = " << time_sec << "\n"; std::cout << "Bytes moved = " << bytes_moved / (1024.0*1024*1024) << " GB\n"; diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile index 167a71704..ef6ca28da 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile @@ -2,7 +2,7 @@ SRC = deviceQuery.cpp EXE = deviceQuery -INCLUDE := -I../../../../cuda-samples/Common/ +INCLUDE := -I../../../../NVIDIA_CUDA-11.0_Samples/common/inc/ NVCC_FLGAS = include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp index 4d8bb318a..6f0743dbc 100644 --- 
a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp +++ b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp @@ -10,6 +10,14 @@ from CUDA SDK #include #include +static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id) { + int value = 0; + if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) { + return 0; + } + return value; +} + int main(int argc, char **argv) { int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); @@ -32,6 +40,11 @@ int main(int argc, char **argv) { cudaSetDevice(dev); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); + int smClockKHz = getDeviceAttributeOrZero(cudaDevAttrClockRate, dev); + int memoryClockKHz = + getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, dev); + int memoryBusWidthBits = + getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, dev); // device printf(" Device : \"%s\"\n\n", deviceProp.name); @@ -40,7 +53,7 @@ int main(int argc, char **argv) { // core printf(" GPU Max Clock rate : %.0f MHz \n", - deviceProp.clockRate * 1e-3f); + smClockKHz * 1e-3f); printf(" Multiprocessors Count : %d\n", deviceProp.multiProcessorCount); printf(" Maximum number of threads per multiprocessor: %d\n", @@ -81,9 +94,10 @@ int main(int argc, char **argv) { static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f)); printf("%s", msg); printf(" Memory Clock rate : %.0f Mhz\n", - deviceProp.memoryClockRate * 1e-3f); + memoryClockKHz * 1e-3f); printf(" Memory Bus Width : %d bit\n", - deviceProp.memoryBusWidth); + memoryBusWidthBits > 0 ? 
memoryBusWidthBits + : deviceProp.memoryBusWidth); printf(" ////////////////////////// \n"); } diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile index 3b7aa310c..b7dd4fb56 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile @@ -2,7 +2,12 @@ SRC = mbarrier.cu EXE = mbarrier -ARCH?=sm_80 sm_90a sm_100a sm_101 sm_120 +ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),) +ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_110 sm_120 +else +ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_101 sm_120 +endif +ARCH ?= $(ARCH_DEFAULT) # Unset the CUDA_CPPFLAGS which is set based on CUDA version CUDA_CPPFLAGS= diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile index f3c8bcd00..d7a7a9c83 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile @@ -3,7 +3,11 @@ SRC = tma_bulk.cu EXE = tma_bulk # TMA is supported on SM_90a and above -ARCH?=sm_90a sm_100a sm_101 sm_120 +ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),) +ARCH?=sm_90a sm_100a sm_110 sm_120 +else +ARCH?=sm_90a sm_100a sm_101 sm_120 +endif # Unset the CUDA_CPPFLAGS which is set based on CUDA version # but TMA is only supported on SM_90a and above CUDA_CPPFLAGS= diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu index fc885b515..7190422c8 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile index b24b4dac9..f0ceefa9d 100644 --- 
a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile @@ -3,7 +3,11 @@ SRC = tma_tensor.cu EXE = tma_tensor # TMA is supported on SM_90a and above -ARCH?=sm_90a sm_100a sm_101 sm_120 +ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),) +ARCH?=sm_90a sm_100a sm_110 sm_120 +else +ARCH?=sm_90a sm_100a sm_101 sm_120 +endif # Unset the CUDA_CPPFLAGS which is set based on CUDA version # but TMA is only supported on SM_90a and above CUDA_CPPFLAGS= diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu index d9f7879d7..7c4668c0c 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include using barrier = cuda::barrier; namespace ptx = cuda::ptx; diff --git a/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu b/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu index bed9aa524..af7d87203 100755 --- a/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu +++ b/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu @@ -129,7 +129,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh) in, hid); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) { diff --git a/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu b/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu index 800c24d08..3900c99d8 100755 --- a/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu +++ b/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu @@ -211,7 +211,7 @@ int main(int argc, char* argv[]) //Add a and b, store in c euclid<<>>(data, x2, y2, z_d, REC_WINDOW, REC_LENGTH, LATITUDE_POS); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //Copy data from device memory to host memory cudaMemcpy( z, z_d, sizeof(float)*REC_WINDOW, cudaMemcpyDeviceToHost ); diff --git 
a/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu b/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu index feda74cda..81a20e7e7 100755 --- a/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu +++ b/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu @@ -232,7 +232,7 @@ runTest( int argc, char** argv) #endif } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #ifdef TIMER CUT_SAFE_CALL( cutStopTimer( timer_1 )); diff --git a/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu b/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu index 0eca1ae67..f4e8510fe 100755 --- a/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu +++ b/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu @@ -219,7 +219,7 @@ float pgain( long x, Points *points, float z, long int *numcenters, int kmax, bo center_table_d, // in: center index table switch_membership_d // out: changes in membership ); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #ifdef PROFILE double t10 = gettime(); diff --git a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu index a89908230..49485f8db 100755 --- a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu +++ b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu @@ -75,7 +75,7 @@ kernel_gpu_cuda_wrapper(record *records, // INITIAL DRIVER OVERHEAD //====================================================================================================100 - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //====================================================================================================100 // EXECUTION PARAMETERS @@ -223,7 +223,7 @@ kernel_gpu_cuda_wrapper(record *records, offsetD, keysD, ansD); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); checkCUDAError("findK"); time4 = get_time(); diff --git a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu 
b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu index a2893b7b7..6ae78068c 100755 --- a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu +++ b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu @@ -77,7 +77,7 @@ kernel_gpu_cuda_wrapper_2( knode *knodes, // INITIAL DRIVER OVERHEAD //====================================================================================================100 - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //====================================================================================================100 // EXECUTION PARAMETERS @@ -272,7 +272,7 @@ kernel_gpu_cuda_wrapper_2( knode *knodes, endD, ansDStart, ansDLength); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); checkCUDAError("findRangeK"); time4 = get_time(); diff --git a/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu b/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu index f90c41f9b..6ec79c49b 100755 --- a/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu +++ b/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu @@ -127,7 +127,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh) in, hid); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) { diff --git a/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu b/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu index 16571017a..54d251cf4 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/euler3d.cu @@ -565,7 +565,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(float)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." 
<< std::endl; @@ -595,7 +595,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // CUT_SAFE_CALL( cutStopTimer(timer) ); sdkStopTimer(&timer); diff --git a/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu b/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu index 94107f183..f36dc03e2 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/euler3d_double.cu @@ -530,7 +530,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(double)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; @@ -577,7 +577,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sdkStopTimer(&timer); std::cout << (sdkGetAverageTimerValue(&timer)/1000.0) / iterations << " seconds per iteration" << std::endl; diff --git a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu index 423f6bcbc..616edaa03 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d.cu @@ -610,7 +610,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(float)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." 
<< std::endl; @@ -664,7 +664,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sdkStopTimer(&timer); std::cout << (sdkGetAverageTimerValue(&timer)/1000.0) / iterations << " seconds per iteration" << std::endl; diff --git a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu index b8cbb403e..cc5cc1d78 100755 --- a/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu +++ b/src/cuda/rodinia/3.1/cuda/cfd/pre_euler3d_double.cu @@ -622,7 +622,7 @@ int main(int argc, char** argv) initialize_variables(nelr, fluxes); cudaMemset( (void*) step_factors, 0, sizeof(double)*nelr ); // make sure CUDA isn't still doing something before we start timing - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; @@ -647,7 +647,7 @@ int main(int argc, char** argv) } } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sdkStopTimer(&timer); std::cout << (sdkGetAverageTimerValue(&timer)/1000.0) / iterations << " seconds per iteration" << std::endl; diff --git a/src/cuda/rodinia/3.1/cuda/dwt2d/common.h b/src/cuda/rodinia/3.1/cuda/dwt2d/common.h index f64980425..39c15fab9 100755 --- a/src/cuda/rodinia/3.1/cuda/dwt2d/common.h +++ b/src/cuda/rodinia/3.1/cuda/dwt2d/common.h @@ -54,7 +54,7 @@ } } # define cudaCheckAsyncError( msg ) { \ - cudaThreadSynchronize(); \ + cudaDeviceSynchronize(); \ cudaCheckError( msg ); \ } diff --git a/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h b/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h index 273d82476..61a791601 100755 --- a/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h +++ b/src/cuda/rodinia/3.1/cuda/dwt2d/dwt_cuda/common.h @@ -168,7 +168,7 @@ namespace dwt_cuda { /// @return true if there was no error, false otherwise static bool checkLastKernelCall(const char * message) { #if defined(GPU_DWT_TESTING) - return testRunning ? 
true : check(cudaThreadSynchronize(), message); + return testRunning ? true : check(cudaDeviceSynchronize(), message); #else // GPU_DWT_TESTING return true; #endif // GPU_DWT_TESTING @@ -251,4 +251,3 @@ namespace dwt_cuda { #endif // DWT_COMMON_CUDA_H - diff --git a/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu b/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu index 0f3b21546..b80da2205 100755 --- a/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu +++ b/src/cuda/rodinia/3.1/cuda/gaussian/gaussian.cu @@ -209,6 +209,8 @@ void PrintDeviceProperties(){ memset( &deviceProp, 0, sizeof(deviceProp)); if( cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) { + int clockRateKhz = 0; + cudaDeviceGetAttribute(&clockRateKhz, cudaDevAttrClockRate, nDeviceIdx); printf( "\nDevice Name \t\t - %s ", deviceProp.name ); printf( "\n**************************************"); printf( "\nTotal Global Memory\t\t\t - %lu KB", deviceProp.totalGlobalMem/1024 ); @@ -221,9 +223,9 @@ void PrintDeviceProperties(){ printf( "\nMaximum Thread Dimension (grid) \t - %d %d %d", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2] ); printf( "\nTotal constant memory \t\t\t - %zu bytes", deviceProp.totalConstMem ); printf( "\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor ); - printf( "\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate ); + printf( "\nClock rate \t\t\t\t - %d KHz", clockRateKhz ); printf( "\nTexture Alignment \t\t\t - %zu bytes", deviceProp.textureAlignment ); - printf( "\nDevice Overlap \t\t\t\t - %s", deviceProp. 
deviceOverlap?"Allowed":"Not Allowed" ); + printf( "\nDevice Overlap \t\t\t\t - %s", deviceProp.asyncEngineCount > 0?"Allowed":"Not Allowed" ); printf( "\nNumber of Multi processors \t\t - %d\n\n", deviceProp.multiProcessorCount ); } else @@ -363,9 +365,9 @@ void ForwardSub() gettimeofday(&time_start, NULL); for (t=0; t<(Size-1); t++) { Fan1<<>>(m_cuda,a_cuda,Size,t); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); Fan2<<>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); checkCUDAError("Fan2"); } // end timing kernels @@ -467,4 +469,3 @@ void checkCUDAError(const char *msg) exit(EXIT_FAILURE); } } - diff --git a/src/cuda/rodinia/3.1/cuda/huffman/hist.cu b/src/cuda/rodinia/3.1/cuda/huffman/hist.cu index 97e40f71a..4f37ad55f 100644 --- a/src/cuda/rodinia/3.1/cuda/huffman/hist.cu +++ b/src/cuda/rodinia/3.1/cuda/huffman/hist.cu @@ -66,7 +66,7 @@ int runHisto(char* file, unsigned int* freq, unsigned int memSize, unsigned int cudaDeviceProp prop; ( cudaGetDeviceProperties( &prop, 0 ) ); int blocks = prop.multiProcessorCount; - if(!prop.deviceOverlap) + if(prop.asyncEngineCount == 0) { cout << "No overlaps, so no speedup from streams" << endl; return 0; diff --git a/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu b/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu index f14854b3d..11c2ab840 100755 --- a/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu +++ b/src/cuda/rodinia/3.1/cuda/huffman/main_test_cu.cu @@ -45,7 +45,7 @@ int main(int argc, char* argv[]){ for (int i=1; i maxThreads ) @@ -153,7 +153,7 @@ int main(int argc, char* argv[]) * Execute kernel */ euclid<<< gridDim, threadsPerBlock >>>(d_locations,d_distances,numRecords,lat,lng); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //Copy data from device memory to host memory cudaMemcpy( distances, d_distances, sizeof(float)*numRecords, cudaMemcpyDeviceToHost ); diff --git a/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu 
b/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu index 7a2a69eeb..e693ac5d4 100755 --- a/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu +++ b/src/cuda/rodinia/3.1/cuda/srad/srad_v2/srad.cu @@ -237,7 +237,7 @@ runTest( int argc, char** argv) #endif } - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #ifdef OUTPUT //Printing output diff --git a/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu b/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu index 0a5379d48..1c95400a1 100755 --- a/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu +++ b/src/cuda/rodinia/3.1/cuda/streamcluster/streamcluster_cuda.cu @@ -254,7 +254,7 @@ float pgain( long x, Points *points, float z, long int *numcenters, int kmax, bo center_table_d, // in: center index table switch_membership_d // out: changes in membership ); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); // error check error = cudaGetLastError(); diff --git a/src/setup_environment b/src/setup_environment index 21b42af72..c2e7c1e10 100755 --- a/src/setup_environment +++ b/src/setup_environment @@ -16,6 +16,7 @@ export CUDA_PATH=$CUDA_INSTALL_PATH export CUDA_VERSION=`nvcc --version | grep release | sed -re 's/.*release ([0-9]+\.[0-9]+).*/\1/'`; export CUDA_VERSION_MAJOR=`nvcc --version | grep release | sed -re 's/.*release ([0-9]+)\..*/\1/'`; export CUDA_VERSION_MINOR=$(nvcc --version | grep release | sed -re 's/.*release [0-9]+\.([0-9]+).*/\1/') +export CUDA_SUPPORTED_GPU_ARCHS=$(nvcc --list-gpu-arch 2>/dev/null || true) export CUDAHOME=$CUDA_INSTALL_PATH export BINDIR=$GPUAPPS_ROOT/bin/$CUDA_VERSION @@ -30,6 +31,14 @@ export NVCC_ADDITIONAL_ARGS="" export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +append_gencode_if_supported() { + local arch="$1" + + if echo "$CUDA_SUPPORTED_GPU_ARCHS" | grep -qx "compute_${arch}"; then + export CUDA_CPPFLAGS="$CUDA_CPPFLAGS -gencode=arch=compute_${arch},code=compute_${arch}" + fi +} + if [ $CUDA_VERSION_MAJOR -eq 4 ]; then export 
CUDA_CPPFLAGS="$CUDA_CPPFLAGS -gencode=arch=compute_10,code=compute_10" export CUDA_CPPFLAGS="$CUDA_CPPFLAGS -gencode=arch=compute_13,code=compute_13" @@ -104,6 +113,15 @@ if [ $CUDA_VERSION_MAJOR -eq 12 ]; then fi +if [ $CUDA_VERSION_MAJOR -ge 13 ]; then + append_gencode_if_supported 75 + append_gencode_if_supported 80 + append_gencode_if_supported 86 + append_gencode_if_supported 89 + append_gencode_if_supported 90 + export CUDA_CPPFLAGS="$CUDA_CPPFLAGS -std=c++17" +fi + # Turn off the gencodes for cuda versions. Above 6 - no 10 support. @@ -132,6 +150,13 @@ if [ $CUDA_VERSION_MAJOR -gt 10 ]; then export MAKE_ARGS="$MAKE_ARGS GENCODE_SM50=" fi +if [ $CUDA_VERSION_MAJOR -ge 13 ]; then + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM60=" + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM61=" + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM62=" + export MAKE_ARGS="$MAKE_ARGS GENCODE_SM70=" +fi + if [ $CUDA_VERSION_MAJOR -lt 5 ]; then export MAKE_ARGS="$MAKE_ARGS GENCODE_SM35=" export MAKE_ARGS="$MAKE_ARGS GENCODE_SM50="