-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhello.c
More file actions
249 lines (232 loc) · 15.2 KB
/
hello.c
File metadata and controls
249 lines (232 loc) · 15.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
// gcc -g -o hello.exe hello.c -lcuda -lcudart -L"/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/lib/x64" && ./hello.exe
#include <stdio.h>
#include <stdlib.h>
// TODO: debug this code and see if we can see more about what's happening
// maybe we can output cubins
/*
 * Minimal stand-ins for the CUDA handle types, declared here so the file can
 * be built without including the CUDA headers.
 */
/* Device handle, passed to and returned from the calls as a plain int. */
typedef int CUdevice;
/* Context, module and function handles: pointers to structs that are never
 * defined in this file, so they can only be passed around, not dereferenced. */
typedef struct CUctx_st* CUcontext;
typedef struct CUmod_st* CUmodule;
typedef struct CUfunc_st* CUfunction;
/* Device address held in a 64-bit unsigned integer. */
typedef unsigned long long CUdeviceptr;
/* Return type used by every CUDA prototype in this file; main treats 0 as success. */
typedef int i32;
/*
 * Local copy of the CUDA runtime's device-properties struct, so that
 * cudaGetDeviceProperties() can be called without the CUDA headers.
 * The callee fills this struct through a pointer, so field order, field types
 * and total size must match the layout the linked runtime library expects.
 * NOTE(review): presumably copied from the CUDA 12.x driver_types.h (the build
 * command targets v12.9) - re-verify the layout when changing toolkit versions.
 */
typedef struct cudaDeviceProp {
char name[256]; /**< ASCII string identifying device */
char uuid[16]; /**< 16-byte unique identifier */
char luid[8]; /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
unsigned int luidDeviceNodeMask; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
size_t totalGlobalMem; /**< Global memory available on device in bytes */
size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */
int regsPerBlock; /**< 32-bit registers available per block */
int warpSize; /**< Warp size in threads */
size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */
int maxThreadsPerBlock; /**< Maximum number of threads per block */
int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
int clockRate; /**< Deprecated, Clock frequency in kilohertz */
size_t totalConstMem; /**< Constant memory available on device in bytes */
int major; /**< Major compute capability */
int minor; /**< Minor compute capability */
size_t textureAlignment; /**< Alignment requirement for textures */
size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */
int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
int multiProcessorCount; /**< Number of multiprocessors on device */
int kernelExecTimeoutEnabled; /**< Deprecated, Specified whether there is a run time limit on kernels */
int integrated; /**< Device is integrated as opposed to discrete */
int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
int computeMode; /**< Deprecated, Compute mode (See ::cudaComputeMode) */
int maxTexture1D; /**< Maximum 1D texture size */
int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */
int maxTexture1DLinear; /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
int maxTexture2D[2]; /**< Maximum 2D texture dimensions */
int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */
int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
int maxTexture3D[3]; /**< Maximum 3D texture dimensions */
int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */
int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */
int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */
int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */
int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
int maxSurface1D; /**< Maximum 1D surface size */
int maxSurface2D[2]; /**< Maximum 2D surface dimensions */
int maxSurface3D[3]; /**< Maximum 3D surface dimensions */
int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */
int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */
int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */
int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
size_t surfaceAlignment; /**< Alignment requirements for surfaces */
int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */
int ECCEnabled; /**< Device has ECC support enabled */
int pciBusID; /**< PCI bus ID of the device */
int pciDeviceID; /**< PCI device ID of the device */
int pciDomainID; /**< PCI domain ID of the device */
int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
int asyncEngineCount; /**< Number of asynchronous engines */
int unifiedAddressing; /**< Device shares a unified address space with the host */
int memoryClockRate; /**< Deprecated, Peak memory clock frequency in kilohertz */
int memoryBusWidth; /**< Global memory bus width in bits */
int l2CacheSize; /**< Size of L2 cache in bytes */
int persistingL2CacheMaxSize; /**< Device's maximum l2 persisting lines capacity setting in bytes */
int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
int streamPrioritiesSupported; /**< Device supports stream priorities */
int globalL1CacheSupported; /**< Device supports caching globals in L1 */
int localL1CacheSupported; /**< Device supports caching locals in L1 */
size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */
int managedMemory; /**< Device supports allocating managed memory on this system */
int isMultiGpuBoard; /**< Device is on a multi-GPU board */
int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */
int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */
int singleToDoublePrecisionPerfRatio; /**< Deprecated, Ratio of single precision performance (in floating-point operations per second) to double precision performance */
int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */
int computePreemptionSupported; /**< Device supports Compute Preemption */
int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */
int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
int maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
int accessPolicyMaxWindowSize; /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */
int hostRegisterSupported; /**< Device supports host memory registration via ::cudaHostRegister. */
int sparseCudaArraySupported; /**< 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays, 0 otherwise */
int hostRegisterReadOnlySupported; /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
int timelineSemaphoreInteropSupported; /**< External timeline semaphore interop is supported on the device */
int memoryPoolsSupported; /**< 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, 0 otherwise */
int gpuDirectRDMASupported; /**< 1 if the device supports GPUDirect RDMA APIs, 0 otherwise */
unsigned int gpuDirectRDMAFlushWritesOptions; /**< Bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum */
int gpuDirectRDMAWritesOrdering;/**< See the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values */
unsigned int memoryPoolSupportedHandleTypes; /**< Bitmask of handle types supported with mempool-based IPC */
int deferredMappingCudaArraySupported; /**< 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
int ipcEventSupported; /**< Device supports IPC Events. */
int clusterLaunch; /**< Indicates device supports cluster launch */
int unifiedFunctionPointers; /**< Indicates device supports unified pointers */
int reserved[63]; /**< Reserved for future use */
} cudaDeviceProp;
/*
 * Write a fixed selection of device properties to stdout, one labelled value
 * per line, then the 16 UUID bytes as lowercase hex and a trailing blank line.
 *
 * prop: property record to print. It is received by value, so nothing is
 *       written back to the caller's copy.
 */
void printCudaDeviceProperties(cudaDeviceProp prop) {
    const cudaDeviceProp* p = &prop;
    const int* block_dim = p->maxThreadsDim;
    const int* grid_dim = p->maxGridSize;
    const unsigned char* uuid_bytes = (const unsigned char*)p->uuid;
    size_t k;

    /* Capability and memory sizes. */
    printf(" Compute Capability: %d.%d\n", p->major, p->minor);
    printf(" Total Global Memory: %zu bytes\n", p->totalGlobalMem);
    printf(" Shared Memory per Block: %zu bytes\n", p->sharedMemPerBlock);
    printf(" Shared Memory per Multiprocessor: %zu bytes\n", p->sharedMemPerMultiprocessor);

    /* Register and thread limits. */
    printf(" Registers per Block: %d\n", p->regsPerBlock);
    printf(" Registers per Multiprocessor: %d\n", p->regsPerMultiprocessor);
    printf(" Warp Size: %d\n", p->warpSize);
    printf(" Max Threads per Block: %d\n", p->maxThreadsPerBlock);
    printf(" Max Threads per Multiprocessor: %d\n", p->maxThreadsPerMultiProcessor);
    printf(" Max Blocks per Multiprocessor: %d\n", p->maxBlocksPerMultiProcessor);
    printf(" Max Threads Dimensions: [%d, %d, %d]\n",
           block_dim[0], block_dim[1], block_dim[2]);
    printf(" Max Grid Size: [%d, %d, %d]\n",
           grid_dim[0], grid_dim[1], grid_dim[2]);

    /* Clocks, bus and cache. */
    printf(" Clock Rate: %d kHz\n", p->clockRate);
    printf(" Memory Clock Rate: %d kHz\n", p->memoryClockRate);
    printf(" Memory Bus Width: %d bits\n", p->memoryBusWidth);
    printf(" L2 Cache Size: %d bytes\n", p->l2CacheSize);
    printf(" Multiprocessor Count: %d\n", p->multiProcessorCount);

    /* Feature flags. */
    printf(" Compute Preemption Supported: %d\n", p->computePreemptionSupported);
    printf(" Concurrent Kernels: %d\n", p->concurrentKernels);
    printf(" Concurrent Managed Access: %d\n", p->concurrentManagedAccess);
    printf(" Unified Addressing: %d\n", p->unifiedAddressing);
    printf(" Async Engine Count: %d\n", p->asyncEngineCount);
    printf(" Can Map Host Memory: %d\n", p->canMapHostMemory);

    /* PCI location and driver mode. */
    printf(" PCI Bus ID: %d\n", p->pciBusID);
    printf(" PCI Device ID: %d\n", p->pciDeviceID);
    printf(" PCI Domain ID: %d\n", p->pciDomainID);
    printf(" TCC Driver: %d\n", p->tccDriver);

    /* UUID bytes are read as unsigned so values >= 0x80 print as two digits. */
    printf(" UUID: ");
    for (k = 0; k < sizeof p->uuid; ++k) {
        printf("%02x", uuid_bytes[k]);
    }
    printf("\n");
    printf("\n");
}
/*
 * Hand-written prototypes for the CUDA driver API calls (and one runtime API
 * call) used by this file, so it can be built without the CUDA headers.
 * Every function returns a status code; callers in this file treat 0 as
 * success.
 *
 * Several driver entry points exist in two ABI generations. The un-suffixed
 * exported symbols are the legacy ones, which use 32-bit device pointers and
 * sizes; the "_v2" symbols take the 64-bit CUdeviceptr / size_t arguments
 * declared below. cuda.h redirects the plain names to the _v2 symbols with
 * macros, and the same is done here so that call sites keep using the plain
 * names while linking against the matching ABI.
 *
 * NOTE(review): the CUDA 12 runtime headers are believed to redirect
 * cudaGetDeviceProperties to a versioned symbol as well - confirm against
 * cuda_runtime_api.h before relying on the trailing cudaDeviceProp fields.
 */
#define cuCtxCreate  cuCtxCreate_v2
#define cuMemAlloc   cuMemAlloc_v2
#define cuMemcpyHtoD cuMemcpyHtoD_v2
#define cuMemcpyDtoH cuMemcpyDtoH_v2
#define cuMemFree    cuMemFree_v2

/* Stream handle; a null value selects the default stream. */
typedef struct CUstream_st* CUstream;

/* Initialization, device and context. */
extern i32 cuInit(unsigned int Flags);
extern i32 cuDeviceGet(CUdevice* device, int ordinal);
extern i32 cuCtxCreate(CUcontext* pctx, unsigned int flags, CUdevice dev);

/* Module loading (from a file or from an in-memory image) and kernel lookup. */
extern i32 cuModuleLoad(CUmodule* module, const char* fname);
extern i32 cuModuleLoadData(CUmodule* module, const void* image);
extern i32 cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name);

/* Device memory allocation and host<->device copies. */
extern i32 cuMemAlloc(CUdeviceptr* dptr, size_t bytesize);
extern i32 cuMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount);
extern i32 cuMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
extern i32 cuMemFree(CUdeviceptr dptr);

/* Kernel launch: grid dimensions, block dimensions, dynamic shared memory
 * size, stream, then the kernel arguments either as an array of pointers
 * (kernelParams) or through the packed `extra` mechanism. */
extern i32 cuLaunchKernel(CUfunction f,
                          unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
                          unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
                          unsigned int sharedMemBytes,
                          CUstream hStream,
                          void** kernelParams,
                          void** extra);

/* Runtime API: fills *prop for the device with the given ordinal. */
extern i32 cudaGetDeviceProperties(cudaDeviceProp* prop, int device);
// Hardcoded PTX source for a simple kernel, "add_one".
// The kernel takes one 64-bit parameter (a base address), offsets it by
// %tid.x * 4 bytes, loads the 32-bit value there, adds 1 and stores it back.
// There is no bounds check, so the launch must not use more threads in x than
// the buffer has 32-bit elements.
// Not referenced by any active code in this file.
const char* ptx_code =
".version 6.0\n"
".target sm_30\n"
".address_size 64\n"
"\n"
".visible .entry add_one(\n"
" .param .u64 _param_data\n"
")\n"
"{\n"
" .reg .u32 %r<3>;\n"
" .reg .u64 %rd<3>;\n"
" ld.param.u64 %rd1, [_param_data];\n"
" mov.u32 %r1, %tid.x;\n"
" mul.wide.u32 %rd2, %r1, 4;\n"
" add.u64 %rd1, %rd1, %rd2;\n"
" ld.global.u32 %r2, [%rd1];\n"
" add.u32 %r2, %r2, 1;\n"
" st.global.u32 [%rd1], %r2;\n"
" ret;\n"
"}\n";
/*
 * Read an entire file (e.g. a cubin image) into a freshly allocated buffer.
 *
 * filename: path of the file; it is opened in binary mode.
 * size_out: if non-NULL, receives the file length in bytes on success. It is
 *           left untouched when the function fails.
 *
 * Returns a malloc'd buffer holding the file contents, which the caller must
 * release with free(). Returns NULL if the file cannot be opened, its length
 * cannot be determined, memory cannot be allocated, or fewer bytes than the
 * reported length could be read. An empty file yields a non-NULL buffer with
 * *size_out set to 0.
 */
void* load_file(const char* filename, size_t* size_out) {
    FILE* f = fopen(filename, "rb");
    if (!f) return NULL;

    void* data = NULL;
    long length = -1;

    /* ftell reports failure as -1; keep it signed until it has been checked. */
    if (fseek(f, 0, SEEK_END) == 0) {
        length = ftell(f);
    }

    if (length >= 0 && fseek(f, 0, SEEK_SET) == 0) {
        size_t size = (size_t)length;

        /* malloc(0) may legally return NULL; request at least one byte so
         * that NULL always means failure. */
        data = malloc(size ? size : 1);

        /* A short read means the buffer is not a complete image: discard it. */
        if (data && fread(data, 1, size, f) != size) {
            free(data);
            data = NULL;
        }

        if (data && size_out) *size_out = size;
    }

    fclose(f);
    return data;
}
/* Report a failed CUDA call on stderr. Returns nonzero when `res` is an error
 * (any value other than 0), zero otherwise. */
static int cuda_failed(i32 res, const char* what) {
    if (res == 0) return 0;
    fprintf(stderr, "%s failed (error %d)\n", what, res);
    return 1;
}

/*
 * Demo driver: creates a context on device 0, prints its properties, loads
 * the "add_one" kernel from hello.cubin, runs it over a four-element int
 * array and prints the resulting values.
 *
 * Returns 0 on success and 1 if any step fails; the failing step is named on
 * stderr.
 *
 * NOTE: the context and module are not released explicitly - no
 * destroy/unload entry points are declared in this file.
 */
int main(void) {
    CUdevice dev;
    CUcontext ctx;
    CUmodule mod;
    CUfunction func;

    if (cuda_failed(cuInit(0), "cuInit")) return 1;
    if (cuda_failed(cuDeviceGet(&dev, 0), "cuDeviceGet")) return 1;
    if (cuda_failed(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate")) return 1;

    /* The property dump is informational only: if the query fails, skip it
     * rather than print an uninitialized struct, and carry on. */
    cudaDeviceProp prop;
    if (!cuda_failed(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties")) {
        printCudaDeviceProperties(prop);
    }

    size_t cubin_size = 0;
    void* cubin_data = load_file("hello.cubin", &cubin_size);
    if (!cubin_data || cubin_size == 0) {
        fprintf(stderr, "Failed to read hello.cubin\n");
        free(cubin_data);
        return 1;
    }

    /* Alternatives: cuModuleLoadData(&mod, ptx_code) to JIT the embedded PTX,
     * or cuModuleLoad(&mod, "hello.cubin") to let the driver read the file. */
    i32 res = cuModuleLoadData(&mod, cubin_data);
    /* NOTE(review): assumes the driver keeps its own copy of the image once
     * the load call returns, so the host buffer can be released here. */
    free(cubin_data);
    if (res != 0) {
        fprintf(stderr, "Failed to load module from hello.cubin (error %d)\n", res);
        return 1;
    }

    res = cuModuleGetFunction(&func, mod, "add_one");
    if (res != 0) {
        fprintf(stderr, "Failed to get kernel function (error %d)\n", res);
        return 1;
    }

    int data[4] = {1, 2, 3, 4};
    const int count = (int)(sizeof(data) / sizeof(data[0]));

    CUdeviceptr dev_data = 0;
    if (cuda_failed(cuMemAlloc(&dev_data, sizeof(data)), "cuMemAlloc")) return 1;

    /* One block with one thread per element; the kernel has no bounds check,
     * so the thread count must not exceed the element count. */
    void* args[] = { &dev_data };

    /* && short-circuits, so each step runs only if the previous one succeeded. */
    int ok = !cuda_failed(cuMemcpyHtoD(dev_data, data, sizeof(data)), "cuMemcpyHtoD")
          && !cuda_failed(cuLaunchKernel(func, 1, 1, 1, (unsigned int)count, 1, 1,
                                         0, NULL, args, NULL), "cuLaunchKernel")
          && !cuda_failed(cuMemcpyDtoH(data, dev_data, sizeof(data)), "cuMemcpyDtoH");

    /* Release the device buffer on both the success and the failure path. */
    if (cuda_failed(cuMemFree(dev_data), "cuMemFree")) ok = 0;

    if (!ok) return 1;

    for (int i = 0; i < count; ++i) {
        printf("data[%d] = %d\n", i, data[i]);
    }
    return 0;
}