|
7 | 7 | #include "deepx/tensorfunc/cuda.hpp" |
8 | 8 | namespace deepx::tensorfunc |
9 | 9 | { |
| 10 | + // 填充 |
| 11 | + // constant |
10 | 12 | template <typename T> |
11 | 13 | __global__ void kernel_constant(T *data, const T value, const int size) |
12 | 14 | { |
@@ -40,7 +42,47 @@ namespace deepx::tensorfunc |
40 | 42 | template void launch_constant<int8_t>(int8_t *a, const int8_t value, const int size); |
41 | 43 | template void launch_constant<bool>(bool *a, const bool value, const int size); |
42 | 44 |
|
43 | | - // 添加kernel函数 |
| 45 | + // dropout |
// Dropout kernel: zeroes each element of A independently with probability p.
//
// Parameters:
//   A    - buffer of `size` elements, modified in place
//   p    - drop probability; an element is zeroed when its uniform sample is < p
//   seed - cuRAND seed shared by every thread of the launch
//   size - number of elements in A
//
// Each thread starts at its global index and advances by the total number of
// launched threads, so any 1-D launch configuration covers the whole buffer.
// Elements that are not dropped are left untouched (no rescaling is applied here).
template <typename T>
__global__ void dropout_kernel(T *A, const float p, const unsigned int seed, const int size)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Threads with no element to process skip the generator setup entirely.
    if (tid >= size)
    {
        return;
    }

    // One generator state per thread, initialized once. The subsequence must be
    // the *global* thread index: using threadIdx.x alone hands identical random
    // streams to threads that share an in-block position in different blocks,
    // which makes the drop mask repeat from block to block.
    curandState state;
    curand_init(seed, tid, 0, &state);

    for (int idx = tid; idx < size; idx += stride)
    {
        // curand_uniform yields a float in (0, 1].
        const float u = curand_uniform(&state);
        if (u < p)
        {
            A[idx] = 0;
        }
    }
}
| 62 | + |
// Host-side launcher for dropout_kernel over the `size` elements at `a`.
//
// The grid/block shape is taken from BestDims(size). No synchronization is
// performed here, so only errors reported by cudaGetLastError() right after
// the launch are detected.
//
// Throws std::runtime_error carrying the CUDA error string when that check fails.
template <typename T>
void launch_dropout(T *a, const float p, const unsigned int seed, const int size)
{
    auto [gridSize, threadsPerBlock] = BestDims(size);
    dropout_kernel<<<gridSize, threadsPerBlock>>>(a, p, seed, size);

    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }

    std::string message = "Failed to launch dropout kernel: ";
    message += cudaGetErrorString(status);
    throw std::runtime_error(message);
}

// Explicit instantiations: floating-point element types.
template void launch_dropout<double>(double *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<float>(float *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<half>(half *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<nv_bfloat16>(nv_bfloat16 *a, const float p, const unsigned int seed, const int size);

// Explicit instantiations: signed integer element types.
template void launch_dropout<int64_t>(int64_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int32_t>(int32_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int16_t>(int16_t *a, const float p, const unsigned int seed, const int size);
template void launch_dropout<int8_t>(int8_t *a, const float p, const unsigned int seed, const int size);
| 83 | + |
| 84 | + // 初始化 |
| 85 | + // arange |
44 | 86 | template <typename T> |
45 | 87 | __global__ void kernel_arange(T *data, const float start, const float step, const int size) |
46 | 88 | { |
@@ -133,7 +175,7 @@ namespace deepx::tensorfunc |
133 | 175 | void launch_normal(T *a, const T mean, const T stddev, const unsigned int seed, const int size) |
134 | 176 | { |
135 | 177 | auto [numBlocks, blockSize] = BestDims(size); |
136 | | - kernel_normal<<<numBlocks, blockSize>>>(a,float(mean), float(stddev), seed, size); |
| 178 | + kernel_normal<<<numBlocks, blockSize>>>(a, float(mean), float(stddev), seed, size); |
137 | 179 | cudaError_t err = cudaGetLastError(); |
138 | 180 | if (err != cudaSuccess) |
139 | 181 | throw std::runtime_error("Failed to launch normal kernel"); |
|
0 commit comments