init
This commit is contained in:
commit
ea7dcba939
6 changed files with 481 additions and 0 deletions
109
cuda/main.cu
Normal file
109
cuda/main.cu
Normal file
|
@ -0,0 +1,109 @@
|
|||
#include <cuda_runtime.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
__global__ void add_kernel(const float* __restrict__ A,
|
||||
const float* __restrict__ B,
|
||||
float* __restrict__ D, size_t N) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i < N) D[i] = A[i] + B[i];
|
||||
}
|
||||
|
||||
__global__ void fma_kernel(const float* __restrict__ D,
|
||||
const float* __restrict__ C,
|
||||
const float* __restrict__ B,
|
||||
float* __restrict__ E, size_t N) {
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i < N) E[i] = fmaf(D[i], C[i], B[i]);
|
||||
}
|
||||
|
||||
__global__ void reduce_sum_kernel(const float* __restrict__ E,
|
||||
double* __restrict__ out, size_t N) {
|
||||
extern __shared__ double s[];
|
||||
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t tid = threadIdx.x;
|
||||
double val = 0.0;
|
||||
if (i < N) val = static_cast<double>(E[i]);
|
||||
s[tid] = val;
|
||||
__syncthreads();
|
||||
|
||||
for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
|
||||
if (tid < stride) s[tid] += s[tid + stride];
|
||||
__syncthreads();
|
||||
}
|
||||
if (tid == 0) atomicAdd(out, s[0]);
|
||||
}
|
||||
|
||||
static void init_vec(std::vector<float> &v, uint64_t seed) {
|
||||
std::mt19937 rng(static_cast<uint32_t>(seed));
|
||||
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
|
||||
for (auto &x : v) x = dist(rng);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
size_t N = 100000000;
|
||||
int iters = 10;
|
||||
if (argc >= 2) N = static_cast<size_t>(atoll(argv[1]));
|
||||
if (argc >= 3) iters = atoi(argv[2]);
|
||||
|
||||
std::vector<float> hA(N), hB(N), hC(N);
|
||||
init_vec(hA, 1);
|
||||
init_vec(hB, 2);
|
||||
init_vec(hC, 3);
|
||||
|
||||
float *dA, *dB, *dC, *dD, *dE;
|
||||
double *dSum;
|
||||
cudaMalloc(&dA, N * sizeof(float));
|
||||
cudaMalloc(&dB, N * sizeof(float));
|
||||
cudaMalloc(&dC, N * sizeof(float));
|
||||
cudaMalloc(&dD, N * sizeof(float));
|
||||
cudaMalloc(&dE, N * sizeof(float));
|
||||
cudaMalloc(&dSum, sizeof(double));
|
||||
|
||||
cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dC, hC.data(), N * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
int block = 256;
|
||||
int grid = (int)((N + block - 1) / block);
|
||||
|
||||
// Warmup
|
||||
add_kernel<<<grid, block>>>(dA, dB, dD, N);
|
||||
fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double hostSum = 0.0;
|
||||
for (int it = 0; it < iters; ++it) {
|
||||
cudaMemset(dSum, 0, sizeof(double));
|
||||
add_kernel<<<grid, block>>>(dA, dB, dD, N);
|
||||
fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
|
||||
reduce_sum_kernel<<<grid, block, block * sizeof(double)>>>(dE, dSum, N);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
double partial = 0.0;
|
||||
cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost);
|
||||
hostSum += partial;
|
||||
}
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
double ms = std::chrono::duration<double, std::milli>(end - start).count();
|
||||
|
||||
double bytes_per_iter = 7.0 * N * sizeof(float);
|
||||
double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;
|
||||
|
||||
std::cout << "CUDA\n";
|
||||
std::cout << "N=" << N << " iters=" << iters << "\n";
|
||||
std::cout << "time_ms=" << ms << "\n";
|
||||
std::cout << "throughput_GBps=" << gbps << "\n";
|
||||
std::cout << "result=" << hostSum << "\n";
|
||||
|
||||
cudaFree(dA); cudaFree(dB); cudaFree(dC);
|
||||
cudaFree(dD); cudaFree(dE); cudaFree(dSum);
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue