// CUDA bandwidth micro-benchmark: D = A + B, E = fma(D, C, B), sum(E).
#include <cuda_runtime.h>

#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <iostream>
#include <random>
#include <vector>
// Elementwise vector add: D[i] = A[i] + B[i] for i in [0, N).
// Expects a 1-D launch; grid-stride loop makes any grid size correct.
__global__ void add_kernel(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ D, size_t N) {
  // Widen BEFORE the multiply: blockIdx.x * blockDim.x is otherwise
  // evaluated in 32-bit unsigned and silently wraps when N exceeds ~4.29e9.
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += stride) {
    D[i] = A[i] + B[i];
  }
}

// Fused multiply-add: E[i] = D[i] * C[i] + B[i], single rounding via fmaf.
// Expects a 1-D launch; grid-stride loop makes any grid size correct.
__global__ void fma_kernel(const float* __restrict__ D,
                           const float* __restrict__ C,
                           const float* __restrict__ B,
                           float* __restrict__ E, size_t N) {
  // Widen BEFORE the multiply to avoid 32-bit overflow for very large N
  // (N is size_t and user-configurable from the command line).
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += stride) {
    E[i] = fmaf(D[i], C[i], B[i]);
  }
}

// Sums E[0..N) into *out (accumulated in double for accuracy).
// Preconditions:
//   - blockDim.x must be a power of two (the halving tree below assumes it);
//   - dynamic shared memory = blockDim.x * sizeof(double) (3rd launch arg);
//   - *out must be zeroed before launch (blocks accumulate via atomicAdd);
//   - requires SM60+ for atomicAdd on double.
__global__ void reduce_sum_kernel(const float* __restrict__ E,
                                  double* __restrict__ out, size_t N) {
  extern __shared__ double s[];
  size_t tid = threadIdx.x;
  // Widen before the multiply: the 32-bit product blockIdx.x * blockDim.x
  // wraps for N > ~4.29e9.
  size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  // Out-of-range threads contribute the additive identity.
  double val = 0.0;
  if (i < N) val = static_cast<double>(E[i]);
  s[tid] = val;
  __syncthreads();

  // Classic shared-memory tree reduction; barrier is outside the branch so
  // all threads of the block reach it each round.
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (tid < stride) s[tid] += s[tid + stride];
    __syncthreads();
  }

  // One atomic per block keeps contention on *out low.
  if (tid == 0) atomicAdd(out, s[0]);
}

// Fills v with pseudo-random floats drawn uniformly from [-1, 1),
// reproducibly determined by `seed` (only the low 32 bits are used).
static void init_vec(std::vector<float> &v, uint64_t seed) {
  std::mt19937 gen(static_cast<uint32_t>(seed));
  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
  for (size_t k = 0; k < v.size(); ++k) {
    v[k] = uniform(gen);
  }
}

// Host driver for the streaming benchmark:
//   D = A + B;  E = fma(D, C, B);  sum += reduce(E)   repeated `iters` times.
// Usage: prog [N] [iters]   (defaults: N = 1e8 elements, iters = 10)
int main(int argc, char** argv) {
  size_t N = 100000000;  // elements per vector (override via argv[1])
  int iters = 10;        // timed repetitions (override via argv[2])
  if (argc >= 2) N = static_cast<size_t>(atoll(argv[1]));
  if (argc >= 3) iters = atoi(argv[2]);

  // Fail fast on any CUDA error: an unchecked sticky error otherwise makes
  // every later call fail with a misleading message.
  auto check = [](cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
  };

  std::vector<float> hA(N), hB(N), hC(N);
  init_vec(hA, 1);
  init_vec(hB, 2);
  init_vec(hC, 3);

  float *dA, *dB, *dC, *dD, *dE;
  double *dSum;
  check(cudaMalloc(&dA, N * sizeof(float)), "cudaMalloc dA");
  check(cudaMalloc(&dB, N * sizeof(float)), "cudaMalloc dB");
  check(cudaMalloc(&dC, N * sizeof(float)), "cudaMalloc dC");
  check(cudaMalloc(&dD, N * sizeof(float)), "cudaMalloc dD");
  check(cudaMalloc(&dE, N * sizeof(float)), "cudaMalloc dE");
  check(cudaMalloc(&dSum, sizeof(double)), "cudaMalloc dSum");

  check(cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice),
        "H2D A");
  check(cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice),
        "H2D B");
  check(cudaMemcpy(dC, hC.data(), N * sizeof(float), cudaMemcpyHostToDevice),
        "H2D C");

  // Power-of-two block size: required by reduce_sum_kernel's halving tree.
  int block = 256;
  // Ceil-div. NOTE(review): the int cast overflows for N > ~5.5e11 — confirm
  // whether such sizes are in scope before widening the launch config.
  int grid = (int)((N + block - 1) / block);

  // Warmup: keeps one-time costs (context init, clock ramp) out of the timing.
  add_kernel<<<grid, block>>>(dA, dB, dD, N);
  fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
  check(cudaGetLastError(), "warmup launch");
  check(cudaDeviceSynchronize(), "warmup sync");

  auto start = std::chrono::high_resolution_clock::now();

  double hostSum = 0.0;
  for (int it = 0; it < iters; ++it) {
    check(cudaMemset(dSum, 0, sizeof(double)), "memset dSum");
    add_kernel<<<grid, block>>>(dA, dB, dD, N);
    fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
    reduce_sum_kernel<<<grid, block, block * sizeof(double)>>>(dE, dSum, N);
    check(cudaGetLastError(), "kernel launch");

    // The blocking D2H copy synchronizes with the preceding kernels, so no
    // separate cudaDeviceSynchronize() is needed inside the loop.
    double partial = 0.0;
    check(cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost),
          "D2H sum");
    hostSum += partial;
  }

  auto end = std::chrono::high_resolution_clock::now();
  double ms = std::chrono::duration<double, std::milli>(end - start).count();

  // Global-memory traffic per iteration:
  //   add    reads A,B, writes D  -> 3 floats/element
  //   fma    reads D,C,B, writes E -> 4 floats/element
  //   reduce reads E              -> 1 float/element
  // Total 8 floats/element (the previous 7.0 omitted the reduction's read).
  double bytes_per_iter = 8.0 * N * sizeof(float);
  double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;

  std::cout << "CUDA\n";
  std::cout << "N=" << N << " iters=" << iters << "\n";
  std::cout << "time_ms=" << ms << "\n";
  std::cout << "throughput_GBps=" << gbps << "\n";
  std::cout << "result=" << hostSum << "\n";

  cudaFree(dA); cudaFree(dB); cudaFree(dC);
  cudaFree(dD); cudaFree(dE); cudaFree(dSum);
  return 0;
}