#include #include #include #include #include #include #include __global__ void add_kernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ D, size_t N) { size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < N) D[i] = A[i] + B[i]; } __global__ void fma_kernel(const float* __restrict__ D, const float* __restrict__ C, const float* __restrict__ B, float* __restrict__ E, size_t N) { size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < N) E[i] = fmaf(D[i], C[i], B[i]); } __global__ void reduce_sum_kernel(const float* __restrict__ E, double* __restrict__ out, size_t N) { extern __shared__ double s[]; size_t i = blockIdx.x * blockDim.x + threadIdx.x; size_t tid = threadIdx.x; double val = 0.0; if (i < N) val = static_cast(E[i]); s[tid] = val; __syncthreads(); for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { if (tid < stride) s[tid] += s[tid + stride]; __syncthreads(); } if (tid == 0) atomicAdd(out, s[0]); } static void init_vec(std::vector &v, uint64_t seed) { std::mt19937 rng(static_cast(seed)); std::uniform_real_distribution dist(-1.0f, 1.0f); for (auto &x : v) x = dist(rng); } int main(int argc, char** argv) { size_t N = 100000000; int iters = 10; if (argc >= 2) N = static_cast(atoll(argv[1])); if (argc >= 3) iters = atoi(argv[2]); std::vector hA(N), hB(N), hC(N); init_vec(hA, 1); init_vec(hB, 2); init_vec(hC, 3); float *dA, *dB, *dC, *dD, *dE; double *dSum; cudaMalloc(&dA, N * sizeof(float)); cudaMalloc(&dB, N * sizeof(float)); cudaMalloc(&dC, N * sizeof(float)); cudaMalloc(&dD, N * sizeof(float)); cudaMalloc(&dE, N * sizeof(float)); cudaMalloc(&dSum, sizeof(double)); cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(dC, hC.data(), N * sizeof(float), cudaMemcpyHostToDevice); int block = 256; int grid = (int)((N + block - 1) / block); // Warmup add_kernel<<>>(dA, dB, dD, N); fma_kernel<<>>(dD, dC, dB, dE, N); cudaDeviceSynchronize(); auto start = std::chrono::high_resolution_clock::now(); double hostSum = 0.0; for (int it = 0; it < iters; ++it) { cudaMemset(dSum, 0, sizeof(double)); add_kernel<<>>(dA, dB, dD, N); fma_kernel<<>>(dD, dC, dB, dE, N); reduce_sum_kernel<<>>(dE, dSum, N); cudaDeviceSynchronize(); double partial = 0.0; cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost); hostSum += partial; } auto end = std::chrono::high_resolution_clock::now(); double ms = std::chrono::duration(end - start).count(); double bytes_per_iter = 7.0 * N * sizeof(float); double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9; std::cout << "CUDA\n"; std::cout << "N=" << N << " iters=" << iters << "\n"; std::cout << "time_ms=" << ms << "\n"; std::cout << "throughput_GBps=" << gbps << "\n"; std::cout << "result=" << hostSum << "\n"; cudaFree(dA); cudaFree(dB); cudaFree(dC); cudaFree(dD); cudaFree(dE); cudaFree(dSum); return 0; }