// workshopgpu/cuda/main.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#include <random>
#include <vector>
// Element-wise D[i] = A[i] + B[i].
// Launch: 1-D grid, any block size; out-of-range threads are masked off.
__global__ void add_kernel(const float* __restrict__ A,
const float* __restrict__ B,
float* __restrict__ D, size_t N) {
// Promote to size_t BEFORE multiplying: blockIdx.x * blockDim.x is
// evaluated in 32-bit unsigned arithmetic and silently wraps once the
// global index exceeds 2^32, even though the result is stored in size_t.
size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (i < N) D[i] = A[i] + B[i];
}
// Element-wise fused multiply-add: E[i] = D[i] * C[i] + B[i] (single rounding).
// Launch: 1-D grid, any block size; out-of-range threads are masked off.
__global__ void fma_kernel(const float* __restrict__ D,
const float* __restrict__ C,
const float* __restrict__ B,
float* __restrict__ E, size_t N) {
// Promote to size_t BEFORE multiplying — the 32-bit product
// blockIdx.x * blockDim.x wraps for global indices beyond 2^32.
size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (i < N) E[i] = fmaf(D[i], C[i], B[i]);
}
// Sum-reduce E[0..N) into *out, accumulating in double.
// Launch: 1-D grid; dynamic shared memory = blockDim.x * sizeof(double).
// Assumes blockDim.x is a power of two (the host launches with 256).
// atomicAdd on double requires SM60+.
__global__ void reduce_sum_kernel(const float* __restrict__ E,
double* __restrict__ out, size_t N) {
extern __shared__ double s[]; // one slot per thread, sized at launch
// Promote before multiplying — the 32-bit product blockIdx.x * blockDim.x
// wraps for global indices beyond 2^32.
size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
size_t tid = threadIdx.x;
// Out-of-range threads contribute 0.0 so the reduction tree stays full.
s[tid] = (i < N) ? static_cast<double>(E[i]) : 0.0;
__syncthreads();
// Classic shared-memory tree reduction; barrier is outside the divergent
// branch so every thread in the block reaches it each pass.
for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
if (tid < stride) s[tid] += s[tid + stride];
__syncthreads();
}
// One atomic per block folds the block partial into the global total.
if (tid == 0) atomicAdd(out, s[0]);
}
// Fill v with pseudo-random floats drawn uniformly from [-1, 1),
// deterministically derived from `seed` (truncated to 32 bits).
static void init_vec(std::vector<float> &v, uint64_t seed) {
std::mt19937 engine{static_cast<uint32_t>(seed)};
std::uniform_real_distribution<float> uniform{-1.0f, 1.0f};
for (size_t i = 0; i < v.size(); ++i) v[i] = uniform(engine);
}
// Print a diagnostic and abort on any CUDA runtime error. Kernel launches
// and most API calls otherwise fail silently, and a sticky earlier error
// makes every later call fail mysteriously.
static void cuda_check(cudaError_t err, const char* what) {
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

// Streaming benchmark. Per iteration: D = A + B, E = fma(D, C, B),
// then sum(E) reduced into a single device-side double.
// Usage: main [N] [iters]  (defaults: N = 1e8 elements, iters = 10).
int main(int argc, char** argv) {
size_t N = 100000000;
int iters = 10;
if (argc >= 2) N = static_cast<size_t>(atoll(argv[1]));
if (argc >= 3) iters = atoi(argv[2]);

std::vector<float> hA(N), hB(N), hC(N);
init_vec(hA, 1);
init_vec(hB, 2);
init_vec(hC, 3);

float *dA, *dB, *dC, *dD, *dE;
double *dSum;
const size_t bytes = N * sizeof(float);
// Default N allocates ~2 GB of device memory; an unchecked failure here
// would previously surface only as a crash inside the first kernel.
cuda_check(cudaMalloc(&dA, bytes), "cudaMalloc dA");
cuda_check(cudaMalloc(&dB, bytes), "cudaMalloc dB");
cuda_check(cudaMalloc(&dC, bytes), "cudaMalloc dC");
cuda_check(cudaMalloc(&dD, bytes), "cudaMalloc dD");
cuda_check(cudaMalloc(&dE, bytes), "cudaMalloc dE");
cuda_check(cudaMalloc(&dSum, sizeof(double)), "cudaMalloc dSum");
cuda_check(cudaMemcpy(dA, hA.data(), bytes, cudaMemcpyHostToDevice), "H2D A");
cuda_check(cudaMemcpy(dB, hB.data(), bytes, cudaMemcpyHostToDevice), "H2D B");
cuda_check(cudaMemcpy(dC, hC.data(), bytes, cudaMemcpyHostToDevice), "H2D C");

const int block = 256; // power of two — required by reduce_sum_kernel
const int grid = static_cast<int>((N + block - 1) / block); // ceil-div; 1-D grid

// Warmup: populate caches / trigger lazy context init outside the timed loop.
add_kernel<<<grid, block>>>(dA, dB, dD, N);
fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
cuda_check(cudaGetLastError(), "warmup launch");
cuda_check(cudaDeviceSynchronize(), "warmup sync");

auto start = std::chrono::high_resolution_clock::now();
double hostSum = 0.0;
for (int it = 0; it < iters; ++it) {
cuda_check(cudaMemset(dSum, 0, sizeof(double)), "memset dSum");
add_kernel<<<grid, block>>>(dA, dB, dD, N);
fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
reduce_sum_kernel<<<grid, block, block * sizeof(double)>>>(dE, dSum, N);
cuda_check(cudaGetLastError(), "kernel launch");
cuda_check(cudaDeviceSynchronize(), "iteration sync");
double partial = 0.0;
cuda_check(cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost),
"D2H partial");
hostSum += partial;
}
auto end = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(end - start).count();

// Global-memory traffic per iteration, in floats:
//   add:    reads A, B; writes D          -> 3N
//   fma:    reads D, C, B; writes E       -> 4N
//   reduce: reads E (atomics negligible)  -> 1N
// Total 8N (the previous factor of 7 under-reported bandwidth).
double bytes_per_iter = 8.0 * N * sizeof(float);
double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;
std::cout << "CUDA\n";
std::cout << "N=" << N << " iters=" << iters << "\n";
std::cout << "time_ms=" << ms << "\n";
std::cout << "throughput_GBps=" << gbps << "\n";
std::cout << "result=" << hostSum << "\n";

cudaFree(dA); cudaFree(dB); cudaFree(dC);
cudaFree(dD); cudaFree(dE); cudaFree(dSum);
return 0;
}