sherlock 2025-09-06 09:41:18 +05:30
commit ea7dcba939
6 changed files with 481 additions and 0 deletions

README.md Normal file

@@ -0,0 +1,75 @@
GPU Programming Performance Workshop
Problem
- Given float32 vectors A, B, C of length N, compute:
1) D = A + B
2) E = D * C + B
3) result = sum(E)
- Repeat for --iters iterations. Report time and estimated GB/s.
Directory layout
- cpp_single/main.cpp
- cpp_omp/main.cpp
- cuda/main.cu
- pytorch/baseline.py
- pytorch/optimized.py
Prereqs
- C++17 compiler (g++/clang++)
- OpenMP (optional for cpp_omp)
- NVIDIA CUDA toolkit for building cuda/main.cu
- Python 3.9+ and PyTorch (with CUDA for GPU runs)
Build
- Single-threaded C++:
g++ -O3 -march=native -std=c++17 -DNDEBUG cpp_single/main.cpp -o bin_cpp_single
- OpenMP C++:
Linux/macOS (Apple clang may need -Xpreprocessor -fopenmp and libomp installed):
g++ -O3 -march=native -std=c++17 -fopenmp -DNDEBUG cpp_omp/main.cpp -o bin_cpp_omp
- CUDA:
nvcc -O3 -arch=native cuda/main.cu -o bin_cuda
If -arch=native is not supported by your nvcc, target a specific architecture instead, e.g.:
nvcc -O3 -arch=sm_80 cuda/main.cu -o bin_cuda
Run
- CPU single-thread:
./bin_cpp_single 100000000 10
- CPU OpenMP (set threads):
export OMP_NUM_THREADS=8
./bin_cpp_omp 100000000 10
- CUDA:
./bin_cuda 100000000 10
- PyTorch baseline (CPU or GPU auto-detect):
python pytorch/baseline.py --N 100000000 --iters 10 --device cuda
python pytorch/baseline.py --N 100000000 --iters 10 --device cpu
- PyTorch optimized:
python pytorch/optimized.py --N 100000000 --iters 10
Notes
- Memory: at N=100M each float32 array is ~400 MB, so A,B,C,D,E together need ~2 GB. Ensure enough RAM (and GPU memory for the CUDA/PyTorch runs).
- If you hit OOM on GPU, reduce N (e.g., 50_000_000).
- Throughput model assumes 7 floats (28 bytes) moved per element per iteration; actual traffic may vary. A worked example appears at the end of these notes.
- For fair GPU timing, we synchronize after each iter.
- To compare kernel launch overhead, try small N (e.g., 1_000_000) and more iters.
- To compare bandwidth limits, try large N (e.g., 200_000_000) and fewer iters.
- PyTorch optimized uses pinned memory, in-place ops, preallocation, and CUDA Graphs.
- You can profile with:
  - Nsight Systems: nsys profile ./bin_cuda 50000000 50
  - nvprof (legacy): nvprof ./bin_cuda 50000000 50
  - torch.profiler: see the PyTorch profiler documentation.
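- Worked example of the throughput model (assumed numbers, not a measurement):
  at N=100,000,000 one iteration moves about 7 * 1e8 * 4 B = 2.8 GB, so 10
  iterations finishing in 1.0 s would be reported as 2.8e9 * 10 / 1.0 / 1e9 = 28 GB/s.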
Validation
- All variants print "result", which should be numerically close across methods.
  Tiny differences are expected because reduction order and accumulation precision
  differ: the C++ and CUDA variants accumulate the sum in float64, while the
  PyTorch variants compute it in float32.
Extensions (optional for class)
- Fuse add+fma into one CUDA kernel to show fewer memory passes (see the sketch after this list).
- Use thrust or cub for reductions.
- Try half-precision (float16/bfloat16) on GPU for bandwidth gains.
- Add vectorized loads (float4) on CPU and CUDA to show further speedups.
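
A minimal sketch of the fused add+fma extension (illustrative only; the kernel name
and launch configuration below are placeholders, not part of this repo). It produces
E directly from A, B, C in a single pass, keeping the intermediate D in a register
instead of writing it to and re-reading it from global memory:

    __global__ void fused_kernel(const float* __restrict__ A,
                                 const float* __restrict__ B,
                                 const float* __restrict__ C,
                                 float* __restrict__ E, size_t N) {
        size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
        if (i < N) {
            float d = A[i] + B[i];           // D stays in a register
            E[i] = fmaf(d, C[i], B[i]);      // E = D * C + B
        }
    }
    // Launched exactly like the existing kernels:
    // fused_kernel<<<grid, block>>>(dA, dB, dC, dE, N);

Per element this reads A, B, C and writes E (4 floats instead of 7), so the same
arithmetic should need roughly 4/7 of the memory traffic of the two-kernel version.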

cpp_omp/main.cpp Normal file

@@ -0,0 +1,81 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif
static void init_vec(std::vector<float> &v, uint64_t seed) {
std::mt19937 rng(static_cast<uint32_t>(seed));
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto &x : v) x = dist(rng);
}
int main(int argc, char **argv) {
size_t N = 100000000;
int iters = 10;
if (argc >= 2) N = static_cast<size_t>(std::stoull(argv[1]));
if (argc >= 3) iters = std::stoi(argv[2]);
std::vector<float> A(N), B(N), C(N), D(N), E(N);
init_vec(A, 1);
init_vec(B, 2);
init_vec(C, 3);
    // Warmup: touch a small prefix of the data so pages are mapped and the
    // OpenMP thread pool is spun up before timing starts.
    volatile float sink = 0.0f;
    float warm = 0.0f;
    const long long warm_n = static_cast<long long>(std::min<size_t>(N, 1000));
    #pragma omp parallel for reduction(+:warm)
    for (long long i = 0; i < warm_n; ++i) {
        float d = A[i] + B[i];
        float e = d * C[i] + B[i];
        warm += e;
    }
    sink = warm;
auto start = std::chrono::high_resolution_clock::now();
double sum_result = 0.0;
for (int it = 0; it < iters; ++it) {
#pragma omp parallel for if(N>10000)
for (long long i = 0; i < static_cast<long long>(N); ++i) {
D[i] = A[i] + B[i];
}
#pragma omp parallel for if(N>10000)
for (long long i = 0; i < static_cast<long long>(N); ++i) {
E[i] = D[i] * C[i] + B[i];
}
        // Reduce E into a double accumulator; OpenMP combines the per-thread partial sums.
        double local_sum = 0.0;
        #pragma omp parallel for reduction(+:local_sum) if(N>10000)
        for (long long i = 0; i < static_cast<long long>(N); ++i) {
            local_sum += static_cast<double>(E[i]);
        }
        sum_result += local_sum;
}
auto end = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(end - start).count();
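    // Approximate traffic per iteration: reads of A,B,D,C,B (~5N floats) plus writes of D,E (~2N floats) => 7N * 4 bytes.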
double bytes_per_iter = 7.0 * N * sizeof(float);
double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;
int threads = 1;
#ifdef _OPENMP
threads = omp_get_max_threads();
#endif
std::cout << "CPP OpenMP\n";
std::cout << "threads=" << threads << "\n";
std::cout << "N=" << N << " iters=" << iters << "\n";
std::cout << "time_ms=" << ms << "\n";
std::cout << "throughput_GBps=" << gbps << "\n";
std::cout << "result=" << sum_result << "\n";
    return (sink == 12345.678f) ? 0 : 0;  // read sink so the warmup is not optimized away
}

cpp_single/main.cpp Normal file

@@ -0,0 +1,66 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>
static void init_vec(std::vector<float> &v, uint64_t seed) {
std::mt19937 rng(static_cast<uint32_t>(seed));
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto &x : v) x = dist(rng);
}
int main(int argc, char **argv) {
    size_t N = 100000000; // 1e8 by default (~400 MB per array; 5 arrays ~= 2 GB total)
int iters = 10;
if (argc >= 2) N = static_cast<size_t>(std::stoull(argv[1]));
if (argc >= 3) iters = std::stoi(argv[2]);
std::vector<float> A(N), B(N), C(N), D(N), E(N);
init_vec(A, 1);
init_vec(B, 2);
init_vec(C, 3);
// Warmup
volatile float sink = 0.0f;
for (size_t i = 0; i < std::min<size_t>(N, 1000); ++i) {
float d = A[i] + B[i];
float e = d * C[i] + B[i];
sink += e;
}
auto start = std::chrono::high_resolution_clock::now();
double sum_result = 0.0;
for (int it = 0; it < iters; ++it) {
for (size_t i = 0; i < N; ++i) {
D[i] = A[i] + B[i];
}
for (size_t i = 0; i < N; ++i) {
E[i] = D[i] * C[i] + B[i];
}
double local_sum = 0.0;
for (size_t i = 0; i < N; ++i) {
local_sum += static_cast<double>(E[i]);
}
sum_result += local_sum;
}
auto end = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(end - start).count();
// Bytes moved (approx): reads A,B then D,C,B, and write D,E each iter
// Reads: A,B,D,C,B ~ 5N floats; Writes: D,E ~ 2N floats => 7N * 4 bytes
double bytes_per_iter = 7.0 * N * sizeof(float);
double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;
std::cout << "CPP single-threaded\n";
std::cout << "N=" << N << " iters=" << iters << "\n";
std::cout << "time_ms=" << ms << "\n";
std::cout << "throughput_GBps=" << gbps << "\n";
std::cout << "result=" << sum_result << "\n";
return (sink == 12345.678f) ? 0 : 0; // prevent optimizing away
}

cuda/main.cu Normal file

@@ -0,0 +1,109 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#include <random>
#include <vector>
__global__ void add_kernel(const float* __restrict__ A,
const float* __restrict__ B,
float* __restrict__ D, size_t N) {
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;  // widen before multiplying to avoid 32-bit overflow for very large N
if (i < N) D[i] = A[i] + B[i];
}
__global__ void fma_kernel(const float* __restrict__ D,
const float* __restrict__ C,
const float* __restrict__ B,
float* __restrict__ E, size_t N) {
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (i < N) E[i] = fmaf(D[i], C[i], B[i]);
}
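// Block-level tree reduction in shared memory (assumes blockDim.x is a power of two),
// then one atomicAdd per block into a single double accumulator; atomicAdd on double
// requires a GPU of compute capability 6.0 or newer.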
__global__ void reduce_sum_kernel(const float* __restrict__ E,
double* __restrict__ out, size_t N) {
extern __shared__ double s[];
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
size_t tid = threadIdx.x;
double val = 0.0;
if (i < N) val = static_cast<double>(E[i]);
s[tid] = val;
__syncthreads();
for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
if (tid < stride) s[tid] += s[tid + stride];
__syncthreads();
}
if (tid == 0) atomicAdd(out, s[0]);
}
static void init_vec(std::vector<float> &v, uint64_t seed) {
std::mt19937 rng(static_cast<uint32_t>(seed));
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto &x : v) x = dist(rng);
}
int main(int argc, char** argv) {
size_t N = 100000000;
int iters = 10;
if (argc >= 2) N = static_cast<size_t>(atoll(argv[1]));
if (argc >= 3) iters = atoi(argv[2]);
std::vector<float> hA(N), hB(N), hC(N);
init_vec(hA, 1);
init_vec(hB, 2);
init_vec(hC, 3);
float *dA, *dB, *dC, *dD, *dE;
double *dSum;
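    // Allocate device buffers and copy the three inputs to the GPU once, outside the
    // timed loop (CUDA API return codes are left unchecked here for brevity).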
cudaMalloc(&dA, N * sizeof(float));
cudaMalloc(&dB, N * sizeof(float));
cudaMalloc(&dC, N * sizeof(float));
cudaMalloc(&dD, N * sizeof(float));
cudaMalloc(&dE, N * sizeof(float));
cudaMalloc(&dSum, sizeof(double));
cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dC, hC.data(), N * sizeof(float), cudaMemcpyHostToDevice);
int block = 256;
int grid = (int)((N + block - 1) / block);
// Warmup
add_kernel<<<grid, block>>>(dA, dB, dD, N);
fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
cudaDeviceSynchronize();
auto start = std::chrono::high_resolution_clock::now();
double hostSum = 0.0;
for (int it = 0; it < iters; ++it) {
cudaMemset(dSum, 0, sizeof(double));
add_kernel<<<grid, block>>>(dA, dB, dD, N);
fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
reduce_sum_kernel<<<grid, block, block * sizeof(double)>>>(dE, dSum, N);
cudaDeviceSynchronize();
double partial = 0.0;
cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost);
hostSum += partial;
}
auto end = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(end - start).count();
double bytes_per_iter = 7.0 * N * sizeof(float);
double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;
std::cout << "CUDA\n";
std::cout << "N=" << N << " iters=" << iters << "\n";
std::cout << "time_ms=" << ms << "\n";
std::cout << "throughput_GBps=" << gbps << "\n";
std::cout << "result=" << hostSum << "\n";
cudaFree(dA); cudaFree(dB); cudaFree(dC);
cudaFree(dD); cudaFree(dE); cudaFree(dSum);
return 0;
}

pytorch/baseline.py Normal file

@@ -0,0 +1,55 @@
import argparse
import time
import torch
def main():
p = argparse.ArgumentParser()
p.add_argument("--N", type=int, default=100_000_000)
p.add_argument("--iters", type=int, default=10)
p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
args = p.parse_args()
torch.manual_seed(0)
N = args.N
iters = args.iters
device = torch.device(args.device)
A = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
B = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
C = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
A = A.to(device)
B = B.to(device)
C = C.to(device)
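    # Inputs are generated on the host and moved to the device once, so the
    # host-to-device transfer stays outside the timed loop below.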
# warmup
D = A + B
E = D * C + B
_ = E.sum()
if device.type == "cuda":
torch.cuda.synchronize()
t0 = time.perf_counter()
total = 0.0
for _ in range(iters):
D = A + B
E = D * C + B
s = E.sum()
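        # Synchronize so this iteration's GPU work finishes before it is counted;
        # the .item() call below would also force a sync.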
if device.type == "cuda":
torch.cuda.synchronize()
total += float(s.item())
t1 = time.perf_counter()
ms = (t1 - t0) * 1000.0
bytes_per_iter = 7.0 * N * 4.0
gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9
print("PyTorch baseline")
print(f"device={device}")
print(f"N={N} iters={iters}")
print(f"time_ms={ms:.3f}")
print(f"throughput_GBps={gbps:.3f}")
print(f"result={total}")
if __name__ == "__main__":
main()

pytorch/optimized.py Normal file

@@ -0,0 +1,95 @@
import argparse
import time
import torch
def main():
p = argparse.ArgumentParser()
p.add_argument("--N", type=int, default=100_000_000)
p.add_argument("--iters", type=int, default=10)
args = p.parse_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)
N = args.N
iters = args.iters
    # Use pinned host memory for faster async H2D copies; pinning requires CUDA,
    # so fall back to regular pageable memory on CPU-only machines.
    pin = device.type == "cuda"
    A_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)
    B_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)
    C_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)
    non_blocking = pin  # non_blocking copies only help with pinned memory + CUDA
A = A_cpu.to(device, non_blocking=non_blocking)
B = B_cpu.to(device, non_blocking=non_blocking)
C = C_cpu.to(device, non_blocking=non_blocking)
# Pre-allocate outputs to reduce allocations
D = torch.empty_like(A)
E = torch.empty_like(A)
    # Optional: allow TF32 on Ampere+ GPUs. This only affects matmul/conv kernels,
    # so it does not change this elementwise benchmark; kept to show the knob.
if device.type == "cuda":
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
    # CUDA Graphs to reduce launch overhead when shapes are static.
    # torch.cuda.graph() manages the side stream and private memory pool that
    # graph capture requires.
    graph = None
    if device.type == "cuda":
        # Warm up on a side stream so capture sees already-initialized kernels.
        stream = torch.cuda.Stream()
        stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(stream):
            for _ in range(3):
                D.copy_(A).add_(B)
                E.copy_(D).mul_(C).add_(B)
                _ = E.sum()
        torch.cuda.current_stream().wait_stream(stream)
        torch.cuda.synchronize()
        g = torch.cuda.CUDAGraph()
        s_buf = torch.zeros((), dtype=torch.float32, device=device)
        with torch.cuda.graph(g):
            D.copy_(A).add_(B)
            E.copy_(D).mul_(C).add_(B)
            s_buf.copy_(E.sum())
        graph = (g, s_buf)
# timing
if device.type == "cuda":
torch.cuda.synchronize()
t0 = time.perf_counter()
total = 0.0
for _ in range(iters):
if graph is not None:
g, s_buf = graph
g.replay()
if device.type == "cuda":
torch.cuda.synchronize()
total += float(s_buf.item())
else:
# CPU path or no graph
D.copy_(A).add_(B, alpha=1.0)
E.copy_(D).mul_(C).add_(B)
s = E.sum()
total += float(s.item())
    if device.type == "cuda":
        torch.cuda.synchronize()
    t1 = time.perf_counter()
ms = (t1 - t0) * 1000.0
bytes_per_iter = 7.0 * N * 4.0
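    # NOTE: 7 floats/element matches the unfused D = A + B formulation; the in-place
    # copy_()/mul_()/add_() chains above may touch more memory per element, so this
    # figure is a rough estimate rather than measured traffic.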
gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9
print("PyTorch optimized")
print(f"device={device}")
print(f"N={N} iters={iters}")
print(f"time_ms={ms:.3f}")
print(f"throughput_GBps={gbps:.3f}")
print(f"result={total}")
if __name__ == "__main__":
main()