commit ea7dcba939a7319cd526ce2b27d13d3e3637f3f4 Author: sherlock Date: Sat Sep 6 09:41:18 2025 +0530 init diff --git a/README.md b/README.md new file mode 100644 index 0000000..7316770 --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +GPU Programming Performance Workshop + +Problem +- Compute for vectors A, B, C of size N (float32): + 1) D = A + B + 2) E = D * C + B + 3) result = sum(E) +- Repeat for --iters iterations. Report time and estimated GB/s. + +Directory layout +- cpp_single/main.cpp +- cpp_omp/main.cpp +- cuda/main.cu +- pytorch/baseline.py +- pytorch/optimized.py + +Prereqs +- C++17 compiler (g++/clang++) +- OpenMP (optional for cpp_omp) +- NVIDIA CUDA toolkit for building cuda/main.cu +- Python 3.9+ and PyTorch (with CUDA for GPU runs) + +Build +- Single-threaded C++: + g++ -O3 -march=native -std=c++17 -DNDEBUG cpp_single/main.cpp -o bin_cpp_single + +- OpenMP C++: + Linux/macOS (clang may need -Xpreprocessor -fopenmp and libomp): + g++ -O3 -march=native -std=c++17 -fopenmp -DNDEBUG cpp_omp/main.cpp -o bin_cpp_omp + +- CUDA: + nvcc -O3 -arch=native cuda/main.cu -o bin_cuda + If -arch=native not supported, use e.g.: + nvcc -O3 -arch=sm_80 cuda/main.cu -o bin_cuda + +Run +- CPU single-thread: + ./bin_cpp_single 100000000 10 + +- CPU OpenMP (set threads): + export OMP_NUM_THREADS=8 + ./bin_cpp_omp 100000000 10 + +- CUDA: + ./bin_cuda 100000000 10 + +- PyTorch baseline (CPU or GPU auto-detect): + python pytorch/baseline.py --N 100000000 --iters 10 --device cuda + python pytorch/baseline.py --N 100000000 --iters 10 --device cpu + +- PyTorch optimized: + python pytorch/optimized.py --N 100000000 --iters 10 + +Notes +- Memory: N=100M uses ~400 MB for A,B,C and ~400 MB for D,E. Ensure enough RAM/GPU memory. +- If you hit OOM on GPU, reduce N (e.g., 50_000_000). +- Throughput model assumes 7 floats per element per iter moved; actual may vary. +- For fair GPU timing, we synchronize after each iter. +- To compare kernel launch overhead, try small N (e.g., 1_000_000) and more iters. +- To compare bandwidth limits, try large N (e.g., 200_000_000) and fewer iters. +- PyTorch optimized uses pinned memory, in-place ops, preallocation, and CUDA Graphs. +- You can profile with: + - nsight systems: nsys profile ./bin_cuda 50000000 50 + - nvprof (legacy): nvprof ./bin_cuda 50000000 50 + - torch profiler: see torch.profiler in docs. + +Validation +- All variants print "result" which should be numerically close across methods + (tiny differences expected due to different reduction orders and precision). + +Extensions (optional for class) +- Fuse add+fma into one CUDA kernel to show fewer memory passes. +- Use thrust or cub for reductions. +- Try half-precision (float16/bfloat16) on GPU for bandwidth gains. +- Add vectorized loads (float4) on CPU and CUDA to show further speedups. diff --git a/cpp_omp/main.cpp b/cpp_omp/main.cpp new file mode 100644 index 0000000..605b741 --- /dev/null +++ b/cpp_omp/main.cpp @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _OPENMP +#include +#endif + +static void init_vec(std::vector &v, uint64_t seed) { + std::mt19937 rng(static_cast(seed)); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (auto &x : v) x = dist(rng); +} + +int main(int argc, char **argv) { + size_t N = 100000000; + int iters = 10; + if (argc >= 2) N = static_cast(std::stoull(argv[1])); + if (argc >= 3) iters = std::stoi(argv[2]); + + std::vector A(N), B(N), C(N), D(N), E(N); + init_vec(A, 1); + init_vec(B, 2); + init_vec(C, 3); + + volatile float sink = 0.0f; +#pragma omp parallel for reduction(+:sink) if(N>10000) + for (int i = 0; i < std::min(N, 1000); ++i) { + float d = A[i] + B[i]; + float e = d * C[i] + B[i]; + sink += e; + } + + auto start = std::chrono::high_resolution_clock::now(); + + double sum_result = 0.0; + for (int it = 0; it < iters; ++it) { +#pragma omp parallel for if(N>10000) + for (long long i = 0; i < static_cast(N); ++i) { + D[i] = A[i] + B[i]; + } +#pragma omp parallel for if(N>10000) + for (long long i = 0; i < static_cast(N); ++i) { + E[i] = D[i] * C[i] + B[i]; + } + double local_sum = 0.0; +#pragma omp parallel + { + double thread_sum = 0.0; +#pragma omp for nowait + for (long long i = 0; i < static_cast(N); ++i) { + thread_sum += static_cast(E[i]); + } +#pragma omp atomic + sum_result += thread_sum; + } + } + + auto end = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(end - start).count(); + + double bytes_per_iter = 7.0 * N * sizeof(float); + double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9; + + int threads = 1; +#ifdef _OPENMP + threads = omp_get_max_threads(); +#endif + + std::cout << "CPP OpenMP\n"; + std::cout << "threads=" << threads << "\n"; + std::cout << "N=" << N << " iters=" << iters << "\n"; + std::cout << "time_ms=" << ms << "\n"; + std::cout << "throughput_GBps=" << gbps << "\n"; + std::cout << "result=" << sum_result << "\n"; + return (sink == 12345.678f) ? 0 : 0; +} diff --git a/cpp_single/main.cpp b/cpp_single/main.cpp new file mode 100644 index 0000000..72ca54a --- /dev/null +++ b/cpp_single/main.cpp @@ -0,0 +1,66 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static void init_vec(std::vector &v, uint64_t seed) { + std::mt19937 rng(static_cast(seed)); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (auto &x : v) x = dist(rng); +} + +int main(int argc, char **argv) { + size_t N = 100000000; // 1e8 by default (~400 MB across 4 arrays) + int iters = 10; + if (argc >= 2) N = static_cast(std::stoull(argv[1])); + if (argc >= 3) iters = std::stoi(argv[2]); + + std::vector A(N), B(N), C(N), D(N), E(N); + init_vec(A, 1); + init_vec(B, 2); + init_vec(C, 3); + + // Warmup + volatile float sink = 0.0f; + for (size_t i = 0; i < std::min(N, 1000); ++i) { + float d = A[i] + B[i]; + float e = d * C[i] + B[i]; + sink += e; + } + + auto start = std::chrono::high_resolution_clock::now(); + + double sum_result = 0.0; + for (int it = 0; it < iters; ++it) { + for (size_t i = 0; i < N; ++i) { + D[i] = A[i] + B[i]; + } + for (size_t i = 0; i < N; ++i) { + E[i] = D[i] * C[i] + B[i]; + } + double local_sum = 0.0; + for (size_t i = 0; i < N; ++i) { + local_sum += static_cast(E[i]); + } + sum_result += local_sum; + } + + auto end = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(end - start).count(); + + // Bytes moved (approx): reads A,B then D,C,B, and write D,E each iter + // Reads: A,B,D,C,B ~ 5N floats; Writes: D,E ~ 2N floats => 7N * 4 bytes + double bytes_per_iter = 7.0 * N * sizeof(float); + double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9; + + std::cout << "CPP single-threaded\n"; + std::cout << "N=" << N << " iters=" << iters << "\n"; + std::cout << "time_ms=" << ms << "\n"; + std::cout << "throughput_GBps=" << gbps << "\n"; + std::cout << "result=" << sum_result << "\n"; + return (sink == 12345.678f) ? 0 : 0; // prevent optimizing away +} diff --git a/cuda/main.cu b/cuda/main.cu new file mode 100644 index 0000000..f836cf5 --- /dev/null +++ b/cuda/main.cu @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include + +__global__ void add_kernel(const float* __restrict__ A, + const float* __restrict__ B, + float* __restrict__ D, size_t N) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) D[i] = A[i] + B[i]; +} + +__global__ void fma_kernel(const float* __restrict__ D, + const float* __restrict__ C, + const float* __restrict__ B, + float* __restrict__ E, size_t N) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) E[i] = fmaf(D[i], C[i], B[i]); +} + +__global__ void reduce_sum_kernel(const float* __restrict__ E, + double* __restrict__ out, size_t N) { + extern __shared__ double s[]; + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + size_t tid = threadIdx.x; + double val = 0.0; + if (i < N) val = static_cast(E[i]); + s[tid] = val; + __syncthreads(); + + for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) s[tid] += s[tid + stride]; + __syncthreads(); + } + if (tid == 0) atomicAdd(out, s[0]); +} + +static void init_vec(std::vector &v, uint64_t seed) { + std::mt19937 rng(static_cast(seed)); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (auto &x : v) x = dist(rng); +} + +int main(int argc, char** argv) { + size_t N = 100000000; + int iters = 10; + if (argc >= 2) N = static_cast(atoll(argv[1])); + if (argc >= 3) iters = atoi(argv[2]); + + std::vector hA(N), hB(N), hC(N); + init_vec(hA, 1); + init_vec(hB, 2); + init_vec(hC, 3); + + float *dA, *dB, *dC, *dD, *dE; + double *dSum; + cudaMalloc(&dA, N * sizeof(float)); + cudaMalloc(&dB, N * sizeof(float)); + cudaMalloc(&dC, N * sizeof(float)); + cudaMalloc(&dD, N * sizeof(float)); + cudaMalloc(&dE, N * sizeof(float)); + cudaMalloc(&dSum, sizeof(double)); + + cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(dC, hC.data(), N * sizeof(float), cudaMemcpyHostToDevice); + + int block = 256; + int grid = (int)((N + block - 1) / block); + + // Warmup + add_kernel<<>>(dA, dB, dD, N); + fma_kernel<<>>(dD, dC, dB, dE, N); + cudaDeviceSynchronize(); + + auto start = std::chrono::high_resolution_clock::now(); + + double hostSum = 0.0; + for (int it = 0; it < iters; ++it) { + cudaMemset(dSum, 0, sizeof(double)); + add_kernel<<>>(dA, dB, dD, N); + fma_kernel<<>>(dD, dC, dB, dE, N); + reduce_sum_kernel<<>>(dE, dSum, N); + cudaDeviceSynchronize(); + + double partial = 0.0; + cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost); + hostSum += partial; + } + + auto end = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(end - start).count(); + + double bytes_per_iter = 7.0 * N * sizeof(float); + double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9; + + std::cout << "CUDA\n"; + std::cout << "N=" << N << " iters=" << iters << "\n"; + std::cout << "time_ms=" << ms << "\n"; + std::cout << "throughput_GBps=" << gbps << "\n"; + std::cout << "result=" << hostSum << "\n"; + + cudaFree(dA); cudaFree(dB); cudaFree(dC); + cudaFree(dD); cudaFree(dE); cudaFree(dSum); + return 0; +} diff --git a/pytorch/baseline.py b/pytorch/baseline.py new file mode 100644 index 0000000..85098cb --- /dev/null +++ b/pytorch/baseline.py @@ -0,0 +1,55 @@ +import argparse +import time +import torch + +def main(): + p = argparse.ArgumentParser() + p.add_argument("--N", type=int, default=100_000_000) + p.add_argument("--iters", type=int, default=10) + p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu") + args = p.parse_args() + + torch.manual_seed(0) + N = args.N + iters = args.iters + device = torch.device(args.device) + + A = torch.empty(N, dtype=torch.float32).uniform_(-1, 1) + B = torch.empty(N, dtype=torch.float32).uniform_(-1, 1) + C = torch.empty(N, dtype=torch.float32).uniform_(-1, 1) + + A = A.to(device) + B = B.to(device) + C = C.to(device) + + # warmup + D = A + B + E = D * C + B + _ = E.sum() + if device.type == "cuda": + torch.cuda.synchronize() + + t0 = time.perf_counter() + total = 0.0 + for _ in range(iters): + D = A + B + E = D * C + B + s = E.sum() + if device.type == "cuda": + torch.cuda.synchronize() + total += float(s.item()) + t1 = time.perf_counter() + + ms = (t1 - t0) * 1000.0 + bytes_per_iter = 7.0 * N * 4.0 + gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9 + + print("PyTorch baseline") + print(f"device={device}") + print(f"N={N} iters={iters}") + print(f"time_ms={ms:.3f}") + print(f"throughput_GBps={gbps:.3f}") + print(f"result={total}") + +if __name__ == "__main__": + main() diff --git a/pytorch/optimized.py b/pytorch/optimized.py new file mode 100644 index 0000000..8c70785 --- /dev/null +++ b/pytorch/optimized.py @@ -0,0 +1,95 @@ +import argparse +import time +import torch + +def main(): + p = argparse.ArgumentParser() + p.add_argument("--N", type=int, default=100_000_000) + p.add_argument("--iters", type=int, default=10) + args = p.parse_args() + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + torch.manual_seed(0) + + N = args.N + iters = args.iters + + # Use pinned memory for faster H2D, ensure contiguity and dtype + A_cpu = torch.empty(N, dtype=torch.float32, pin_memory=True).uniform_(-1, 1) + B_cpu = torch.empty(N, dtype=torch.float32, pin_memory=True).uniform_(-1, 1) + C_cpu = torch.empty(N, dtype=torch.float32, pin_memory=True).uniform_(-1, 1) + + non_blocking = True if device.type == "cuda" else False + A = A_cpu.to(device, non_blocking=non_blocking) + B = B_cpu.to(device, non_blocking=non_blocking) + C = C_cpu.to(device, non_blocking=non_blocking) + + # Pre-allocate outputs to reduce allocations + D = torch.empty_like(A) + E = torch.empty_like(A) + + # Optional: enable TF32 on Ampere+ for throughput (math still float32-ish) + if device.type == "cuda": + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + # CUDA Graphs to reduce launch overhead when shapes are static + graph = None + s_out = torch.zeros((), dtype=torch.float32, device=device) + if device.type == "cuda": + stream = torch.cuda.Stream() + torch.cuda.synchronize() + with torch.cuda.stream(stream): + D.copy_(A).add_(B, alpha=1.0) # warm + E.copy_(D).mul_(C).add_(B) + _ = E.sum() + torch.cuda.synchronize() + + g = torch.cuda.CUDAGraph() + s_buf = torch.zeros((), dtype=torch.float32, device=device) + torch.cuda.synchronize() + g.capture_begin() + D.copy_(A).add_(B, alpha=1.0) + E.copy_(D).mul_(C).add_(B) + s_tmp = E.sum() + s_buf.copy_(s_tmp) + g.capture_end() + graph = (g, s_buf) + + # timing + if device.type == "cuda": + torch.cuda.synchronize() + t0 = time.perf_counter() + + total = 0.0 + for _ in range(iters): + if graph is not None: + g, s_buf = graph + g.replay() + if device.type == "cuda": + torch.cuda.synchronize() + total += float(s_buf.item()) + else: + # CPU path or no graph + D.copy_(A).add_(B, alpha=1.0) + E.copy_(D).mul_(C).add_(B) + s = E.sum() + total += float(s.item()) + + t1 = time.perf_counter() + if device.type == "cuda": + torch.cuda.synchronize() + + ms = (t1 - t0) * 1000.0 + bytes_per_iter = 7.0 * N * 4.0 + gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9 + + print("PyTorch optimized") + print(f"device={device}") + print(f"N={N} iters={iters}") + print(f"time_ms={ms:.3f}") + print(f"throughput_GBps={gbps:.3f}") + print(f"result={total}") + +if __name__ == "__main__": + main()