init
commit ea7dcba939
6 changed files with 481 additions and 0 deletions
75 README.md Normal file
@@ -0,0 +1,75 @@
GPU Programming Performance Workshop

Problem

- Compute for vectors A, B, C of size N (float32):
  1) D = A + B
  2) E = D * C + B
  3) result = sum(E)
- Repeat for --iters iterations. Report time and estimated GB/s.

Directory layout

- cpp_single/main.cpp
- cpp_omp/main.cpp
- cuda/main.cu
- pytorch/baseline.py
- pytorch/optimized.py

Prereqs

- C++17 compiler (g++/clang++)
- OpenMP (optional, for cpp_omp)
- NVIDIA CUDA toolkit for building cuda/main.cu
- Python 3.9+ and PyTorch (with CUDA for GPU runs)

Build

- Single-threaded C++:
  g++ -O3 -march=native -std=c++17 -DNDEBUG cpp_single/main.cpp -o bin_cpp_single

- OpenMP C++ (Linux/macOS; clang may need -Xpreprocessor -fopenmp and libomp):
  g++ -O3 -march=native -std=c++17 -fopenmp -DNDEBUG cpp_omp/main.cpp -o bin_cpp_omp

- CUDA:
  nvcc -O3 -arch=native cuda/main.cu -o bin_cuda
  If -arch=native is not supported by your toolkit, pick the GPU architecture explicitly, e.g.:
  nvcc -O3 -arch=sm_80 cuda/main.cu -o bin_cuda

Run

- CPU single-thread:
  ./bin_cpp_single 100000000 10

- CPU OpenMP (set threads):
  export OMP_NUM_THREADS=8
  ./bin_cpp_omp 100000000 10

- CUDA:
  ./bin_cuda 100000000 10

- PyTorch baseline (CPU or GPU; auto-detects by default, --device overrides):
  python pytorch/baseline.py --N 100000000 --iters 10 --device cuda
  python pytorch/baseline.py --N 100000000 --iters 10 --device cpu

- PyTorch optimized:
  python pytorch/optimized.py --N 100000000 --iters 10

Notes

- Memory: N=100M is ~400 MB per float32 array; the five arrays A, B, C, D, E total ~2 GB. Ensure enough RAM/GPU memory.
- If you hit OOM on GPU, reduce N (e.g., 50_000_000).
- Throughput model assumes 7 floats moved per element per iteration (5 reads + 2 writes, i.e. ~2.8 GB per iteration at N=100M); actual traffic may vary.
- For fair GPU timing, we synchronize after each iteration (an alternative CUDA-events approach is sketched after this list).
- To compare kernel launch overhead, try small N (e.g., 1_000_000) and more iters.
- To compare bandwidth limits, try large N (e.g., 200_000_000) and fewer iters.
- PyTorch optimized uses pinned memory, in-place ops, preallocation, and CUDA Graphs.
- You can profile with:
  - Nsight Systems: nsys profile ./bin_cuda 50000000 50
  - nvprof (legacy): nvprof ./bin_cuda 50000000 50
  - torch profiler: see torch.profiler in the PyTorch docs.
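The timing in cuda/main.cu uses host timers plus cudaDeviceSynchronize. An alternative is to bracket the loop with CUDA events, which measure only the device work. A minimal sketch, assuming the device pointers, kernels, and grid/block values from cuda/main.cu; error checks and the per-iteration copy of the partial sum are omitted, so this times the kernels only:

  // Hypothetical timing variant (not in cuda/main.cu): CUDA events instead of host timers.
  cudaEvent_t ev_start, ev_stop;
  cudaEventCreate(&ev_start);
  cudaEventCreate(&ev_stop);

  cudaEventRecord(ev_start);
  for (int it = 0; it < iters; ++it) {
      cudaMemset(dSum, 0, sizeof(double));
      add_kernel<<<grid, block>>>(dA, dB, dD, N);
      fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
      reduce_sum_kernel<<<grid, block, block * sizeof(double)>>>(dE, dSum, N);
  }
  cudaEventRecord(ev_stop);
  cudaEventSynchronize(ev_stop);

  float gpu_ms = 0.0f;
  cudaEventElapsedTime(&gpu_ms, ev_start, ev_stop);  // milliseconds of GPU time between the two events

  cudaEventDestroy(ev_start);
  cudaEventDestroy(ev_stop);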
Validation

- All variants print "result", which should be numerically close across methods
  (tiny differences are expected due to different reduction orders and precision;
  the small example below shows why).
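Float addition is not associative, so grouping the same values differently can change the rounded result. A tiny standalone check (hypothetical, not one of the workshop sources; it compiles with any of the C++ compilers listed above):

  #include <cstdio>

  int main() {
      float big = 1.0e8f;                           // near 1e8, consecutive floats are 8 apart
      float left  = ((big + 3.0f) + 3.0f) + 3.0f;   // each 3.0f is rounded away individually
      float right = big + (3.0f + 3.0f + 3.0f);     // 9.0f added at once rounds up to big + 8
      std::printf("%.1f vs %.1f\n", left, right);   // 100000000.0 vs 100000008.0
      return 0;
  }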
Extensions (optional for class)

- Fuse add+fma into one CUDA kernel to show fewer memory passes (a sketch follows this list).
- Use thrust or cub for reductions.
- Try half-precision (float16/bfloat16) on GPU for bandwidth gains.
- Add vectorized loads (float4) on CPU and CUDA to show further speedups.
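For the first bullet, a minimal sketch of what the fusion could look like, assuming the pointer names and launch configuration from cuda/main.cu; fused_kernel is hypothetical and not part of this commit:

  // Hypothetical fused variant of add_kernel + fma_kernel: computes E directly
  // from A, B, C, dropping the intermediate global-memory array D entirely.
  __global__ void fused_kernel(const float* __restrict__ A,
                               const float* __restrict__ B,
                               const float* __restrict__ C,
                               float* __restrict__ E, size_t N) {
      size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
      if (i < N) {
          float d = A[i] + B[i];          // kept in a register instead of written to D
          E[i] = fmaf(d, C[i], B[i]);
      }
  }

  // Launch it in place of the two separate kernels:
  //   fused_kernel<<<grid, block>>>(dA, dB, dC, dE, N);

With D gone, each element costs roughly 3 reads and 1 write for the fused pass plus 1 read for the reduction, so the 7-floats-per-element traffic model in the Notes drops to about 5. The second bullet would additionally replace the hand-rolled reduce_sum_kernel with a library reduction such as thrust::reduce over dE.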
81 cpp_omp/main.cpp Normal file
@@ -0,0 +1,81 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif

static void init_vec(std::vector<float> &v, uint64_t seed) {
    std::mt19937 rng(static_cast<uint32_t>(seed));
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
    for (auto &x : v) x = dist(rng);
}

int main(int argc, char **argv) {
    size_t N = 100000000;
    int iters = 10;
    if (argc >= 2) N = static_cast<size_t>(std::stoull(argv[1]));
    if (argc >= 3) iters = std::stoi(argv[2]);

    std::vector<float> A(N), B(N), C(N), D(N), E(N);
    init_vec(A, 1);
    init_vec(B, 2);
    init_vec(C, 3);

    // Warmup (serial: only the first 1000 elements are touched)
    volatile float sink = 0.0f;
    for (size_t i = 0; i < std::min<size_t>(N, 1000); ++i) {
        float d = A[i] + B[i];
        float e = d * C[i] + B[i];
        sink += e;
    }

    auto start = std::chrono::high_resolution_clock::now();

    double sum_result = 0.0;
    for (int it = 0; it < iters; ++it) {
        #pragma omp parallel for if(N>10000)
        for (long long i = 0; i < static_cast<long long>(N); ++i) {
            D[i] = A[i] + B[i];
        }
        #pragma omp parallel for if(N>10000)
        for (long long i = 0; i < static_cast<long long>(N); ++i) {
            E[i] = D[i] * C[i] + B[i];
        }
        // Manual reduction: per-thread partial sums in double, combined atomically.
        #pragma omp parallel
        {
            double thread_sum = 0.0;
            #pragma omp for nowait
            for (long long i = 0; i < static_cast<long long>(N); ++i) {
                thread_sum += static_cast<double>(E[i]);
            }
            #pragma omp atomic
            sum_result += thread_sum;
        }
    }

    auto end = std::chrono::high_resolution_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count();

    // Same traffic model as cpp_single: 5N float reads + 2N float writes per iteration.
    double bytes_per_iter = 7.0 * N * sizeof(float);
    double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;

    int threads = 1;
#ifdef _OPENMP
    threads = omp_get_max_threads();
#endif

    std::cout << "CPP OpenMP\n";
    std::cout << "threads=" << threads << "\n";
    std::cout << "N=" << N << " iters=" << iters << "\n";
    std::cout << "time_ms=" << ms << "\n";
    std::cout << "throughput_GBps=" << gbps << "\n";
    std::cout << "result=" << sum_result << "\n";
    return (sink == 12345.678f) ? 0 : 0;  // keep sink observed so the warmup is not optimized away
}
66 cpp_single/main.cpp Normal file
@@ -0,0 +1,66 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>

static void init_vec(std::vector<float> &v, uint64_t seed) {
    std::mt19937 rng(static_cast<uint32_t>(seed));
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
    for (auto &x : v) x = dist(rng);
}

int main(int argc, char **argv) {
    size_t N = 100000000;  // 1e8 by default (~400 MB per array; five arrays ~2 GB total)
    int iters = 10;
    if (argc >= 2) N = static_cast<size_t>(std::stoull(argv[1]));
    if (argc >= 3) iters = std::stoi(argv[2]);

    std::vector<float> A(N), B(N), C(N), D(N), E(N);
    init_vec(A, 1);
    init_vec(B, 2);
    init_vec(C, 3);

    // Warmup
    volatile float sink = 0.0f;
    for (size_t i = 0; i < std::min<size_t>(N, 1000); ++i) {
        float d = A[i] + B[i];
        float e = d * C[i] + B[i];
        sink += e;
    }

    auto start = std::chrono::high_resolution_clock::now();

    double sum_result = 0.0;
    for (int it = 0; it < iters; ++it) {
        for (size_t i = 0; i < N; ++i) {
            D[i] = A[i] + B[i];
        }
        for (size_t i = 0; i < N; ++i) {
            E[i] = D[i] * C[i] + B[i];
        }
        double local_sum = 0.0;
        for (size_t i = 0; i < N; ++i) {
            local_sum += static_cast<double>(E[i]);
        }
        sum_result += local_sum;
    }

    auto end = std::chrono::high_resolution_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count();

    // Bytes moved (approx): reads A,B then D,C,B, and writes D,E each iter
    // Reads: A,B,D,C,B ~ 5N floats; Writes: D,E ~ 2N floats => 7N * 4 bytes
    double bytes_per_iter = 7.0 * N * sizeof(float);
    double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;

    std::cout << "CPP single-threaded\n";
    std::cout << "N=" << N << " iters=" << iters << "\n";
    std::cout << "time_ms=" << ms << "\n";
    std::cout << "throughput_GBps=" << gbps << "\n";
    std::cout << "result=" << sum_result << "\n";
    return (sink == 12345.678f) ? 0 : 0;  // prevent optimizing away
}
109 cuda/main.cu Normal file
@@ -0,0 +1,109 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

__global__ void add_kernel(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ D, size_t N) {
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < N) D[i] = A[i] + B[i];
}

__global__ void fma_kernel(const float* __restrict__ D,
                           const float* __restrict__ C,
                           const float* __restrict__ B,
                           float* __restrict__ E, size_t N) {
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < N) E[i] = fmaf(D[i], C[i], B[i]);
}

// Block-level tree reduction in shared memory (assumes blockDim.x is a power of two),
// then one atomicAdd per block. atomicAdd on double requires sm_60 or newer.
__global__ void reduce_sum_kernel(const float* __restrict__ E,
                                  double* __restrict__ out, size_t N) {
    extern __shared__ double s[];
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    size_t tid = threadIdx.x;
    double val = 0.0;
    if (i < N) val = static_cast<double>(E[i]);
    s[tid] = val;
    __syncthreads();

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0) atomicAdd(out, s[0]);
}

static void init_vec(std::vector<float> &v, uint64_t seed) {
    std::mt19937 rng(static_cast<uint32_t>(seed));
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
    for (auto &x : v) x = dist(rng);
}

int main(int argc, char** argv) {
    size_t N = 100000000;
    int iters = 10;
    if (argc >= 2) N = static_cast<size_t>(atoll(argv[1]));
    if (argc >= 3) iters = atoi(argv[2]);

    std::vector<float> hA(N), hB(N), hC(N);
    init_vec(hA, 1);
    init_vec(hB, 2);
    init_vec(hC, 3);

    float *dA, *dB, *dC, *dD, *dE;
    double *dSum;
    cudaMalloc(&dA, N * sizeof(float));
    cudaMalloc(&dB, N * sizeof(float));
    cudaMalloc(&dC, N * sizeof(float));
    cudaMalloc(&dD, N * sizeof(float));
    cudaMalloc(&dE, N * sizeof(float));
    cudaMalloc(&dSum, sizeof(double));

    cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dC, hC.data(), N * sizeof(float), cudaMemcpyHostToDevice);

    int block = 256;
    int grid = (int)((N + block - 1) / block);

    // Warmup
    add_kernel<<<grid, block>>>(dA, dB, dD, N);
    fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
    cudaDeviceSynchronize();

    auto start = std::chrono::high_resolution_clock::now();

    double hostSum = 0.0;
    for (int it = 0; it < iters; ++it) {
        cudaMemset(dSum, 0, sizeof(double));
        add_kernel<<<grid, block>>>(dA, dB, dD, N);
        fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
        reduce_sum_kernel<<<grid, block, block * sizeof(double)>>>(dE, dSum, N);
        cudaDeviceSynchronize();

        double partial = 0.0;
        cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost);
        hostSum += partial;
    }

    auto end = std::chrono::high_resolution_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count();

    double bytes_per_iter = 7.0 * N * sizeof(float);
    double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;

    std::cout << "CUDA\n";
    std::cout << "N=" << N << " iters=" << iters << "\n";
    std::cout << "time_ms=" << ms << "\n";
    std::cout << "throughput_GBps=" << gbps << "\n";
    std::cout << "result=" << hostSum << "\n";

    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cudaFree(dD); cudaFree(dE); cudaFree(dSum);
    return 0;
}
55 pytorch/baseline.py Normal file
@@ -0,0 +1,55 @@
import argparse
import time
import torch


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--N", type=int, default=100_000_000)
    p.add_argument("--iters", type=int, default=10)
    p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
    args = p.parse_args()

    torch.manual_seed(0)
    N = args.N
    iters = args.iters
    device = torch.device(args.device)

    A = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
    B = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
    C = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)

    A = A.to(device)
    B = B.to(device)
    C = C.to(device)

    # warmup
    D = A + B
    E = D * C + B
    _ = E.sum()
    if device.type == "cuda":
        torch.cuda.synchronize()

    t0 = time.perf_counter()
    total = 0.0
    for _ in range(iters):
        D = A + B
        E = D * C + B
        s = E.sum()
        if device.type == "cuda":
            torch.cuda.synchronize()
        total += float(s.item())
    t1 = time.perf_counter()

    ms = (t1 - t0) * 1000.0
    bytes_per_iter = 7.0 * N * 4.0
    gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9

    print("PyTorch baseline")
    print(f"device={device}")
    print(f"N={N} iters={iters}")
    print(f"time_ms={ms:.3f}")
    print(f"throughput_GBps={gbps:.3f}")
    print(f"result={total}")


if __name__ == "__main__":
    main()
95 pytorch/optimized.py Normal file
@@ -0,0 +1,95 @@
import argparse
import time
import torch


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--N", type=int, default=100_000_000)
    p.add_argument("--iters", type=int, default=10)
    args = p.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(0)

    N = args.N
    iters = args.iters

    # Use pinned host memory for faster H2D copies; pinning requires CUDA,
    # so fall back to pageable memory on CPU-only machines.
    pin = device.type == "cuda"
    A_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)
    B_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)
    C_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)

    non_blocking = device.type == "cuda"
    A = A_cpu.to(device, non_blocking=non_blocking)
    B = B_cpu.to(device, non_blocking=non_blocking)
    C = C_cpu.to(device, non_blocking=non_blocking)

    # Pre-allocate outputs to reduce allocations
    D = torch.empty_like(A)
    E = torch.empty_like(A)

    # Optional: allow TF32 on Ampere+ (affects matmul/conv kernels only;
    # the elementwise ops and sum here stay plain float32)
    if device.type == "cuda":
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    # CUDA Graphs to reduce launch overhead when shapes are static
    graph = None
    if device.type == "cuda":
        # Warm up on a side stream before capture
        stream = torch.cuda.Stream()
        torch.cuda.synchronize()
        with torch.cuda.stream(stream):
            D.copy_(A).add_(B, alpha=1.0)  # warm
            E.copy_(D).mul_(C).add_(B)
            _ = E.sum()
        torch.cuda.synchronize()

        g = torch.cuda.CUDAGraph()
        s_buf = torch.zeros((), dtype=torch.float32, device=device)
        torch.cuda.synchronize()
        # torch.cuda.graph captures on its own side stream
        # (capture is not allowed on the default stream)
        with torch.cuda.graph(g):
            D.copy_(A).add_(B, alpha=1.0)
            E.copy_(D).mul_(C).add_(B)
            s_tmp = E.sum()
            s_buf.copy_(s_tmp)
        graph = (g, s_buf)

    # timing
    if device.type == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()

    total = 0.0
    for _ in range(iters):
        if graph is not None:
            g, s_buf = graph
            g.replay()
            if device.type == "cuda":
                torch.cuda.synchronize()
            total += float(s_buf.item())
        else:
            # CPU path or no graph
            D.copy_(A).add_(B, alpha=1.0)
            E.copy_(D).mul_(C).add_(B)
            s = E.sum()
            total += float(s.item())

    t1 = time.perf_counter()
    if device.type == "cuda":
        torch.cuda.synchronize()

    ms = (t1 - t0) * 1000.0
    bytes_per_iter = 7.0 * N * 4.0
    gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9

    print("PyTorch optimized")
    print(f"device={device}")
    print(f"N={N} iters={iters}")
    print(f"time_ms={ms:.3f}")
    print(f"throughput_GBps={gbps:.3f}")
    print(f"result={total}")


if __name__ == "__main__":
    main()