init

commit ea7dcba939
6 changed files with 481 additions and 0 deletions

README.md (new file)
@@ -0,0 +1,75 @@
GPU Programming Performance Workshop

Problem

- Compute for vectors A, B, C of size N (float32):
  1) D = A + B
  2) E = D * C + B
  3) result = sum(E)
- Repeat for --iters iterations. Report time and estimated GB/s.

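For reference, here is a minimal NumPy sketch of one iteration of the computation above (illustration only; it is not one of the timed variants):

    import numpy as np

    N = 1_000_000  # small size for a quick check
    rng = np.random.default_rng(0)
    A = rng.uniform(-1, 1, N).astype(np.float32)
    B = rng.uniform(-1, 1, N).astype(np.float32)
    C = rng.uniform(-1, 1, N).astype(np.float32)

    D = A + B                         # step 1
    E = D * C + B                     # step 2
    result = E.sum(dtype=np.float64)  # step 3 (accumulate in float64, like the C++/CUDA variants)
    print(result)
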
Directory layout

- cpp_single/main.cpp
- cpp_omp/main.cpp
- cuda/main.cu
- pytorch/baseline.py
- pytorch/optimized.py

Prereqs

- C++17 compiler (g++/clang++)
- OpenMP (optional, for cpp_omp)
- NVIDIA CUDA toolkit for building cuda/main.cu
- Python 3.9+ and PyTorch (with CUDA for GPU runs)

Build

- Single-threaded C++:

    g++ -O3 -march=native -std=c++17 -DNDEBUG cpp_single/main.cpp -o bin_cpp_single

- OpenMP C++ (Linux/macOS; clang may need -Xpreprocessor -fopenmp and libomp):

    g++ -O3 -march=native -std=c++17 -fopenmp -DNDEBUG cpp_omp/main.cpp -o bin_cpp_omp

- CUDA:

    nvcc -O3 -arch=native cuda/main.cu -o bin_cuda

  If -arch=native is not supported by your nvcc, pick an architecture explicitly, e.g.:

    nvcc -O3 -arch=sm_80 cuda/main.cu -o bin_cuda

Run

- CPU single-thread:

    ./bin_cpp_single 100000000 10

- CPU OpenMP (set threads first):

    export OMP_NUM_THREADS=8
    ./bin_cpp_omp 100000000 10

- CUDA:

    ./bin_cuda 100000000 10

- PyTorch baseline (auto-detects the device; --device overrides):

    python pytorch/baseline.py --N 100000000 --iters 10 --device cuda
    python pytorch/baseline.py --N 100000000 --iters 10 --device cpu

- PyTorch optimized:

    python pytorch/optimized.py --N 100000000 --iters 10

Notes

- Memory: with N=100M, each float32 array is ~400 MB, so ~1.2 GB for A, B, C plus ~0.8 GB for D, E (~2 GB total). Ensure enough RAM/GPU memory.
- If you hit OOM on GPU, reduce N (e.g., 50_000_000).
- Throughput model assumes 7 floats (28 bytes) of memory traffic per element per iteration; actual traffic may vary. A worked example follows after this list.
- For fair GPU timing, we synchronize after each iter.
- To compare kernel launch overhead, try small N (e.g., 1_000_000) and more iters.
- To compare bandwidth limits, try large N (e.g., 200_000_000) and fewer iters.
- PyTorch optimized uses pinned memory, in-place ops, preallocation, and CUDA Graphs.
- You can profile with:
  - Nsight Systems: nsys profile ./bin_cuda 50000000 50
  - nvprof (legacy): nvprof ./bin_cuda 50000000 50
  - torch profiler: see torch.profiler in the PyTorch docs.

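Worked example of the throughput model (the formula mirrors bytes_per_iter in the sources; the elapsed time below is a made-up illustration value):

    N = 100_000_000
    bytes_per_iter = 7 * N * 4        # 7 floats of assumed traffic per element, 4 bytes each
    print(bytes_per_iter / 1e9)       # 2.8 GB moved per iteration

    elapsed_s = 1.0                   # hypothetical: 10 iterations measured at 1.0 s total
    gbps = bytes_per_iter * 10 / elapsed_s / 1e9
    print(gbps)                       # 28.0 GB/s
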
Validation

- All variants print "result", which should be numerically close across methods
  (tiny differences are expected due to different reduction orders and precision).

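One simple way to compare two printed results (the tolerance here is an arbitrary illustrative choice):

    import math

    result_cpu = 123.456    # hypothetical values copied from two runs
    result_cuda = 123.457
    print(math.isclose(result_cpu, result_cuda, rel_tol=1e-5))
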
Extensions (optional for class)

- Fuse add+fma into one CUDA kernel to show fewer memory passes.
- Use Thrust or CUB for reductions.
- Try half-precision (float16/bfloat16) on GPU for bandwidth gains (a PyTorch sketch follows below).
- Add vectorized loads (float4) on CPU and CUDA to show further speedups.

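A minimal PyTorch sketch of the half-precision idea (bfloat16 storage, float32 accumulation for the sum; assumes a CUDA-capable GPU):

    import torch

    assert torch.cuda.is_available()
    N = 100_000_000
    A = torch.empty(N, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1)
    B = torch.empty(N, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1)
    C = torch.empty(N, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1)

    D = A + B
    E = D * C + B
    result = E.sum(dtype=torch.float32)  # accumulate in float32
    torch.cuda.synchronize()
    print(result.item())
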

cpp_omp/main.cpp (new file)
@@ -0,0 +1,81 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif

static void init_vec(std::vector<float> &v, uint64_t seed) {
    std::mt19937 rng(static_cast<uint32_t>(seed));
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
    for (auto &x : v) x = dist(rng);
}

int main(int argc, char **argv) {
    size_t N = 100000000;
    int iters = 10;
    if (argc >= 2) N = static_cast<size_t>(std::stoull(argv[1]));
    if (argc >= 3) iters = std::stoi(argv[2]);

    std::vector<float> A(N), B(N), C(N), D(N), E(N);
    init_vec(A, 1);
    init_vec(B, 2);
    init_vec(C, 3);

    // Warmup (accumulate into a plain float, then fold into the volatile sink)
    volatile float sink = 0.0f;
    float warm = 0.0f;
    #pragma omp parallel for reduction(+:warm) if(N>10000)
    for (long long i = 0; i < static_cast<long long>(std::min<size_t>(N, 1000)); ++i) {
        float d = A[i] + B[i];
        float e = d * C[i] + B[i];
        warm += e;
    }
    sink += warm;

    auto start = std::chrono::high_resolution_clock::now();

    double sum_result = 0.0;
    for (int it = 0; it < iters; ++it) {
        #pragma omp parallel for if(N>10000)
        for (long long i = 0; i < static_cast<long long>(N); ++i) {
            D[i] = A[i] + B[i];
        }
        #pragma omp parallel for if(N>10000)
        for (long long i = 0; i < static_cast<long long>(N); ++i) {
            E[i] = D[i] * C[i] + B[i];
        }
        double local_sum = 0.0;
        #pragma omp parallel
        {
            double thread_sum = 0.0;
            #pragma omp for nowait
            for (long long i = 0; i < static_cast<long long>(N); ++i) {
                thread_sum += static_cast<double>(E[i]);
            }
            #pragma omp atomic
            local_sum += thread_sum;
        }
        sum_result += local_sum;
    }

    auto end = std::chrono::high_resolution_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count();

    double bytes_per_iter = 7.0 * N * sizeof(float);
    double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;

    int threads = 1;
#ifdef _OPENMP
    threads = omp_get_max_threads();
#endif

    std::cout << "CPP OpenMP\n";
    std::cout << "threads=" << threads << "\n";
    std::cout << "N=" << N << " iters=" << iters << "\n";
    std::cout << "time_ms=" << ms << "\n";
    std::cout << "throughput_GBps=" << gbps << "\n";
    std::cout << "result=" << sum_result << "\n";
    return (sink == 12345.678f) ? 0 : 0;  // prevent optimizing away the warmup
}

cpp_single/main.cpp (new file)
@@ -0,0 +1,66 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>

static void init_vec(std::vector<float> &v, uint64_t seed) {
    std::mt19937 rng(static_cast<uint32_t>(seed));
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
    for (auto &x : v) x = dist(rng);
}

int main(int argc, char **argv) {
    size_t N = 100000000; // 1e8 by default (~400 MB per array; 5 arrays ~ 2 GB total)
    int iters = 10;
    if (argc >= 2) N = static_cast<size_t>(std::stoull(argv[1]));
    if (argc >= 3) iters = std::stoi(argv[2]);

    std::vector<float> A(N), B(N), C(N), D(N), E(N);
    init_vec(A, 1);
    init_vec(B, 2);
    init_vec(C, 3);

    // Warmup
    volatile float sink = 0.0f;
    for (size_t i = 0; i < std::min<size_t>(N, 1000); ++i) {
        float d = A[i] + B[i];
        float e = d * C[i] + B[i];
        sink += e;
    }

    auto start = std::chrono::high_resolution_clock::now();

    double sum_result = 0.0;
    for (int it = 0; it < iters; ++it) {
        for (size_t i = 0; i < N; ++i) {
            D[i] = A[i] + B[i];
        }
        for (size_t i = 0; i < N; ++i) {
            E[i] = D[i] * C[i] + B[i];
        }
        double local_sum = 0.0;
        for (size_t i = 0; i < N; ++i) {
            local_sum += static_cast<double>(E[i]);
        }
        sum_result += local_sum;
    }

    auto end = std::chrono::high_resolution_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count();

    // Bytes moved (approx): reads A,B then D,C,B, and writes D,E each iter.
    // Reads: A,B,D,C,B ~ 5N floats; Writes: D,E ~ 2N floats => 7N * 4 bytes
    // (the final reduction's re-read of E is not counted in this model).
    double bytes_per_iter = 7.0 * N * sizeof(float);
    double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;

    std::cout << "CPP single-threaded\n";
    std::cout << "N=" << N << " iters=" << iters << "\n";
    std::cout << "time_ms=" << ms << "\n";
    std::cout << "throughput_GBps=" << gbps << "\n";
    std::cout << "result=" << sum_result << "\n";
    return (sink == 12345.678f) ? 0 : 0; // prevent optimizing away
}

cuda/main.cu (new file)
@@ -0,0 +1,109 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <cstdint>   // uint64_t
#include <iostream>
#include <random>
#include <vector>

__global__ void add_kernel(const float* __restrict__ A,
                           const float* __restrict__ B,
                           float* __restrict__ D, size_t N) {
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < N) D[i] = A[i] + B[i];
}

__global__ void fma_kernel(const float* __restrict__ D,
                           const float* __restrict__ C,
                           const float* __restrict__ B,
                           float* __restrict__ E, size_t N) {
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < N) E[i] = fmaf(D[i], C[i], B[i]);
}

__global__ void reduce_sum_kernel(const float* __restrict__ E,
                                  double* __restrict__ out, size_t N) {
    extern __shared__ double s[];
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    size_t tid = threadIdx.x;
    double val = 0.0;
    if (i < N) val = static_cast<double>(E[i]);
    s[tid] = val;
    __syncthreads();

    // Tree reduction in shared memory (assumes blockDim.x is a power of two).
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) s[tid] += s[tid + stride];
        __syncthreads();
    }
    // atomicAdd on double requires compute capability 6.0+.
    if (tid == 0) atomicAdd(out, s[0]);
}

static void init_vec(std::vector<float> &v, uint64_t seed) {
    std::mt19937 rng(static_cast<uint32_t>(seed));
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
    for (auto &x : v) x = dist(rng);
}

int main(int argc, char** argv) {
    size_t N = 100000000;
    int iters = 10;
    if (argc >= 2) N = static_cast<size_t>(atoll(argv[1]));
    if (argc >= 3) iters = atoi(argv[2]);

    std::vector<float> hA(N), hB(N), hC(N);
    init_vec(hA, 1);
    init_vec(hB, 2);
    init_vec(hC, 3);

    float *dA, *dB, *dC, *dD, *dE;
    double *dSum;
    cudaMalloc(&dA, N * sizeof(float));
    cudaMalloc(&dB, N * sizeof(float));
    cudaMalloc(&dC, N * sizeof(float));
    cudaMalloc(&dD, N * sizeof(float));
    cudaMalloc(&dE, N * sizeof(float));
    cudaMalloc(&dSum, sizeof(double));

    cudaMemcpy(dA, hA.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dC, hC.data(), N * sizeof(float), cudaMemcpyHostToDevice);

    int block = 256;
    int grid = (int)((N + block - 1) / block);

    // Warmup
    add_kernel<<<grid, block>>>(dA, dB, dD, N);
    fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
    cudaDeviceSynchronize();

    auto start = std::chrono::high_resolution_clock::now();

    double hostSum = 0.0;
    for (int it = 0; it < iters; ++it) {
        cudaMemset(dSum, 0, sizeof(double));
        add_kernel<<<grid, block>>>(dA, dB, dD, N);
        fma_kernel<<<grid, block>>>(dD, dC, dB, dE, N);
        reduce_sum_kernel<<<grid, block, block * sizeof(double)>>>(dE, dSum, N);
        cudaDeviceSynchronize();

        double partial = 0.0;
        cudaMemcpy(&partial, dSum, sizeof(double), cudaMemcpyDeviceToHost);
        hostSum += partial;
    }

    auto end = std::chrono::high_resolution_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count();

    double bytes_per_iter = 7.0 * N * sizeof(float);
    double gbps = (bytes_per_iter * iters) / (ms / 1000.0) / 1e9;

    std::cout << "CUDA\n";
    std::cout << "N=" << N << " iters=" << iters << "\n";
    std::cout << "time_ms=" << ms << "\n";
    std::cout << "throughput_GBps=" << gbps << "\n";
    std::cout << "result=" << hostSum << "\n";

    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cudaFree(dD); cudaFree(dE); cudaFree(dSum);
    return 0;
}

pytorch/baseline.py (new file)
@@ -0,0 +1,55 @@
import argparse
import time
import torch


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--N", type=int, default=100_000_000)
    p.add_argument("--iters", type=int, default=10)
    p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
    args = p.parse_args()

    torch.manual_seed(0)
    N = args.N
    iters = args.iters
    device = torch.device(args.device)

    A = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
    B = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
    C = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)

    A = A.to(device)
    B = B.to(device)
    C = C.to(device)

    # warmup
    D = A + B
    E = D * C + B
    _ = E.sum()
    if device.type == "cuda":
        torch.cuda.synchronize()

    t0 = time.perf_counter()
    total = 0.0
    for _ in range(iters):
        D = A + B
        E = D * C + B
        s = E.sum()
        if device.type == "cuda":
            torch.cuda.synchronize()
        total += float(s.item())
    t1 = time.perf_counter()

    ms = (t1 - t0) * 1000.0
    bytes_per_iter = 7.0 * N * 4.0
    gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9

    print("PyTorch baseline")
    print(f"device={device}")
    print(f"N={N} iters={iters}")
    print(f"time_ms={ms:.3f}")
    print(f"throughput_GBps={gbps:.3f}")
    print(f"result={total}")


if __name__ == "__main__":
    main()

pytorch/optimized.py (new file)
@@ -0,0 +1,95 @@
import argparse
import time
import torch


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--N", type=int, default=100_000_000)
    p.add_argument("--iters", type=int, default=10)
    args = p.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(0)

    N = args.N
    iters = args.iters

    # Use pinned memory for faster H2D copies (only when CUDA is available),
    # and keep dtype/contiguity explicit.
    pin = device.type == "cuda"
    A_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)
    B_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)
    C_cpu = torch.empty(N, dtype=torch.float32, pin_memory=pin).uniform_(-1, 1)

    non_blocking = device.type == "cuda"
    A = A_cpu.to(device, non_blocking=non_blocking)
    B = B_cpu.to(device, non_blocking=non_blocking)
    C = C_cpu.to(device, non_blocking=non_blocking)

    # Pre-allocate outputs to avoid per-iteration allocations
    D = torch.empty_like(A)
    E = torch.empty_like(A)

    # Optional: enable TF32 on Ampere+ (affects matmul/conv only, so it is
    # essentially a no-op for this elementwise workload; kept for reference).
    if device.type == "cuda":
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    # CUDA Graphs reduce launch overhead when shapes and addresses are static.
    graph = None
    if device.type == "cuda":
        # Warm up on a side stream before capture (recommended for graph capture).
        stream = torch.cuda.Stream()
        torch.cuda.synchronize()
        with torch.cuda.stream(stream):
            D.copy_(A).add_(B)          # D = A + B
            E.copy_(D).mul_(C).add_(B)  # E = D * C + B
            _ = E.sum()
        torch.cuda.synchronize()

        g = torch.cuda.CUDAGraph()
        s_buf = torch.zeros((), dtype=torch.float32, device=device)
        # torch.cuda.graph() handles capture_begin/capture_end and runs the
        # capture on its own non-default stream.
        with torch.cuda.graph(g):
            D.copy_(A).add_(B)
            E.copy_(D).mul_(C).add_(B)
            s_tmp = E.sum()
            s_buf.copy_(s_tmp)
        graph = (g, s_buf)

    # timing
    if device.type == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()

    total = 0.0
    for _ in range(iters):
        if graph is not None:
            g, s_buf = graph
            g.replay()
            torch.cuda.synchronize()
            total += float(s_buf.item())
        else:
            # CPU path (no graph captured)
            D.copy_(A).add_(B)
            E.copy_(D).mul_(C).add_(B)
            s = E.sum()
            total += float(s.item())

    if device.type == "cuda":
        torch.cuda.synchronize()  # make sure all queued work is done before stopping the clock
    t1 = time.perf_counter()

    ms = (t1 - t0) * 1000.0
    bytes_per_iter = 7.0 * N * 4.0
    gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9

    print("PyTorch optimized")
    print(f"device={device}")
    print(f"N={N} iters={iters}")
    print(f"time_ms={ms:.3f}")
    print(f"throughput_GBps={gbps:.3f}")
    print(f"result={total}")


if __name__ == "__main__":
    main()