This commit is contained in:
sherlock 2025-09-06 09:41:18 +05:30
commit ea7dcba939
6 changed files with 481 additions and 0 deletions

55
pytorch/baseline.py Normal file
View file

@ -0,0 +1,55 @@
import argparse
import time
import torch
def main():
p = argparse.ArgumentParser()
p.add_argument("--N", type=int, default=100_000_000)
p.add_argument("--iters", type=int, default=10)
p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
args = p.parse_args()
torch.manual_seed(0)
N = args.N
iters = args.iters
device = torch.device(args.device)
A = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
B = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
C = torch.empty(N, dtype=torch.float32).uniform_(-1, 1)
A = A.to(device)
B = B.to(device)
C = C.to(device)
# warmup
D = A + B
E = D * C + B
_ = E.sum()
if device.type == "cuda":
torch.cuda.synchronize()
t0 = time.perf_counter()
total = 0.0
for _ in range(iters):
D = A + B
E = D * C + B
s = E.sum()
if device.type == "cuda":
torch.cuda.synchronize()
total += float(s.item())
t1 = time.perf_counter()
ms = (t1 - t0) * 1000.0
bytes_per_iter = 7.0 * N * 4.0
gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9
print("PyTorch baseline")
print(f"device={device}")
print(f"N={N} iters={iters}")
print(f"time_ms={ms:.3f}")
print(f"throughput_GBps={gbps:.3f}")
print(f"result={total}")
if __name__ == "__main__":
main()

95
pytorch/optimized.py Normal file
View file

@ -0,0 +1,95 @@
import argparse
import time
import torch
def main():
p = argparse.ArgumentParser()
p.add_argument("--N", type=int, default=100_000_000)
p.add_argument("--iters", type=int, default=10)
args = p.parse_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)
N = args.N
iters = args.iters
# Use pinned memory for faster H2D, ensure contiguity and dtype
A_cpu = torch.empty(N, dtype=torch.float32, pin_memory=True).uniform_(-1, 1)
B_cpu = torch.empty(N, dtype=torch.float32, pin_memory=True).uniform_(-1, 1)
C_cpu = torch.empty(N, dtype=torch.float32, pin_memory=True).uniform_(-1, 1)
non_blocking = True if device.type == "cuda" else False
A = A_cpu.to(device, non_blocking=non_blocking)
B = B_cpu.to(device, non_blocking=non_blocking)
C = C_cpu.to(device, non_blocking=non_blocking)
# Pre-allocate outputs to reduce allocations
D = torch.empty_like(A)
E = torch.empty_like(A)
# Optional: enable TF32 on Ampere+ for throughput (math still float32-ish)
if device.type == "cuda":
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# CUDA Graphs to reduce launch overhead when shapes are static
graph = None
s_out = torch.zeros((), dtype=torch.float32, device=device)
if device.type == "cuda":
stream = torch.cuda.Stream()
torch.cuda.synchronize()
with torch.cuda.stream(stream):
D.copy_(A).add_(B, alpha=1.0) # warm
E.copy_(D).mul_(C).add_(B)
_ = E.sum()
torch.cuda.synchronize()
g = torch.cuda.CUDAGraph()
s_buf = torch.zeros((), dtype=torch.float32, device=device)
torch.cuda.synchronize()
g.capture_begin()
D.copy_(A).add_(B, alpha=1.0)
E.copy_(D).mul_(C).add_(B)
s_tmp = E.sum()
s_buf.copy_(s_tmp)
g.capture_end()
graph = (g, s_buf)
# timing
if device.type == "cuda":
torch.cuda.synchronize()
t0 = time.perf_counter()
total = 0.0
for _ in range(iters):
if graph is not None:
g, s_buf = graph
g.replay()
if device.type == "cuda":
torch.cuda.synchronize()
total += float(s_buf.item())
else:
# CPU path or no graph
D.copy_(A).add_(B, alpha=1.0)
E.copy_(D).mul_(C).add_(B)
s = E.sum()
total += float(s.item())
t1 = time.perf_counter()
if device.type == "cuda":
torch.cuda.synchronize()
ms = (t1 - t0) * 1000.0
bytes_per_iter = 7.0 * N * 4.0
gbps = (bytes_per_iter * iters) / (t1 - t0) / 1e9
print("PyTorch optimized")
print(f"device={device}")
print(f"N={N} iters={iters}")
print(f"time_ms={ms:.3f}")
print(f"throughput_GBps={gbps:.3f}")
print(f"result={total}")
if __name__ == "__main__":
main()