NsightCompute教程入门
一、定义
- 使用pycuda 获取显卡的性能
- 手动绘制roofline model.
- nsight system 安装教程
- nsight system 使用教程–nsight system 分析pytorch 神经网络
- nsight compute 安装教程
- nsight compute 使用教程
- nsight system 与nsight compute 应用区别
- 遇到的问题
二、实现
https://blog.csdn.net/weixin_40653140/article/details/136238420
https://blog.csdn.net/m0_61864577/article/details/140022323
https://developer.nvidia.cn/tools-overview
1. 使用pycuda 获取显卡的性能
test.py
"""Query GPU 0 through PyCUDA and estimate its theoretical FP32 peak."""

# FP32 CUDA cores per SM for compute capabilities that differ within a
# generation.  Values follow the table NVIDIA ships in the CUDA samples
# (helper_cuda.h, _ConvertSMVer2Cores).  Note that 8.0 (e.g. A100) has 64
# cores per SM while 8.6/8.7/8.9 have 128, and 6.1 has 128 while 6.0 has 64.
_CORES_BY_CAPABILITY = {
    (2, 1): 48,
    (6, 0): 64,
    (6, 1): 128,
    (6, 2): 128,
    (7, 0): 64,
    (7, 2): 64,
    (7, 5): 64,
    (8, 0): 64,
    (8, 6): 128,
    (8, 7): 128,
    (8, 9): 128,
    (9, 0): 128,
}

# Generations where every minor revision not listed above shares one value.
_CORES_BY_MAJOR = {2: 32, 3: 192, 5: 128}


def get_gpu_compute_capability_and_clock_rate():
    """Read the properties of CUDA device 0 needed for the peak estimate.

    Returns:
        Tuple ``(compute_capability, clock_rate, sm_count, cores_per_sm)``
        where ``compute_capability`` is a ``(major, minor)`` tuple and
        ``clock_rate`` is in kHz.

    Raises:
        ValueError: if the device's compute capability is not in the table.
    """
    # Imported lazily so the pure helpers below can be imported and tested on
    # a machine without PyCUDA or a GPU.  Importing pycuda.autoinit creates
    # the CUDA context as a side effect.
    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as cuda

    device = cuda.Device(0)
    compute_capability = device.compute_capability()
    clock_rate = device.get_attribute(cuda.device_attribute.CLOCK_RATE)  # in kHz
    sm_count = device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
    cores_per_sm = get_cuda_cores_per_sm(compute_capability)
    return compute_capability, clock_rate, sm_count, cores_per_sm


def get_cuda_cores_per_sm(compute_capability):
    """Return the number of FP32 CUDA cores per SM.

    Args:
        compute_capability: ``(major, minor)`` pair.

    Returns:
        Cores per streaming multiprocessor as an int.

    Raises:
        ValueError: if the compute capability is unknown.
    """
    major, minor = compute_capability
    cores = _CORES_BY_CAPABILITY.get((major, minor))
    if cores is None:
        cores = _CORES_BY_MAJOR.get(major)
    if cores is None:
        raise ValueError(f"Unknown compute capability: {major}.{minor}")
    return cores


def calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm):
    """Return the theoretical FP32 peak in FLOP/s.

    Args:
        clock_rate: core clock in kHz.
        sm_count: number of streaming multiprocessors.
        cores_per_sm: FP32 cores per SM.
    """
    clock_rate_hz = clock_rate * 1e3  # Convert kHz to Hz
    # 2 FLOPs per clock per core (one fused multiply-add)
    flops = clock_rate_hz * sm_count * cores_per_sm * 2
    return flops


def _report():
    """Print the device properties and the derived theoretical peak."""
    compute_capability, clock_rate, sm_count, cores_per_sm = (
        get_gpu_compute_capability_and_clock_rate()
    )
    theoretical_flops = calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm)
    print(f"GPU compute capability: {compute_capability}")  # compute capability
    print(f"Clock rate (kHz): {clock_rate}")  # maximum clock rate
    print(f"Number of SMs: {sm_count}")  # number of streaming multiprocessors
    print(f"Cores per SM: {cores_per_sm}")  # cores per streaming multiprocessor
    print(f"Theoretical FLOPS for float32: {theoretical_flops / 1e12} TFLOPS")  # theoretical FP32 throughput


if __name__ == "__main__":
    _report()
2. 手动绘制roofline model.
roofline_model.py
绘图字体下载:https://blog.csdn.net/qq_35165004/article/details/132738991
import sys
import torch
import torch.nn as nn
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
import os
import argparse# 定义一个测试模型
# Defaults used when main() is called with an args object that does not carry
# the corresponding attributes.
# NOTE(review): 38.98368 TFLOPS equals 108 SMs x 128 cores x 2 x 1.41 GHz.  If
# the target is an A100 (compute capability 8.0, 64 FP32 cores per SM) the FP32
# peak is 19.49184 TFLOPS -- confirm and pass --peak_gflops accordingly.
DEFAULT_PEAK_GFLOPS = 38.98368 * 1e3  # GFLOP/s
DEFAULT_BANDWIDTH_GBS = 1555.0  # GB/s


class SimpleModel(nn.Module):
    """A single bias-free linear layer, used as the GEMM workload."""

    def __init__(self, input_features, output_features):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_features, output_features, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        return x


def sgemm_seconds(events):
    """Return the summed self CUDA time, in seconds, of the sgemm kernels.

    Args:
        events: iterable of profiler averages exposing ``key`` and
            ``self_cuda_time_total`` (microseconds).

    Raises:
        RuntimeError: if no event whose key contains ``"sgemm"`` recorded any
            time.  Without this check the bandwidth computation in ``main``
            would fail later with an uninformative ZeroDivisionError.
    """
    total_us = sum(ev.self_cuda_time_total for ev in events if "sgemm" in ev.key)
    if total_us <= 0:
        raise RuntimeError(
            'no CUDA time recorded for kernels whose name contains "sgemm"; '
            "check the profiler output for the actual GEMM kernel name"
        )
    return total_us / 1e6  # microseconds -> seconds


def main(args):
    """Profile an 8192x8192 linear layer and save a roofline plot.

    Args:
        args: parsed command-line namespace.  ``peak_gflops`` and
            ``bandwidth_gbs`` are read when present; ``warmup_only`` is
            accepted by the parser but not consulted here.

    Side effects: runs the model on the CUDA device, prints the profiler
    events and summary figures, and writes ``roofline_model.png``.
    """
    # Model and input data
    input_features = 8192
    output_features = 8192
    batch_size = 8192
    model = SimpleModel(input_features, output_features)
    input_data = torch.randn(batch_size, input_features)
    test_count = 10

    # fvcore counts one fused multiply-add as a single flop, whereas the peak
    # figure counts an FMA as two.  Multiply by 2 so that the measured point
    # and the roof use the same unit.
    macs_per_forward = FlopCountAnalysis(model, input_data).total()
    flops = 2 * macs_per_forward * test_count
    activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
    print("activations:", activations)

    # Parameter count
    params = sum(p.numel() for p in model.parameters())

    # Memory traffic is approximated as activations + parameters, 4 bytes each
    # (float32), once per profiled forward pass.
    bytes_per_element = 4
    memory_access = (activations + params) * bytes_per_element * test_count

    model = model.cuda()
    input_data = input_data.float().cuda()

    # Warm-up iterations, excluded from the measurement
    for _ in range(5):
        model(input_data)
    torch.cuda.synchronize()

    # Measure kernel time with the PyTorch profiler
    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
        for _ in range(test_count):
            model(input_data)
    key_averages = prof.key_averages()
    for ev in key_averages:
        print(ev)
    total_cuda_time = sgemm_seconds(key_averages)

    # FLOPs -> GFLOPs
    flops_measured_glops = flops / 1e9

    # Measured memory bandwidth
    memory_access_gb = memory_access / 1e9
    bandwidth_measured = memory_access_gb / total_cuda_time  # GB/s
    print("bandwidth_measured:", bandwidth_measured)

    # Peak compute and bandwidth of the GPU
    peak_performance = getattr(args, "peak_gflops", DEFAULT_PEAK_GFLOPS)  # GFLOP/s
    memory_bandwidth = getattr(args, "bandwidth_gbs", DEFAULT_BANDWIDTH_GBS)  # GB/s

    # Roofline curve: attainable performance for each arithmetic intensity,
    # capped at the compute peak.
    Io = np.logspace(-2, 4, 100)  # GFLOPs/GB
    performance = np.minimum(peak_performance, Io * memory_bandwidth)
    ridge_point = peak_performance / memory_bandwidth

    # ------------------------------------------------------------------
    # Draw the roofline model
    plt.figure(figsize=(10, 6))
    threshold = 0.75  # fraction of the roof regarded as good utilisation

    # Font with CJK glyphs for the Chinese labels
    font_path = 'simsun.ttc'  # replace with the path to your font file
    font_prop = FontProperties(fname=font_path)

    # Bandwidth-bound region
    x = Io[Io < ridge_point]
    roof_bw = np.minimum(peak_performance, x * memory_bandwidth)
    plt.fill_between(x, roof_bw * threshold, roof_bw,
                     color='lightblue', alpha=0.6, label='Bandwidth Bound')

    # Compute-bound region
    x2 = Io[Io >= ridge_point]
    roof_compute = np.minimum(peak_performance, x2 * memory_bandwidth)
    plt.fill_between(x2, roof_compute * threshold, roof_compute,
                     color='green', alpha=0.6, label='Compute Bound')

    # Poor-performance region below the threshold
    plt.fill_between(Io, 0, performance * threshold,
                     color='gray', alpha=0.6, label='poor performance')

    plt.axhline(y=peak_performance, color='b', linestyle='--',
                label=f'峰值计算能力:{peak_performance / 1e3:.2f}TFLOPs')
    plt.axvline(x=ridge_point, color='g', linestyle='--',
                label=f'{peak_performance / memory_bandwidth:.2f}GFLOPs/GB')
    plt.loglog(Io, performance, label='Roofline')

    # Arithmetic intensity is a static property of the algorithm
    arithmetic_intensity_measured = flops_measured_glops / memory_access_gb  # GFLOPs/GB
    point_y = arithmetic_intensity_measured * bandwidth_measured
    plt.scatter(arithmetic_intensity_measured, point_y, c='r',
                label=f'Measured Points {point_y / 1e3:.2f} TFLOPs/sec {point_y * 100 / peak_performance:.2f}%')

    plt.xlabel('操作强度 [GFLOPs/GB]', fontproperties=font_prop)
    plt.ylabel('性能 [GFLOPs/sec]', fontproperties=font_prop)
    plt.title('Roofline 模型', fontproperties=font_prop)
    plt.legend(prop=font_prop)

    # Save the figure instead of displaying it
    plt.savefig('roofline_model.png')
    plt.close()

    print(f"FLOPs: {flops} FLOPs")
    print(f"内存访问量: {memory_access} 字节")
    print(f"执行时间: {total_cuda_time:.4f} 秒")
    print(f"理论值的:{point_y * 100 / peak_performance:.2f}%")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument("--warmup_only", action="store_true", help="warmup_only")
    parser.add_argument("--peak_gflops", type=float, default=DEFAULT_PEAK_GFLOPS,
                        help="peak FP32 throughput of the GPU in GFLOP/s")
    parser.add_argument("--bandwidth_gbs", type=float, default=DEFAULT_BANDWIDTH_GBS,
                        help="peak memory bandwidth of the GPU in GB/s")
    args = parser.parse_args()
    main(args)
python roofline_model.py
理论值24.32%,即算力只使用了24%。
如下图,29.87% 为pytorch 实现,37.80%为triton 算子实现,因此,pytorch改进方向为提升模型的算力,降低执行时间。
提升模型的flops,即改进算子的速度、冗余度。
减少内存读取大小,可以量化、改进算子计算冗余度。
3. nsight system 安装教程
官方文档:https://docs.nvidia.com/nsight-systems/UserGuide/index.html#profiling-windows-targets-from-the-gui
nsight system 需要安装服务器端和客户端
方式一: linux服务器端采用命令行执行,获取结果文件,window客户端读取文件
linux服务器端安装:cli 文件
apt install ./NsightSystems-linux-cli-public-2023.2.1.122-3259852.deb
windows 端安装:选择Windows Host下载安装
- 使用:demo:1. linux 端生成分析结果文件(.nsys-rep)
#示例,产生的nsight_t5_fp16下载到本地用nsight工具查看
nsys profile -o nsight_t5_fp16 -f true python temp.py
- window 端加载文件
文件-->打开:加载nsys-rep文件
方式二: window 服务器执行命令,window客户端远程连接服务器,实时读取文件
1. 以管理员的方式打开cmd,并执行指令
>>D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py
2. windows host 客户端打开nsys-rep软件
》》>>file–>open–>baseline.nsys-rep
4. nsight system 使用教程–nsight system 分析pytorch 神经网络
# Bracket a region of interest with an NVTX range so that it appears as a
# named span in the profile.  Every range_push must be matched by a range_pop.
import torch.cuda.nvtx as nvtx
nvtx.range_push("copy to device") # open a range labelled "copy to device"
nvtx.range_pop() # close the most recently opened range
也可以使用
import time  # required by time.sleep below; the original snippet omitted it

import nvtx


@nvtx.annotate(color="blue")
def my_function():
    """Sleep 0..4 seconds in turn, wrapping each sleep in a red NVTX range.

    The decorator annotates the whole call in blue; the context manager adds
    one nested "my_loop" range per iteration.
    """
    for i in range(5):
        with nvtx.annotate("my_loop", color="red"):
            time.sleep(i)


my_function()
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
print(torch.cuda.is_available())
import torch.cuda.nvtx as nvtx


class Net(nn.Module):
    """Small CNN for 1x28x28 MNIST digits: two convolutions, two linear layers."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch, marking each phase with an NVTX range.

    Args:
        args: namespace providing ``log_interval`` and ``dry_run``.
        model: network to train.
        device: device the batches are moved to.
        train_loader: iterable of ``(data, target)`` batches.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the log line.

    NVTX ranges are kept balanced: a "data loading" range is open whenever the
    loop is waiting for the next batch, and it is closed either at the top of
    the next iteration or, once the loader is exhausted, by the ``else``
    clause.  The original pushed that range only once before the loop while
    popping it on every iteration.
    """
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))

        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()

        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()

        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_pop()  # batch
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                # No range is open at this point, so leaving here is balanced.
                break
        nvtx.range_push("data loading")
    else:
        # Loader exhausted: close the range opened for the fetch that ended it.
        nvtx.range_pop()


def main():
    """Parse the command line, build the MNIST loaders and train the model."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    # The help text previously claimed a default of 14 while the default is 3.
    parser.add_argument('--epochs', type=int, default=3, metavar='N',
                        help='number of epochs to train (default: 3)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    # Built as in the source, but the evaluation call is commented out there,
    # so this loader is currently unused.
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()

# Profile with Nsight Systems (Windows example):
# D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda,nvtx,cudnn -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py
客户端加载baseline.nsys-rep 文件。
5. nsight compute 安装教程
https://developer.nvidia.com/tools-overview/nsight-compute/get-started
下载linux 版本,sh nsight-compute-linux-2024.2.1.2-34372528.run
安装默认路径: /usr/local/NVIDIA-Nsight-Compute
window 安装:
安装windows 版本
6 nsight compute 使用教程
》》 >>ncu --list-sets
采用c++ 代码测试
#include <stdio.h>

// kernel_A: each thread adds A[idx] to a local accumulator M times and writes
// the sum back to A[idx].
__global__ void kernel_A(double* A, int N, int M)
{
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel A\n");
    if (idx < N) {
#pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

// kernel_B: same body as kernel_A.  main() launches it with 48 KiB of dynamic
// shared memory that the kernel itself never declares or uses.
__global__ void kernel_B(double* A, int N, int M)
{
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
#pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

// kernel_C: reads B with a stride of 16 elements between neighbouring threads
// and writes 2 * B[strided_idx] to A[idx].
__global__ void kernel_C(double* A, const double* B, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel C\n");
    // Strided memory access: warp 0 accesses (0, stride, 2*stride, ...), warp 1 accesses
    // (1, stride + 1, 2*stride + 1, ...).
    const int stride = 16;
    int strided_idx = threadIdx.x * stride + blockIdx.x % stride
                      + (blockIdx.x / stride) * stride * blockDim.x;
    // Guard the write index as well: strided_idx < N does not imply idx < N
    // unless N is a multiple of stride * blockDim.x.
    if (idx < N && strided_idx < N) {
        A[idx] = B[strided_idx] + B[strided_idx];
    }
}

int main()
{
    double* A;
    double* B;
    int N = 80 * 2048 * 100;
    size_t sz = N * sizeof(double);

    cudaMalloc((void**) &A, sz);
    cudaMalloc((void**) &B, sz);
    cudaMemset(A, 0, sz);
    cudaMemset(B, 0, sz);

    int threadsPerBlock = 64;
    int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    int M = 10000;

    kernel_A<<<numBlocks, threadsPerBlock>>>(A, N, M);

    // Allow kernel_B to be launched with 48 KiB of dynamic shared memory.
    cudaFuncSetAttribute(kernel_B, cudaFuncAttributeMaxDynamicSharedMemorySize, 48 * 1024);
    kernel_B<<<numBlocks, threadsPerBlock, 48 * 1024>>>(A, N, M);

    kernel_C<<<numBlocks, threadsPerBlock>>>(A, B, N);

    // Wait for all kernels and surface any launch or execution error.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(A);
    cudaFree(B);
    return err == cudaSuccess ? 0 : 1;
}
将c++ 文件编译
》》>>nvcc -o kernel_abc kernel_abc.cu
采用nsight compute 进行profile
》》>>ncu --set detailed -o kernel_abc ./test/kernel_abc
window 客户端打开
python 测试
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
print(torch.cuda.is_available())
#print(torch.cuda.list_gpu_processes())
import torch.cuda.nvtx as nvtx
print(torch.cuda.device_count())


class Net(nn.Module):
    """Small CNN for 1x28x28 MNIST digits: two convolutions, two linear layers."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    """Run a short training epoch, marking each phase with an NVTX range.

    Args:
        args: namespace providing ``log_interval`` and ``dry_run``.
        model: network to train.
        device: device the batches are moved to.
        train_loader: iterable of ``(data, target)`` batches.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the log line.

    The loop stops after batch index 10 so that the profiled run stays short.
    NVTX ranges are kept balanced: a "data loading" range is open whenever the
    loop is waiting for the next batch, and it is closed either at the top of
    the next iteration or, once the loader is exhausted, by the ``else``
    clause.  The original pushed that range only once before the loop while
    popping it on every iteration.
    """
    model.train()
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))

        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()

        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()

        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()

        nvtx.range_pop()  # batch
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                # No range is open at this point, so leaving here is balanced.
                break
        # NOTE(review): indentation was lost in the source; this early exit is
        # assumed to sit at loop level rather than inside the logging branch.
        if batch_idx > 10:
            break
        nvtx.range_push("data loading")
    else:
        # Loader exhausted: close the range opened for the fetch that ended it.
        nvtx.range_pop()


def main():
    """Parse the command line, build the MNIST loaders and train the model."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    # The help text previously claimed a default of 14 while the default is 1.
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    # Built as in the source, but the evaluation call is commented out there,
    # so this loader is currently unused.
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()

# Profile with Nsight Compute (shell command, not part of the script):
# ncu --set roofline -o models python test/test1.py
windows 客户端加载models.ncu-rep
7 nsight system 与nsight compute 应用区别
nsight system 倾向于全局角度分析核函数的读写、调度、内存占用率,cpu与gpu的异步调用
nsight compute 倾向于核函数内部操作,不同模块的执行时间、吞吐量、带宽分析等。
8 遇到的问题
8.1. 加载到98%,卡住不动。原因:笔记本电脑性能低,带不动。换台式机即可。
8.2 ERROR ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
解决:添加root 权限
docker run -it --gpus all --name llm-04 -v /jiayafei_linux/:/home/ -p38014:8014 --privileged=true pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel /bin/bash