当前位置：首页 > news >正文

NsightCompute教程入门

news 来源：原创 2024/9/21 1:37:29

一、定义

使用pycuda 获取显卡的性能
手动绘制roofline model.
nsight system 安装教程
nsight system 使用教程–nsight system 分析pytorch 神经网络
nsight compute 安装教程
nsight compute 使用教程
nsight system 与nsight compute 应用区别
遇到的问题

二、实现

https://blog.csdn.net/weixin_40653140/article/details/136238420
https://blog.csdn.net/m0_61864577/article/details/140022323
https://developer.nvidia.cn/tools-overview

使用pycuda 获取显卡的性能
test.py

"""Query GPU 0 through PyCUDA and estimate its theoretical FP32 throughput."""

# FP32 CUDA cores per streaming multiprocessor, keyed by (major, minor)
# compute capability.  Values follow NVIDIA's own table in the CUDA samples
# (helper_cuda.h, _ConvertSMVer2Cores).  Note in particular that 8.0 (A100)
# has 64 cores per SM while 8.6 has 128, and 6.0 has 64 while 6.1/6.2 have 128.
_CORES_PER_SM = {
    (2, 0): 32,
    (2, 1): 48,
    (3, 0): 192,
    (3, 2): 192,
    (3, 5): 192,
    (3, 7): 192,
    (5, 0): 128,
    (5, 2): 128,
    (5, 3): 128,
    (6, 0): 64,
    (6, 1): 128,
    (6, 2): 128,
    (7, 0): 64,
    (7, 2): 64,
    (7, 5): 64,
    (8, 0): 64,
    (8, 6): 128,
    (8, 7): 128,
    (8, 9): 128,
    (9, 0): 128,
}


def get_gpu_compute_capability_and_clock_rate():
    """Read the basic compute parameters of CUDA device 0.

    Returns:
        A tuple ``(compute_capability, clock_rate, sm_count, cores_per_sm)``
        where ``compute_capability`` is a ``(major, minor)`` pair,
        ``clock_rate`` is in kHz, and ``cores_per_sm`` comes from
        :func:`get_cuda_cores_per_sm`.

    Raises:
        ValueError: If the device's compute capability is not in the table.
    """
    # Imported lazily so the pure helpers in this file can be imported (and
    # tested) on machines that have neither PyCUDA nor a GPU.
    import pycuda.autoinit  # noqa: F401  (imported for its side effect: CUDA init)
    import pycuda.driver as cuda

    device = cuda.Device(0)
    compute_capability = device.compute_capability()
    clock_rate = device.get_attribute(cuda.device_attribute.CLOCK_RATE)  # in kHz
    sm_count = device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
    cores_per_sm = get_cuda_cores_per_sm(compute_capability)
    return compute_capability, clock_rate, sm_count, cores_per_sm


def get_cuda_cores_per_sm(compute_capability):
    """Return the number of FP32 CUDA cores per SM for a compute capability.

    Args:
        compute_capability: ``(major, minor)`` pair, e.g. ``(8, 0)``.

    Raises:
        ValueError: If the compute capability is not in the lookup table.
    """
    major, minor = compute_capability
    try:
        return _CORES_PER_SM[(major, minor)]
    except KeyError:
        raise ValueError("Unknown compute capability") from None


def calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm):
    """Return the theoretical peak FP32 FLOP/s.

    Args:
        clock_rate: Core clock in kHz.
        sm_count: Number of streaming multiprocessors.
        cores_per_sm: FP32 cores per streaming multiprocessor.
    """
    clock_rate_hz = clock_rate * 1e3  # Convert kHz to Hz
    # 2 FLOPs per clock per core: one fused multiply-add counts as two.
    flops = clock_rate_hz * sm_count * cores_per_sm * 2
    return flops


def main():
    """Print the properties and theoretical FP32 peak of GPU 0."""
    compute_capability, clock_rate, sm_count, cores_per_sm = (
        get_gpu_compute_capability_and_clock_rate()
    )
    theoretical_flops = calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm)

    print(f"GPU compute capability: {compute_capability}")  # compute capability
    print(f"Clock rate (kHz): {clock_rate}")  # maximum clock rate
    print(f"Number of SMs: {sm_count}")  # streaming multiprocessor count
    print(f"Cores per SM: {cores_per_sm}")  # FP32 cores per SM
    print(f"Theoretical FLOPS for float32: {theoretical_flops / 1e12} TFLOPS")  # theoretical peak


if __name__ == "__main__":
    main()

在这里插入图片描述
2. 手动绘制roofline model.
roofline_model.py
绘图字体下载：https://blog.csdn.net/qq_35165004/article/details/132738991

import argparse
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from fvcore.nn import ActivationCountAnalysis, FlopCountAnalysis
from matplotlib.font_manager import FontProperties

# Hardware ceilings used when none are supplied on the command line.  They
# describe an NVIDIA A100-40GB: 108 SMs x 64 FP32 cores x 1.41 GHz x 2 FLOPs
# per FMA = 19.49 TFLOPS, with 1555 GB/s of memory bandwidth.
DEFAULT_PEAK_GFLOPS = 19.49184 * 1e3
DEFAULT_BANDWIDTH_GBS = 1555.0
# Substring a profiler event name must contain to count as the GEMM kernel.
DEFAULT_KERNEL_FILTER = "sgemm"


class SimpleModel(nn.Module):
    """A single bias-free linear layer, used as the GEMM workload."""

    def __init__(self, input_features, output_features):
        super().__init__()
        self.fc1 = nn.Linear(input_features, output_features, bias=False)

    def forward(self, x):
        return self.fc1(x)


def main(args):
    """Profile one linear layer on the GPU and save its roofline plot.

    Args:
        args: Parsed command-line namespace.  ``peak_gflops``,
            ``memory_bandwidth`` and ``kernel_filter`` are read when present;
            missing attributes fall back to the module defaults.

    Raises:
        RuntimeError: If the profiler recorded no kernel matching the filter,
            so no execution time (and hence no throughput) can be computed.

    Side effects:
        Writes ``roofline_model.png`` to the current directory and prints the
        measured numbers.
    """
    peak_performance = getattr(args, "peak_gflops", DEFAULT_PEAK_GFLOPS)  # GFLOPs/s
    memory_bandwidth = getattr(args, "memory_bandwidth", DEFAULT_BANDWIDTH_GBS)  # GB/s
    kernel_filter = getattr(args, "kernel_filter", DEFAULT_KERNEL_FILTER)

    # Model and input data.
    input_features = 8192
    output_features = 8192
    batch_size = 8192
    model = SimpleModel(input_features, output_features)
    input_data = torch.randn(batch_size, input_features)
    test_count = 10

    # fvcore counts one fused multiply-add as ONE flop, whereas the peak above
    # counts it as two.  Multiply by 2 so both sides use the same unit.
    flops = FlopCountAnalysis(model, input_data).total() * 2 * test_count
    activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
    print("activations:", activations)

    # Number of parameters.
    params = sum(p.numel() for p in model.parameters())

    # Memory traffic is approximated as activations + parameters, 4 bytes each
    # (everything is float32), repeated for every profiled forward pass.
    bytes_per_element = 4
    memory_access = (activations + params) * bytes_per_element * test_count

    model = model.cuda()
    input_data = input_data.float().cuda()

    # Warm-up runs so one-time initialisation cost is not profiled.
    for _ in range(5):
        model(input_data)
    torch.cuda.synchronize()

    # Use the PyTorch profiler to collect per-kernel CUDA time.
    with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
        for _ in range(test_count):
            model(input_data)
        # Make sure every launched kernel has finished before profiling stops.
        torch.cuda.synchronize()
    key_averages = prof.key_averages()
    for ev in key_averages:
        print(ev)

    # self_cuda_time_total is reported in microseconds; convert to seconds.
    total_cuda_time = sum(
        event.self_cuda_time_total for event in key_averages if kernel_filter in event.key
    ) / 1e6
    if total_cuda_time <= 0:
        raise RuntimeError(
            f"No profiled CUDA kernel name contains {kernel_filter!r}; "
            "cannot compute the measured throughput. Check the event names "
            "printed above and pass a matching --kernel-filter."
        )

    # FLOPs converted to GFLOPs.
    flops_measured_gflops = flops / 1e9
    # Measured memory bandwidth.
    memory_access_gb = memory_access / 1e9
    bandwidth_measured = memory_access_gb / total_cuda_time  # GB/s
    print("bandwidth_measured:", bandwidth_measured)

    # Roofline curve: attainable GFLOPs/s at each arithmetic intensity, capped
    # by the compute peak.
    Io = np.logspace(-2, 4, 100)  # GFLOPs/GB
    performance = np.minimum(peak_performance, Io * memory_bandwidth)
    ridge_point = peak_performance / memory_bandwidth

    # ==========================================================================
    # Draw the roofline model.
    plt.figure(figsize=(10, 6))
    threshold = 0.75  # fraction of the roof regarded as "good" utilisation
    # Font with CJK glyphs for the Chinese labels.
    font_path = 'simsun.ttc'  # replace with the path to your font file
    font_prop = FontProperties(fname=font_path)

    # Bandwidth-bound region (left of the ridge point).
    x = Io[Io < ridge_point]
    roof_x = np.minimum(peak_performance, x * memory_bandwidth)
    plt.fill_between(x, roof_x * threshold, roof_x,
                     color='lightblue', alpha=0.6, label='Bandwidth Bound')

    # Compute-bound region (right of the ridge point).
    x2 = Io[Io >= ridge_point]
    roof_x2 = np.minimum(peak_performance, x2 * memory_bandwidth)
    plt.fill_between(x2, roof_x2 * threshold, roof_x2,
                     color='green', alpha=0.6, label='Compute Bound')

    # Low-performance region (below the utilisation threshold).
    plt.fill_between(Io, 0, performance * threshold,
                     color='gray', alpha=0.6, label='poor performance')

    plt.axhline(y=peak_performance, color='b', linestyle='--',
                label=f'峰值计算能力:{peak_performance / 1e3:.2f}TFLOPs')
    plt.axvline(x=ridge_point, color='g', linestyle='--',
                label=f'{peak_performance / memory_bandwidth:.2f}GFLOPs/GB')
    plt.loglog(Io, performance, label='Roofline')

    # Arithmetic intensity is a static property of the algorithm (GFLOPs/GB);
    # intensity x measured bandwidth gives the achieved GFLOPs/s.
    arithmetic_intensity_measured = flops_measured_gflops / memory_access_gb
    point_y = arithmetic_intensity_measured * bandwidth_measured
    plt.scatter(arithmetic_intensity_measured, point_y, c='r',
                label=f'Measured Points {point_y / 1e3:.2f} TFLOPs/sec {point_y * 100 / peak_performance:.2f}%')

    plt.xlabel('操作强度 [GFLOPs/GB]', fontproperties=font_prop)
    plt.ylabel('性能 [GFLOPs/sec]', fontproperties=font_prop)
    plt.title('Roofline 模型', fontproperties=font_prop)
    plt.legend(prop=font_prop)

    # Save the figure instead of showing it.
    plt.savefig('roofline_model.png')
    plt.close()

    print(f"FLOPs: {flops} FLOPs")
    print(f"内存访问量: {memory_access} 字节")
    print(f"执行时间: {total_cuda_time:.4f} 秒")
    print(f"理论值的:{point_y * 100 / peak_performance:.2f}%")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process some integers.')
    # NOTE(review): --warmup_only is parsed but never read by main(); it is
    # kept so existing command lines that pass it keep working.
    parser.add_argument("--warmup_only", action="store_true", help="warmup_only")
    parser.add_argument("--peak-gflops", type=float, default=DEFAULT_PEAK_GFLOPS,
                        help="peak FP32 throughput of the GPU in GFLOPs/s")
    parser.add_argument("--memory-bandwidth", type=float, default=DEFAULT_BANDWIDTH_GBS,
                        help="peak memory bandwidth of the GPU in GB/s")
    parser.add_argument("--kernel-filter", default=DEFAULT_KERNEL_FILTER,
                        help="substring identifying the GEMM kernel in profiler event names")
    args = parser.parse_args()
    main(args)

python roofline_model.py

在这里插入图片描述

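理论值24.32%，即算力只使用了24%。
注意：该 24.32% 是以 38.98 TFLOPs 为峰值、并按 fvcore 的统计口径（一次乘加 FMA 记为 1 FLOP）计算得到的。计算能力 8.0 的 GPU 每个 SM 实际只有 64 个 FP32 核心，对应的 FP32 峰值约为 19.49 TFLOPs；而峰值公式中一次 FMA 按 2 FLOPs 计。两处口径统一后，实际算力利用率会明显高于 24%。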
如下图，29.87% 为pytorch 实现，37.80%为triton 算子实现，因此，pytorch改进方向为提升模型的算力，降低执行时间。
提升模型的flops,即改进算子的速度、冗余度。
减少内存读取大小，可以量化、改进算子计算冗余度。在这里插入图片描述
3. nsight system 安装教程
官方文档：https://docs.nvidia.com/nsight-systems/UserGuide/index.html#profiling-windows-targets-from-the-gui
nsight system 需要安装服务器端和客户端
方式一： linux服务器端采用命令行执行，获取结果文件，window客户端读取文件
linux服务器端安装：cli 文件

apt install ./NsightSystems-linux-cli-public-2023.2.1.122-3259852.deb

在这里插入图片描述
windows 端安装：选择Windows Host下载安装

使用：demo:1. linux 生成性能分析报告文件（.nsys-rep）

#示例，产生的nsight_t5_fp16下载到本地用nsight工具查看
nsys profile -o nsight_t5_fp16 -f true python temp.py

window 端加载文件

文件-->打开：加载nsys-rep文件

方式二： window 服务器执行命令，window客户端远程连接服务器，实时读取文件
1. 以管理员的方式打开cmd,并执行指令

>>D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py

在这里插入图片描述
2. windows host 客户端打开nsys-rep软件
》》>>file–>open–>baseline.nsys-rep

在这里插入图片描述
4. nsight system 使用教程–nsight system 分析pytorch 神经网络

# Minimal NVTX usage: bracket a region of code with a named range.
import torch.cuda.nvtx as nvtx
nvtx.range_push("copy to device")    # open a range labelled "copy to device"
nvtx.range_pop()                     # close the most recently opened range
也可以使用
import time

import nvtx


@nvtx.annotate(color="blue")
def my_function():
    """Sleep for 0, 1, 2, 3 and 4 seconds, one "my_loop" NVTX range per sleep.

    The decorator additionally annotates every call of the function itself.
    """
    for i in range(5):
        with nvtx.annotate("my_loop", color="red"):
            time.sleep(i)


my_function()

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
print(torch.cuda.is_available())
import torch.cuda.nvtx as nvtx


class Net(nn.Module):
    """Small CNN for 1x28x28 MNIST images: two conv layers, two linear layers."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 28x28 -> conv 3x3 -> 26x26 -> conv 3x3 -> 24x24 -> pool 2 -> 12x12;
        # 64 channels * 12 * 12 = 9216 features.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    """Train for one epoch, marking each phase with an NVTX range.

    Ranges emitted: "data loading" (time spent fetching the next batch),
    "batch:<idx>" (the whole step) and, nested inside it, "copy to device",
    "forward pass" and "backward pass".  Every push has exactly one matching
    pop, whether the loop runs to completion or stops on ``--dry-run``.

    Args:
        args: Parsed CLI namespace; ``log_interval`` and ``dry_run`` are read.
        model: Network to train.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, target)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        epoch: Epoch number, used only in the log line.
    """
    model.train()
    # The range must already be open while the ``for`` statement fetches a
    # batch, so it is pushed before the loop and again at the end of each
    # iteration, and popped as soon as the batch has arrived.
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))

        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()  # copy to device

        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()  # forward pass

        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()  # backward pass

        nvtx.range_pop()  # batch

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                # No "data loading" range is open at this point.
                break
        nvtx.range_push("data loading")
    else:
        # Loader exhausted: close the range opened for the final fetch.
        nvtx.range_pop()


def main():
    """Parse the CLI, build the MNIST loaders and model, and train."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=3, metavar='N',
                        help='number of epochs to train (default: 3)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    # No evaluation pass is run in this script, so test_loader is built but
    # not iterated.
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()

# Profile with:
# D:\softs\nsightsystem\target-windows-x64\nsys profile -t cuda,nvtx,cudnn -o baseline -w true D:/softs/anacondas/envs/chatgml6/python D:/cnki_1/model_predict/test1.py

客户端加载baseline.nsys-rep 文件。
5. nsight compute 安装教程

https://developer.nvidia.com/tools-overview/nsight-compute/get-started
下载linux 版本，sh nsight-compute-linux-2024.2.1.2-34372528.run
安装默认路径： /usr/local/NVIDIA-Nsight-Compute
在这里插入图片描述
window 安装：
安装windows 版本

6 nsight compute 使用教程
》》 >>ncu --list-sets

在这里插入图片描述
采用c++ 代码测试

#include <stdio.h>

// Each thread reads A[idx] M times, accumulates the values and writes the sum
// back to A[idx].
__global__ void kernel_A(double* A, int N, int M)
{
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel A\n");
    if (idx < N) {
#pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

// Same computation as kernel_A; main() launches it with 48 KiB of dynamic
// shared memory per block, so the two can be compared in the profiler.
__global__ void kernel_B(double* A, int N, int M)
{
    double d = 0.0;
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
#pragma unroll(100)
        for (int j = 0; j < M; ++j) {
            d += A[idx];
        }
        A[idx] = d;
    }
}

// Writes A[idx] = 2 * B[strided_idx], reading B with a stride of 16 elements.
__global__ void kernel_C(double* A, const double* B, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // printf("Kernel C\n");
    // Strided memory access: block 0 accesses (0, stride, 2*stride, ...),
    // block 1 accesses (1, stride + 1, 2*stride + 1, ...).
    const int stride = 16;
    int strided_idx = threadIdx.x * stride + blockIdx.x % stride
                      + (blockIdx.x / stride) * stride * blockDim.x;
    // Guard both the read and the write index so that no launch
    // configuration can write past the end of A.
    if (idx < N && strided_idx < N) {
        A[idx] = B[strided_idx] + B[strided_idx];
    }
}

int main()
{
    double* A;
    double* B;
    int N = 80 * 2048 * 100;
    size_t sz = N * sizeof(double);

    cudaMalloc((void**) &A, sz);
    cudaMalloc((void**) &B, sz);
    cudaMemset(A, 0, sz);
    cudaMemset(B, 0, sz);

    int threadsPerBlock = 64;
    int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    int M = 10000;

    kernel_A<<<numBlocks, threadsPerBlock>>>(A, N, M);

    // Opt kernel_B in to 48 KiB of dynamic shared memory, then launch it with
    // that amount.
    cudaFuncSetAttribute(kernel_B, cudaFuncAttributeMaxDynamicSharedMemorySize, 48 * 1024);
    kernel_B<<<numBlocks, threadsPerBlock, 48 * 1024>>>(A, N, M);

    kernel_C<<<numBlocks, threadsPerBlock>>>(A, B, N);

    // Wait for all kernels and report any launch or execution error.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(A);
    cudaFree(B);
    return err == cudaSuccess ? 0 : 1;
}

将c++ 文件编译
》》>>nvcc -o kernel_abc kernel_abc.cu
采用nsight compute 进行profile
》》>>ncu --set detailed -o kernel_abc ./test/kernel_abc
在这里插入图片描述

window 客户端打开
在这里插入图片描述
python 测试

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
print(torch.cuda.is_available())
#print(torch.cuda.list_gpu_processes())
import torch.cuda.nvtx as nvtx
print(torch.cuda.device_count())


class Net(nn.Module):
    """Small CNN for 1x28x28 MNIST images: two conv layers, two linear layers."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 28x28 -> conv 3x3 -> 26x26 -> conv 3x3 -> 24x24 -> pool 2 -> 12x12;
        # 64 channels * 12 * 12 = 9216 features.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    """Train on at most the first 12 batches, marking phases with NVTX ranges.

    Ranges emitted: "data loading" (time spent fetching the next batch),
    "batch:<idx>" (the whole step) and, nested inside it, "copy to device",
    "forward pass" and "backward pass".  Every push has exactly one matching
    pop on every exit path of the loop.

    Args:
        args: Parsed CLI namespace; ``log_interval`` and ``dry_run`` are read.
        model: Network to train.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, target)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        epoch: Epoch number, used only in the log line.
    """
    model.train()
    # The range must already be open while the ``for`` statement fetches a
    # batch, so it is pushed before the loop and again at the end of each
    # iteration, and popped as soon as the batch has arrived.
    nvtx.range_push("data loading")
    for batch_idx, (data, target) in enumerate(train_loader):
        nvtx.range_pop()  # data loading
        nvtx.range_push("batch:" + str(batch_idx))

        nvtx.range_push("copy to device")
        data, target = data.to(device), target.to(device)
        nvtx.range_pop()  # copy to device

        nvtx.range_push("forward pass")
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        nvtx.range_pop()  # forward pass

        nvtx.range_push("backward pass")
        loss.backward()
        optimizer.step()
        nvtx.range_pop()  # backward pass

        nvtx.range_pop()  # batch

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                # No "data loading" range is open at this point.
                break
        # Stop after batch index 11; presumably this keeps the kernel-level
        # profile small -- confirm before changing.
        if batch_idx > 10:
            break
        nvtx.range_push("data loading")
    else:
        # Loader exhausted: close the range opened for the final fetch.
        nvtx.range_pop()


def main():
    """Parse the CLI, build the MNIST loaders and model, and train."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    # No evaluation pass is run in this script, so test_loader is built but
    # not iterated.
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()

# Profile with:
# ncu --set roofline -o models python test/test1.py

windows 客户端加载models.ncu-rep
在这里插入图片描述

7 nsight system 与nsight compute 应用区别
nsight system 倾向于全局角度分析核函数的读写、调度、内存占用率，cpu与gpu的异步调用
nsight compute 倾向于核函数内部操作，不同模块的执行时间、吞吐量、带宽分析等。

8 遇到的问题
8.1. 加载到98%，卡住不动。原因：笔记本电脑性能低，带不动。换台式机即可。
8.2 ERROR ERR_NVGPUCTRPERM - The user does not have permission to access NVIDIA GPU Performance Counters on the target device 0. For instructions on enabling permissions and to get more information see https://developer.nvidia.com/ERR_NVGPUCTRPERM
解决：添加root 权限
docker run -it --gpus all --name llm-04 -v /jiayafei_linux/:/home/ -p38014:8014 --privileged=true pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel /bin/bash