当前位置: 首页 > news >正文

Tesla T4 P2P测试

Tesla T4 P2P测试

  • 一.测试环境
  • 二.测试步骤
    • 1.获取设备信息
    • 2.查看PCIE拓扑结构
    • 3.选择9B、9E这二张
    • 4.查看逻辑设备ID
    • 5.设置环境变量(需要用逻辑设备ID,通过UUID跟smi看到的物理ID关联)
    • 6.不同地址的原子操作
    • 7.P2P与非P2P的性能差异
    • 8.GPU带宽测试

Tesla T4 P2P测试

  • 通过物理ID找到逻辑ID
  • NCU P2P相关的Metrics
  • PCIE、DRAM相关的Metrics

一.测试环境


二.测试步骤

1.获取设备信息

nvidia-smi -L
GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-c91a8013-0877-df61-53b9-016fabcd5f82)
GPU 1: NVIDIA A30 (UUID: GPU-f67790b5-bb58-614d-4190-3598a99f925e)
GPU 2: Tesla T4 (UUID: GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63)
GPU 3: Tesla T4 (UUID: GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e)
GPU 4: Tesla T4 (UUID: GPU-d59282d2-060d-270e-1c0e-f50e936ffede)
GPU 5: NVIDIA GeForce RTX 3080 Ti (UUID: GPU-9a131b18-28de-d6a1-01e9-76a133e21680)
GPU 6: NVIDIA A30 (UUID: GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02)
GPU 7: Tesla T4 (UUID: GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1)

nvidia-smi -q | grep "Bus Id"
Bus Id                            : 00000000:34:00.0
Bus Id                            : 00000000:35:00.0
Bus Id                            : 00000000:36:00.0 Tesla T4
Bus Id                            : 00000000:37:00.0 Tesla T4
Bus Id                            : 00000000:9B:00.0 Tesla T4 d59282d2
Bus Id                            : 00000000:9C:00.0
Bus Id                            : 00000000:9D:00.0
Bus Id                            : 00000000:9E:00.0 Tesla T4 1f45f1e1

2.查看PCIE拓扑结构

lstopo --ignore Core --ignore Misc --ignore PU

在这里插入图片描述

3.选择9B、9E这二张

4.查看逻辑设备ID

tee devinfo.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with file/line context on any CUDA runtime error.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t error_ = (call);                                          \
        if (error_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n",      \
                    __FILE__, __LINE__, cudaGetErrorString(error_));          \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Print "Index,<logical id> UUID:GPU-..." for every visible device so the
// logical (runtime) device index can be matched against the physical device
// UUID reported by `nvidia-smi -L`.
int main() {
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    for (int deviceid = 0; deviceid < device_count; deviceid++) {
        // cudaGetDeviceProperties takes the device index directly; no
        // cudaSetDevice is needed just to query properties.
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        const unsigned char *u = (const unsigned char *)deviceProp.uuid.bytes;
        printf("Index,%d UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-"
               "%02x%02x-%02x%02x%02x%02x%02x%02x\n",
               deviceid,
               u[0], u[1], u[2],  u[3],  u[4],  u[5],  u[6],  u[7],
               u[8], u[9], u[10], u[11], u[12], u[13], u[14], u[15]);
    }
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o devinfo devinfo.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
unset CUDA_VISIBLE_DEVICES && ./devinfo
  • 输出
Index,0 UUID:GPU-c91a8013-0877-df61-53b9-016fabcd5f82
Index,1 UUID:GPU-9a131b18-28de-d6a1-01e9-76a133e21680
Index,2 UUID:GPU-f67790b5-bb58-614d-4190-3598a99f925e
Index,3 UUID:GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02
Index,4 UUID:GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63
Index,5 UUID:GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e
Index,6 UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede  #选中
Index,7 UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1  #选中

5.设置环境变量(需要用逻辑设备ID,通过UUID跟smi看到的物理ID关联)

export CUDA_VISIBLE_DEVICES=6,7

6.不同地址的原子操作

tee p2p.cu<<-'EOF'
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <cuda_runtime.h>

// Abort with file/line context on any CUDA runtime error.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t error_ = (call);                                          \
        if (error_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n",      \
                    __FILE__, __LINE__, cudaGetErrorString(error_));          \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Launch geometry: a single block of block_size threads, one float per thread.
constexpr int block_size = 32 * 4;

// Each thread issues 102400 atomicAdds to its own element of `data`. When
// `data` lives on a peer GPU (peer access enabled), every atomic travels over
// PCIe. The <mode> parameter only forces distinct template instantiations so
// profilers report the peer and local runs as separate kernels.
template <int mode>
__global__ void dummyKernel(float *data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = 0; i < 102400; i++) {
        atomicAdd(&data[idx], idx * i);
    }
}

// Run f(stream) bracketed by CUDA events; print both wall-clock (E2E) and
// event-based (Kernel) elapsed time in milliseconds.
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev) {
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    CUDA_CHECK(cudaEventRecord(start_ev, stream));
    f(stream);
    CUDA_CHECK(cudaGetLastError()); // surface kernel launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop_ev, stream));
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start_ev, stop_ev));
    printf("E2E:%8.4fms Kernel:%8.4fms\n", diff.count() * 1000, milliseconds);
}

int main() {
    int devID0 = 0, devID1 = 1;
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    // Dump properties of the first two visible devices (CUDA_VISIBLE_DEVICES
    // is expected to expose exactly the two T4s under test).
    for (int deviceid = 0; deviceid < 2; deviceid++) {
        CUDA_CHECK(cudaSetDevice(deviceid));
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        std::cout << "-----------------------------------" << std::endl;
        std::cout << "Device Index: " << deviceid << std::endl;
        std::cout << "Compute Capability:" << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Device name: " << deviceProp.name << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
        std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
        std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
        std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
        std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
        std::cout << "canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl;
        std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
        const unsigned char *u = (const unsigned char *)deviceProp.uuid.bytes;
        printf("UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x\n",
               u[0], u[1], u[2],  u[3],  u[4],  u[5],  u[6],  u[7],
               u[8], u[9], u[10], u[11], u[12], u[13], u[14], u[15]);
    }
    std::cout << "-----------------------------------" << std::endl;
    int p2p_value = 0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value, cudaDevP2PAttrAccessSupported, devID0, devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;

    size_t dataSize = block_size * sizeof(float);
    float *data0_dev, *data1_dev;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    // Fix: the original never zeroed data0_dev, so the peer-atomic run
    // accumulated onto uninitialized memory.
    CUDA_CHECK(cudaMemset(data0_dev, 0, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMemset(data1_dev, 0, dataSize));

    // Enable P2P if the topology supports it.
    int canAccessPeer = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));
        cudaEvent_t start_ev, stop_ev;
        CUDA_CHECK(cudaEventCreate(&start_ev));
        CUDA_CHECK(cudaEventCreate(&stop_ev));
        // Let devID1 access devID0's device memory directly.
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0));
        // Atomics on the peer GPU's memory (over PCIe).
        TIMEIT([&data0_dev](cudaStream_t &stream) -> void {
            dummyKernel<1><<<1, block_size, 0, stream>>>(data0_dev);
        }, stream, start_ev, stop_ev);
        // Atomics on local DRAM for comparison.
        TIMEIT([&data1_dev](cudaStream_t &stream) -> void {
            dummyKernel<3><<<1, block_size, 0, stream>>>(data1_dev);
        }, stream, start_ev, stop_ev);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
        // Fix: the original leaked the stream and both events.
        CUDA_CHECK(cudaEventDestroy(start_ev));
        CUDA_CHECK(cudaEventDestroy(stop_ev));
        CUDA_CHECK(cudaStreamDestroy(stream));
    } else {
        printf("%s %d canAccessPeer=0\n", __FILE__, __LINE__);
    }
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./p2p

/usr/local/cuda/bin/ncu --metrics \
lts__t_requests_aperture_device.sum,\
lts__t_sectors_aperture_device.sum,\
lts__t_requests_aperture_peer.sum,\
lts__t_requests_srcnode_gpc_aperture_peer.sum,\
lts__t_requests_srcunit_l1_aperture_peer.sum,\
lts__t_requests_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_aperture_peer.sum,\
lts__t_sectors_srcnode_gpc_aperture_peer.sum,\
lts__t_sectors_srcunit_l1_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
dram__bytes_read.sum,\
dram__bytes_write.sum,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
  • 输出
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
E2E:132.1300ms Kernel:132.1245ms  #GPU1通过P2P对GPU0的设备内存进行原子操作
E2E:  3.5552ms Kernel:  3.5444ms  #GPU1对本地DRAM进行原子操作void dummyKernel<(int)1>(float *), 2024-Sep-25 17:06:47, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum                                                             Kbyte                          52.86
dram__bytes_read.sum.per_second                                           Kbyte/second                         405.11
dram__bytes_write.sum                                                            Kbyte                           1.25
dram__bytes_write.sum.per_second                                          Kbyte/second                           9.56
lts__t_requests_aperture_device.sum                                            request                          17775
lts__t_requests_aperture_peer.sum                                              request                         409600 #4个warp,每个102400 次合并访问 地址范围了4*32*4
lts__t_requests_srcnode_gpc_aperture_peer.sum                                  request                         409600
lts__t_requests_srcunit_l1_aperture_peer.sum                                   request                              0
lts__t_requests_srcunit_tex_aperture_peer.sum                                  request                         409600
lts__t_sectors_aperture_device.sum                                              sector                          93043
lts__t_sectors_aperture_peer.sum                                                sector                        1638400 
lts__t_sectors_srcnode_gpc_aperture_peer.sum                                    sector                        1638400 #合并访问一次请求4个sector 1638400*32=50MB
lts__t_sectors_srcunit_l1_aperture_peer.sum                                     sector                              0
lts__t_sectors_srcunit_tex_aperture_peer.sum                                    sector                        1638400
pcie__read_bytes.sum                                                             Mbyte                          59.10
pcie__read_bytes.sum.per_second                                           Mbyte/second                         452.93
pcie__write_bytes.sum                                                            Mbyte                          72.18 #实际PCIE读写加起来超过50MB
pcie__write_bytes.sum.per_second                                          Mbyte/second                         553.17
---------------------------------------------------------------------- --------------- ------------------------------void dummyKernel<(int)3>(float *), 2024-Sep-25 17:06:48, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum                                                             Kbyte                           6.21
dram__bytes_read.sum.per_second                                           Mbyte/second                           1.77
dram__bytes_write.sum                                                             byte                            224
dram__bytes_write.sum.per_second                                          Kbyte/second                          63.93
lts__t_requests_aperture_device.sum                                            request                         414530
lts__t_requests_aperture_peer.sum                                              request                              0
lts__t_requests_srcnode_gpc_aperture_peer.sum                                  request                              0
lts__t_requests_srcunit_l1_aperture_peer.sum                                   request                              0
lts__t_requests_srcunit_tex_aperture_peer.sum                                  request                              0
lts__t_sectors_aperture_device.sum                                              sector                        1643605
lts__t_sectors_aperture_peer.sum                                                sector                              0
lts__t_sectors_srcnode_gpc_aperture_peer.sum                                    sector                              0
lts__t_sectors_srcunit_l1_aperture_peer.sum                                     sector                              0
lts__t_sectors_srcunit_tex_aperture_peer.sum                                    sector                              0
pcie__read_bytes.sum                                                             Kbyte                           3.58
pcie__read_bytes.sum.per_second                                           Mbyte/second                           1.02
pcie__write_bytes.sum                                                            Kbyte                           3.07
pcie__write_bytes.sum.per_second                                          Kbyte/second                         876.74
---------------------------------------------------------------------- --------------- ------------------------------

7.P2P与非P2P的性能差异

tee p2p.cu<<-'EOF'
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <cuda_runtime.h>

// Abort with file/line context on any CUDA runtime error.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t error_ = (call);                                          \
        if (error_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n",      \
                    __FILE__, __LINE__, cudaGetErrorString(error_));          \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

constexpr int  block_size  = 1024;
constexpr long block_count = 1000000L; // ~3.81 GiB of float data

// Element-wise copy: output_data[idx] = input_data[idx]. With peer access
// enabled, `input_data` may reside on the other GPU so the copy measures P2P
// read bandwidth. <mode> only distinguishes template instantiations in
// profiler output.
template <int mode>
__global__ void dummyKernel(float *input_data, float *output_data) {
    // size_t index: with 1e6 blocks x 1024 threads the flat index reaches
    // ~1.02e9; keep headroom against 32-bit overflow.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    output_data[idx] = input_data[idx];
}

// Run f(stream) bracketed by CUDA events; print both wall-clock (E2E) and
// event-based (Kernel) elapsed time in milliseconds.
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev) {
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    CUDA_CHECK(cudaEventRecord(start_ev, stream));
    f(stream);
    CUDA_CHECK(cudaGetLastError()); // surface kernel launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop_ev, stream));
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start_ev, stop_ev));
    printf("E2E:%7.2fms Kernel:%7.2fms\n", diff.count() * 1000, milliseconds);
}

int main() {
    int devID0 = 0, devID1 = 1;
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    // Dump properties of the first two visible devices.
    for (int deviceid = 0; deviceid < 2; deviceid++) {
        CUDA_CHECK(cudaSetDevice(deviceid));
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        std::cout << "-----------------------------------" << std::endl;
        std::cout << "Device Index: " << deviceid << std::endl;
        std::cout << "Compute Capability:" << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Device name: " << deviceProp.name << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
        std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
        std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
        std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
        std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
        std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
    }
    std::cout << "-----------------------------------" << std::endl;
    int p2p_value = 0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value, cudaDevP2PAttrAccessSupported, devID0, devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;

    size_t dataSize = (size_t)block_count * block_size * sizeof(float);
    float *data0_dev, *data1_dev, *data1_dev_ex;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMalloc(&data1_dev_ex, dataSize));
    // Pinned host buffer so cudaMemcpyAsync gets full PCIe bandwidth.
    char *host;
    CUDA_CHECK(cudaMallocHost(&host, dataSize));
    printf("Init Done(%.2f)GB..\n", dataSize / 1024.0 / 1024.0 / 1024.0);

    // Enable P2P if the topology supports it.
    int canAccessPeer = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));
        cudaEvent_t start_ev, stop_ev;
        CUDA_CHECK(cudaEventCreate(&start_ev));
        CUDA_CHECK(cudaEventCreate(&stop_ev));
        // Let devID1 access devID0's device memory directly.
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0));
        // Baseline: device -> pinned-host copy over PCIe.
        // Fix: dst is host memory and src is device memory, so the kind must
        // be cudaMemcpyDeviceToHost; the original passed cudaMemcpyHostToDevice
        // (presumably tolerated because unified addressing lets the runtime
        // infer the direction from the pointers).
        TIMEIT([&](cudaStream_t &stream) -> void {
            CUDA_CHECK(cudaMemcpyAsync(host, data1_dev, dataSize, cudaMemcpyDeviceToHost, stream));
        }, stream, start_ev, stop_ev);
        // P2P read: devID1 kernel reads from devID0's memory.
        TIMEIT([&](cudaStream_t &stream) -> void {
            dummyKernel<1><<<block_count, block_size, 0, stream>>>(data0_dev, data1_dev);
        }, stream, start_ev, stop_ev);
        // Local DRAM copy on devID1 for comparison.
        TIMEIT([&](cudaStream_t &stream) -> void {
            dummyKernel<2><<<block_count, block_size, 0, stream>>>(data1_dev_ex, data1_dev);
        }, stream, start_ev, stop_ev);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
        // Fix: the original leaked the stream and both events.
        CUDA_CHECK(cudaEventDestroy(start_ev));
        CUDA_CHECK(cudaEventDestroy(stop_ev));
        CUDA_CHECK(cudaStreamDestroy(stream));
    } else {
        printf("%s %d canAccessPeer=0\n", __FILE__, __LINE__);
    }
    CUDA_CHECK(cudaFreeHost(host));
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    CUDA_CHECK(cudaFree(data1_dev_ex));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./p2p

/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
dram__bytes_read.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_write.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
  • 输出
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
Init Done(3.81)GB..
E2E: 325.70ms Kernel: 325.71ms  # GPU1 cudaMemcpyHostToDevice  11.697GB/s
E2E: 307.29ms Kernel: 307.31ms  # GPU1 通过P2P从GPU0的设备内存读取数据(比H2D快) 12.39GB/s
E2E:  37.90ms Kernel:  37.89ms  # GPU1 DRAM内 D2D的拷贝 2*100.55GB/svoid dummyKernel<1>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name                                         Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed            %         0.01
dram__bytes_read.sum.per_second                         Mbyte/s        37.35
dram__bytes_write.sum.pct_of_peak_sustained_elapsed           %         4.72
dram__bytes_write.sum.per_second                        Gbyte/s        15.10
pcie__read_bytes.sum.per_second                         Gbyte/s        15.01
pcie__write_bytes.sum.per_second                        Gbyte/s         3.34
--------------------------------------------------- ----------- ------------void dummyKernel<2>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name                                         Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed            %        37.34  #同时读写时,利用率加起来75%
dram__bytes_read.sum.per_second                         Gbyte/s       119.41
dram__bytes_write.sum.pct_of_peak_sustained_elapsed           %        37.81
dram__bytes_write.sum.per_second                        Gbyte/s       120.89  #加起来239GB/s,跟后面的带宽测试一致
pcie__read_bytes.sum.per_second                         Mbyte/s        22.03
pcie__write_bytes.sum.per_second                        Mbyte/s         7.42
--------------------------------------------------- ----------- ------------

8.GPU带宽测试

git clone https://github.com/NVIDIA/cuda-samples
cd cuda-samples/Samples/1_Utilities/deviceQuery
make clean && make
./deviceQuery
cd ../bandwidthTest/
make clean && make
./bandwidthTest --device=0
  • 输出
Running on...Device 0: Tesla T4Quick ModeHost to Device Bandwidth, 1 Device(s)PINNED Memory TransfersTransfer Size (Bytes)        Bandwidth(GB/s)32000000                     12.8Device to Host Bandwidth, 1 Device(s)PINNED Memory TransfersTransfer Size (Bytes)        Bandwidth(GB/s)32000000                     13.1Device to Device Bandwidth, 1 Device(s)PINNED Memory TransfersTransfer Size (Bytes)        Bandwidth(GB/s)32000000                     239.4
Result = PASS

相关文章:

  • Apache Iceberg 与 Spark整合-使用教程(Iceberg 官方文档解析)
  • 重头开始嵌入式第四十二天(硬件 ARM体系架构)
  • 计算机网络(八) —— Udp协议
  • powershell@update-help更新文档和离线文档安装@并行加速安装帮助文档更新@安装报错问题
  • 【LeetCode:219. 存在重复元素 II + 哈希表】
  • Ant design vue中的提示框(a-tooltip)
  • Linux应用开发实验班——JSON-RPC
  • 大数据新视界 --大数据大厂之HBase 在大数据存储中的应用与表结构设计
  • 【有啥问啥】“弱激励学习(Weak Incentive Learning)”的原理与过程解析
  • 如何使用ssm实现基于SpringMVC网上选课系统的设计与实现
  • 努比亚z17努比亚NX563j原厂固件卡刷包下载_刷机ROM固件包下载-原厂ROM固件-安卓刷机固件网
  • Python图形用户界面设计的15个基础组件
  • 代码编码规范文档(参考)
  • GPT实现联网,NextChat插件的配置说明
  • 理解和使用语言模型的监督微调 (SFT)
  • Github访问慢解决办法
  • github指令
  • nginx 负载服务器优化
  • nodejs实现webservice问题总结
  • WordPress 获取当前文章下的所有附件/获取指定ID文章的附件(图片、文件、视频)...
  • 分享自己折腾多时的一套 vue 组件 --we-vue
  • 关键词挖掘技术哪家强(一)基于node.js技术开发一个关键字查询工具
  • 模仿 Go Sort 排序接口实现的自定义排序
  • 微信开源mars源码分析1—上层samples分析
  • 我与Jetbrains的这些年
  • 智能合约开发环境搭建及Hello World合约
  • ​字​节​一​面​
  • ### Error querying database. Cause: com.mysql.jdbc.exceptions.jdbc4.CommunicationsException
  • #laravel 通过手动安装依赖PHPExcel#
  • (70min)字节暑假实习二面(已挂)
  • (php伪随机数生成)[GWCTF 2019]枯燥的抽奖
  • (免费领源码)Java#ssm#MySQL 创意商城03663-计算机毕业设计项目选题推荐
  • (一)pytest自动化测试框架之生成测试报告(mac系统)
  • (原創) 如何動態建立二維陣列(多維陣列)? (.NET) (C#)
  • (转)Android学习系列(31)--App自动化之使用Ant编译项目多渠道打包
  • (转)linux自定义开机启动服务和chkconfig使用方法
  • (轉)JSON.stringify 语法实例讲解
  • ****** 二十三 ******、软设笔记【数据库】-数据操作-常用关系操作、关系运算
  • ***监测系统的构建(chkrootkit )
  • .NET CORE 3.1 集成JWT鉴权和授权2
  • .Net Framework 4.x 程序到底运行在哪个 CLR 版本之上
  • .NET/C# 阻止屏幕关闭,阻止系统进入睡眠状态
  • .NET命令行(CLI)常用命令
  • .NET与java的MVC模式(2):struts2核心工作流程与原理
  • @SuppressWarnings注解
  • [ C++ ] 类和对象( 下 )
  • [ CTF ]【天格】战队WriteUp- 2022年第三届“网鼎杯”网络安全大赛(青龙组)
  • [ vulhub漏洞复现篇 ] JBOSS AS 4.x以下反序列化远程代码执行漏洞CVE-2017-7504
  • [ 渗透工具篇 ] 一篇文章让你掌握神奇的shuize -- 信息收集自动化工具
  • [000-01-022].第06节:RabbitMQ中的交换机介绍
  • [3300万人的聊天室] 作为产品的上游公司该如何?
  • [AIGC codze] Kafka 的 rebalance 机制
  • [Android 数据通信] android cmwap接入点
  • [Ariticle] 厚黑之道 一 小狐狸听故事
  • [BZOJ 3282] Tree 【LCT】