
GPU Implementation and Optimization of a 2D Transpose Operator

2D Transpose

A good approach is based on shared memory: read an entire warpSize x warpSize tile into shared memory with coalesced accesses, transpose it by swapping indices when reading it back out of shared memory, and write the result, again with coalesced accesses. Either a single warp or a whole thread block can load one warpSize x warpSize tile, transpose it in shared memory, and write it back. For small and medium sizes the latter is preferable: with one thread block per warpSize x warpSize tile, more threads are created, hardware utilization improves, and performance is better.

Reference CUDA implementation

#include <stdio.h>
#include <algorithm> // std::min used in main()
#include <iostream>
using namespace std;

#include <cuda_runtime.h>
#include "utils/cuda_mem_helper.h"
#include "utils/cuda_stream_helper.h"

#define THREAD_PER_WARP 32
#define WARP_PER_BLOCK 8
#define BLOCK_RD_NUM (THREAD_PER_WARP / WARP_PER_BLOCK)

/*
using a warp to transpose warpSize*warpSize block
*/
template <typename T>
__global__ void transpose_2d_warp(const T* __restrict__ in, T* __restrict__ out,
                                  const int row, const int col,
                                  int warp_row, int warp_col, int total_warp) {
    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // global thread id
    const int warp_bid = threadIdx.x / THREAD_PER_WARP; // warp id in thread block
    const int warp_gid = tid / THREAD_PER_WARP; // warp id in grid
    if (warp_gid >= total_warp) return; // trailing warps in the last block have no tile to process

    const int lane = threadIdx.x % THREAD_PER_WARP; // thread id in warp
    const int warp_id_c = warp_gid % warp_col;
    const int warp_id_r = warp_gid / warp_col;

    // pad the innermost dimension by one element to avoid shared-memory bank conflicts
    __shared__ T block_data[WARP_PER_BLOCK][THREAD_PER_WARP][THREAD_PER_WARP + 1];

    const int row_bias = warp_id_r * THREAD_PER_WARP;
    const int col_bias = warp_id_c * THREAD_PER_WARP;

    // read tile from input, guarding against partial tiles at the matrix edge
    for (int i = 0; i < THREAD_PER_WARP; i++) {
        int src_r = row_bias + i;
        int src_c = col_bias + lane;
        if ((src_r < row) && (src_c < col)) {
            block_data[warp_bid][i][lane] = in[src_r * col + src_c];
        }
    }
    // each warp only touches its own slice of shared memory, so a warp-level
    // barrier is sufficient (and remains valid after the early return above)
    __syncwarp();

    // write block to output
    for (int i = 0; i < THREAD_PER_WARP; i++) {
        int tgt_c = col_bias + i;
        int tgt_r = row_bias + lane;
        if ((tgt_r < row) && (tgt_c < col)) {
            int addr = tgt_c * row + tgt_r;
            out[addr] = block_data[warp_bid][lane][i];
        }
    }
}

/*
using a thread block to transpose warpSize*warpSize block
*/
template <typename T>
__global__ void transpose_2d_block(const T* __restrict__ in, T* __restrict__ out,
                                   const int row, const int col) {
    // assumes row and col are multiples of warpSize (32), as in the 256x256 test below
    int block_id = blockIdx.x;
    int block_num_col = col / warpSize;

    int block_id_row = block_id / block_num_col;
    int block_id_col = block_id % block_num_col;

    int row_offset = block_id_row * warpSize;
    int col_offset = block_id_col * warpSize;

    int row_id = threadIdx.x / warpSize;
    int col_id = threadIdx.x % warpSize;

    // pad the innermost dimension by one element to avoid shared-memory bank conflicts
    __shared__ T block_data[THREAD_PER_WARP][THREAD_PER_WARP + 1];

#pragma unroll
    for (int i = 0; i < BLOCK_RD_NUM; i++) {
        int row_pos = i * WARP_PER_BLOCK + row_id;
        int in_addr = (row_offset + row_pos) * col + col_offset + col_id;
        block_data[row_pos][col_id] = in[in_addr];
    }
    __syncthreads();


#pragma unroll
    for (int i = 0; i < BLOCK_RD_NUM; i++) {
        int row_pos = i * WARP_PER_BLOCK + row_id;
        int out_addr = (col_offset + row_pos) * row + row_offset + col_id;

        out[out_addr] = block_data[col_id][row_pos];
    }
}

template <typename T>
void Transpose2DWarp(const T* in, T* out, const int row, const int col, cudaStream_t & stream) {
    const int warp_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int warp_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int total_warp = warp_row * warp_col;

    const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
    const int grid_size = (total_warp + WARP_PER_BLOCK - 1) / WARP_PER_BLOCK;

    transpose_2d_warp <<< grid_size, block_size, 0, stream>>>(in, out, row, col, warp_row, warp_col, total_warp);
}


template <typename T>
void Transpose2DBlock(const T* in, T* out, const int row, const int col, cudaStream_t & stream) {
    const int block_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int block_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int total_block = block_row * block_col;

    const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
    const int grid_size = total_block;

    transpose_2d_block <<< grid_size, block_size, 0, stream>>>(in, out, row, col);
}

int main(void) {
    cudaError_t err = cudaSuccess;

    int row = 256;
    int col = 256;

    CudaMemoryHelper<float> data_in({row, col});
    CudaMemoryHelper<float> data_out({row, col});

    data_in.StepInitHostMem(1.0f);
    data_in.CopyMemH2D();
    data_in.PrintElems(4, 512,  col);

    CudaStreamHelper stream_helper;
    auto & stream = stream_helper.stream;

    int eval_num = 20;

    int thread_num = row * col;
    int threadsPerBlock = std::min(128, col);
    int blocksPerGrid = (thread_num + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    for (int i = 0; i < eval_num; i++) {
        Transpose2DWarp(data_in.d_mem, data_out.d_mem,  row, col, stream);
    }
    stream_helper.Sync();

    for (int i = 0; i < eval_num; i++) {
        Transpose2DBlock(data_in.d_mem, data_out.d_mem,  row, col, stream);
    }
    stream_helper.Sync();

    data_out.CopyMemD2H();

    printf("results0:\n");
    // verify results
    data_out.PrintElems(1, 1024, row);

    return 0;
}

For reference:

CUDA学习(二)矩阵转置及优化(合并访问、共享内存、bank conflict) - 知乎

Two problems remain:

How should GPUs on edge devices, which have no shared memory, be handled?

When the matrix is small, a warp has only 16 or 32 threads (on edge-device GPUs the warp size may be 16), and this method reads 16x16 / 32x32 tiles, so too few threads are created and performance suffers.
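For example, with warpSize = 16 a 32x32 matrix splits into only four 16x16 tiles, i.e. four warps or 64 threads in total, nowhere near enough parallelism to keep the GPU busy.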

Solution

This targets mainly the transposition of small-shape matrices and scenarios where shared memory is unavailable or not used.

For warpSize = 16, let the whole warp read a 4x4 tile; compared with a 16x16 tile this creates 16x as many threads. Likewise, for warpSize = 32, read a 4x8 tile, an 8x4 tile, two 4x4 tiles, or read twice to obtain an 8x8 tile.

This way each thread reads and writes exactly one value, and every four adjacent threads access four adjacent elements.

The arrows in the second figure indicate the data exchange between threads; the other positions are analogous. The key property is that data is swapped pairwise between two threads.

This exchange can be done with a warp shuffle: each thread publishes its own value and simultaneously fetches the value from the thread it needs.
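As a minimal standalone sketch of this idiom (not one of the kernels in this post; the kernel name and values are made up for illustration), each lane publishes its own register value to the shuffle and at the same time reads its partner lane's value. Launched with a single warp, e.g. <<<1, 32>>>, out[i] ends up holding lane (i ^ 1)'s value:

__global__ void shuffle_swap_demo(int* out) {
    int lane = threadIdx.x % 32;
    int my_val = lane * 10;                              // value held by this lane
    int partner = lane ^ 1;                              // pairwise exchange: 0<->1, 2<->3, ...
    int got = __shfl_sync(0xFFFFFFFF, my_val, partner);  // publish my_val, read partner's value
    out[lane] = got;                                     // lane i now holds lane (i ^ 1)'s value
}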

For warpSize = 16 with a 4x4 tile, the source lane lane_src from which the current lane lane_dst fetches its value is computed as:

col = lane_dst % 4 // & 0x03
row = lane_dst / 4 // >> 2
lane_src = col * 4 + row
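For example, lane_dst = 6 gives col = 2, row = 1 and therefore lane_src = 2 * 4 + 1 = 9: lane 9 loaded element (2, 1) of the 4x4 tile, which is exactly the value lane 6 must write at position (1, 2) of the transposed tile.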

For warpSize = 32 with a 4x8 read (8x4 after transposition), the above generalizes to:

col = lane_dst % COL_SIZE_NEW // COL_SIZE_NEW = 4
row = lane_dst / COL_SIZE_NEW 
lane_src = col * COL_SIZE_OLD + row // COL_SIZE_OLD = 8
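The generalized mapping can be sanity-checked on the host. The short program below is an illustrative sketch (not part of the original post): for the warpSize = 32, 4x8 case it walks all 32 lanes and asserts that the element loaded by lane_src is exactly the element lane_dst must write in the transposed tile:

#include <cassert>
#include <cstdio>

int main() {
    const int COL_SIZE_OLD = 8;  // input tile: 4 x 8
    const int COL_SIZE_NEW = 4;  // output tile: 8 x 4
    for (int lane_dst = 0; lane_dst < 32; lane_dst++) {
        int col = lane_dst % COL_SIZE_NEW;
        int row = lane_dst / COL_SIZE_NEW;
        int lane_src = col * COL_SIZE_OLD + row;
        // element loaded by lane_src from the 4x8 input tile
        int src_r = lane_src / COL_SIZE_OLD;
        int src_c = lane_src % COL_SIZE_OLD;
        // lane_dst writes output position (row, col); transposition requires in[col][row]
        assert(src_r == col && src_c == row);
    }
    printf("lane_src mapping implements the 4x8 -> 8x4 transpose\n");
    return 0;
}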

Reference CUDA implementation

#include <stdio.h>
#include <algorithm> // std::min used in main()
#include <iostream>
using namespace std;

#include <cuda_runtime.h>
#include "utils/cuda_mem_helper.h"
#include "utils/cuda_stream_helper.h"

#define WARP_SIZE 32

#define ROW_SIZE_OLD 4
#define COL_SIZE_OLD 8

#define ROW_SIZE_NEW 8
#define COL_SIZE_NEW 4

// use a warp to transpose a ROW_SIZE_OLD x COL_SIZE_OLD (4x8) tile via warp shuffle;
// assumes row is a multiple of ROW_SIZE_OLD and col a multiple of COL_SIZE_OLD
__global__ void transpose_warp1(const float* __restrict__ in, float* __restrict__ out, int row, int col) {
    int gid = blockDim.x * blockIdx.x + threadIdx.x;
    int warp_id = gid / WARP_SIZE;
    int lane_id = threadIdx.x % WARP_SIZE;
    int _row = lane_id / COL_SIZE_NEW;
    int _col = lane_id % COL_SIZE_NEW; // COL_SIZE_NEW = 4
    int lane_src = _col * COL_SIZE_OLD + _row; // COL_SIZE_OLD = 8

    int warp_num_col = col / COL_SIZE_OLD;
    int row_id = warp_id / warp_num_col;
    int col_id = warp_id % warp_num_col;

    // addr offset of the matrix block
    int row_offset = row_id * ROW_SIZE_OLD;
    int col_offset = col_id * COL_SIZE_OLD;

    int row_th_old = lane_id / COL_SIZE_OLD;
    int col_th_old = lane_id % COL_SIZE_OLD;

    int row_th_new = lane_id / COL_SIZE_NEW;
    int col_th_new = lane_id % COL_SIZE_NEW;

    float data = in[(row_offset + row_th_old) * col + col_offset + col_th_old];

    // warp shuffle
    data = __shfl_sync(0xFFFFFFFF, data, lane_src);

    int out_addr = (col_offset + row_th_new) * row + row_offset + col_th_new;
    out[out_addr] = data;
}


int main(void) {
    cudaError_t err = cudaSuccess;

    int row = 256;
    int col = 256;

    CudaMemoryHelper<float> data_in({row, col});
    CudaMemoryHelper<float> data_out({row, col});

    data_in.StepInitHostMem(1.0f);
    data_in.CopyMemH2D();
    data_in.PrintElems(4, 512,  col);

    CudaStreamHelper stream_helper;
    auto & stream = stream_helper.stream;

    int eval_num = 20;

    int thread_num = row * col;
    int threadsPerBlock = std::min(128, col);
    int blocksPerGrid = (thread_num + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    for (int i = 0; i < eval_num; i++) {
        transpose_warp1 <<< blocksPerGrid, threadsPerBlock, 0, stream>>>(data_in.d_mem, data_out.d_mem, row, col);
    }

    stream_helper.Sync();

    data_out.CopyMemD2H();

    printf("results0:\n");
    // verify results
    data_out.PrintElems(4, 512, row);

    return 0;
}

Comparing the three methods above with the TensorFlow transpose operator: the last (warp-shuffle) method is fastest for small matrices such as 32x32 and 64x64; at 128x128 it is slightly slower than the thread-block-based warpSize x warpSize transpose; and above 128x128 it performs noticeably worse, with the thread-block-based warpSize x warpSize transpose being the best.
