当前位置: 首页 > news >正文

记录一份编译 LLVM(含 clang 和 NVPTX 后端)以支持 HIPIFY 的构建配置

llvm 18 debug 版本

build llvmorg-18.1.0-rc4 debug

$ cd llvm-project

$ git checkout llvmorg-18.1.0-rc4

$ mkdir build_d

$ cd build_d

$ mkdir -p ../../local_d

cmake \
-DCMAKE_INSTALL_PREFIX=../../local_d \
-DLLVM_SOURCE_DIR=../llvm \
-DLLVM_ENABLE_PROJECTS="bolt;clang;clang-tools-extra;lld;mlir"  \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX"  \
-DLLVM_INCLUDE_TESTS=OFF \
-DCMAKE_BUILD_TYPE=Debug \
../llvm

其余的项目和运行时从上面的配置中拆了出来(未启用):

cross-project-tests;libclc;lldb;polly;flang

-DLLVM_ENABLE_RUNTIMES="libunwind;libcxxabi;pstl;libcxx;openmp"      \
libc;compiler-rt;

$ make -j34

$ make install

llvm 18 release版本

cd llvm-project

mkdir build_r

cd build_r

mkdir -p ../../local_r

cmake \
-DCMAKE_INSTALL_PREFIX=../../local_r \
-DLLVM_SOURCE_DIR=../llvm \
-DLLVM_ENABLE_PROJECTS="bolt;clang;clang-tools-extra;lld;mlir"  \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX"  \
-DLLVM_INCLUDE_TESTS=OFF \
-DCMAKE_BUILD_TYPE=Release \
../llvm

$ make -j34

效果:

$ make install

build HIPIFY debug

$ mkdir /home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify

cmake  \
-DCMAKE_INSTALL_PREFIX=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify  \
-DCMAKE_BUILD_TYPE=Debug  \
-DCMAKE_PREFIX_PATH=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d  \
..

还有一种带更多选项的配置方法,其实用不到:

cmake \
-DHIPIFY_CLANG_TESTS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=../dist \
-DCMAKE_PREFIX_PATH=/usr/llvm/17.0.6/dist \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.3.2 \
-DCUDA_DNN_ROOT_DIR=/usr/local/cudnn-8.9.7 \
-DCUDA_CUB_ROOT_DIR=/usr/local/cub-2.1.0 \
-DLLVM_EXTERNAL_LIT=/usr/llvm/17.0.6/build/bin/llvm-lit \
..

using hipify-clang
 

hipify-clang intro.cu --cuda-path="/usr/local/cuda-12.3" --print-stats-csv

$ /home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify/bin/hipify-clang vectorAdd.cu --cuda-path="/usr/local/cuda-12.3" --clang-resource-directory="/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/lib/clang/18"

写成Makefile:

# Translate vectorAdd.cu to HIP with hipify-clang, then build it with hipcc.

# Name of the final executable.
EXE := vectorAdd_hip

all: $(EXE)

# Compile the hipified source into the executable.
$(EXE): vectorAdd.cu.hip
	hipcc $< -o $@

# Produce <file>.hip from <file> by running hipify-clang on it.
%.hip: %
	/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify/bin/hipify-clang $< --cuda-path=/usr/local/cuda-12.3 --clang-resource-directory=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/lib/clang/18

# Neither target names a real file.
.PHONY: all clean
clean:
	${RM} $(EXE) *.hip

效果:

源cu代码:

// vectorAdd.cu: element-wise vector addition, used here as hipify-clang input.
#include <stdio.h>
#include <cuda_runtime.h>

// Writes C[i] = A[i] + B[i] for each global thread index i below numElements.
// The index is derived from the x dimension only, so a 1D launch is expected.
__global__ void vectorAdd(const float *A, const float *B, float *C,
                          int numElements) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  // Bounds check: the grid may contain more threads than elements.
  if (i < numElements) {
    C[i] = A[i] + B[i] + 0.0f;
  }
  // Only the thread with global index 7 prints.
  if (i == 7)
    printf("Hello kernel threadID=%d\n", i);
}

// Allocates host and device vectors, runs vectorAdd on the device, verifies
// the result on the host, and releases all buffers. Exits with EXIT_FAILURE
// on the first failed allocation, CUDA call, or mismatching element.
int main(void)
{
  cudaError_t err = cudaSuccess;
  int numElements = 50000;
  size_t size = numElements * sizeof(float);
  printf("[Vector addition of %d elements]\n", numElements);

  // Host buffers.
  float *h_A = (float *)malloc(size);
  float *h_B = (float *)malloc(size);
  float *h_C = (float *)malloc(size);
  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }

  // Fill the inputs with values in [0, 1].
  for (int i = 0; i < numElements; ++i) {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }

  // Device buffers.
  float *d_A = NULL;
  err = cudaMalloc((void **)&d_A, size);
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  float *d_B = NULL;
  err = cudaMalloc((void **)&d_B, size);
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  float *d_C = NULL;
  err = cudaMalloc((void **)&d_C, size);
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  printf("Copy input data from the host memory to the CUDA device\n");
  err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf(stderr,
            "Failed to copy vector A from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    fprintf(stderr,
            "Failed to copy vector B from host to device (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Ceil-divide so that every element is covered by some thread.
  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
         threadsPerBlock);
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
  // The launch itself returns nothing; query the launch status explicitly.
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  printf("Copy output data from the CUDA device to the host memory\n");
  err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    fprintf(stderr,
            "Failed to copy vector C from device to host (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  // Compare against a host-side sum with an absolute tolerance of 1e-5.
  for (int i = 0; i < numElements; ++i) {
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
      fprintf(stderr, "Result verification failed at element %d!\n", i);
      exit(EXIT_FAILURE);
    }
  }
  printf("Test PASSED\n");

  err = cudaFree(d_A);
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = cudaFree(d_B);
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = cudaFree(d_C);
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
            cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  free(h_A);
  free(h_B);
  free(h_C);
  printf("Done\n");
  return 0;
}

生成的vectorAdd.cu.hip代码:

#include <stdio.h>#include <hip/hip_runtime.h>__global__ void vectorAdd(const float *A, const float *B, float *C,int numElements) {int i = blockDim.x * blockIdx.x + threadIdx.x;if (i < numElements) {C[i] = A[i] + B[i] + 0.0f;}if(i==7)printf("Hello kernel threadID=%d\n", i);
}int main(void)
{hipError_t err = hipSuccess;int numElements = 50000;size_t size = numElements * sizeof(float);printf("[Vector addition of %d elements]\n", numElements);float *h_A = (float *)malloc(size);float *h_B = (float *)malloc(size);float *h_C = (float *)malloc(size);if (h_A == NULL || h_B == NULL || h_C == NULL) {fprintf(stderr, "Failed to allocate host vectors!\n");exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {h_A[i] = rand() / (float)RAND_MAX;h_B[i] = rand() / (float)RAND_MAX;}float *d_A = NULL;err = hipMalloc((void **)&d_A, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_B = NULL;err = hipMalloc((void **)&d_B, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_C = NULL;err = hipMalloc((void **)&d_C, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy input data from the host memory to the CUDA device\n");err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector A from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector B from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}int threadsPerBlock = 256;int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);err = hipGetLastError();if (err != hipSuccess) {fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy output 
data from the CUDA device to the host memory\n");err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector C from device to host (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {fprintf(stderr, "Result verification failed at element %d!\n", i);exit(EXIT_FAILURE);}}printf("Test PASSED\n");err = hipFree(d_A);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_B);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_C);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}free(h_A);free(h_B);free(h_C);printf("Done\n");return 0;
}

其他参考选项示例:

指示头文件文件夹

./hipify-clang square.cu --cuda-path=/usr/local/cuda-12.3 -I /usr/local/cuda-12.3/samples/common/inc

指示C++标准

./hipify-clang cpp17.cu --cuda-path=/usr/local/cuda-12.3 -- -std=c++17

多个 .cu 文件一起编译

./hipify-clang cpp17.cu ../../square.cu /home/user/cuda/intro.cu --cuda-path=/usr/local/cuda-12.3 -- -std=c++17

统计修改的信息

$ /home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/hipify/bin/hipify-clang vectorAdd.cu --cuda-path=/usr/local/cuda-12.3 --clang-resource-directory=/home/hipper/llvm_3_4_0_ex/browse_llvm_17/local_d/lib/clang/18 --print-stats

将 统计信息存入 .csv文件中

将 --print-stats 改成 --print-stats-csv

遗留问题

llvmorg-18.1.0-rc4 release 版本的如下配置有问题:

cmake \
-DCMAKE_INSTALL_PREFIX=../../local \
-DLLVM_SOURCE_DIR=../llvm \
-DLLVM_ENABLE_PROJECTS="bolt;clang;clang-tools-extra;cross-project-tests;libclc;lld;mlir;polly;flang"  \
-DLLVM_ENABLE_RUNTIMES="libc;libunwind;libcxxabi;pstl;libcxx;compiler-rt;openmp"      \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX"  \
-DLLVM_INCLUDE_TESTS=OFF \
-DCMAKE_BUILD_TYPE=Release \
../llvm

lldb;

貌似拿掉 libc 就能行

相关文章:

  • Java的控制流语句详解
  • 网络通信另个角度的认识(进程间通信),端口号(为什么要有,和pid的关系,如何封装,和进程的定位原理+对应关系)客户端如何拿到服务端的port
  • 数据结构奇妙旅程之二叉平衡树进阶---AVL树
  • scrapy的基本使用介绍
  • CUDA入门之统一内存
  • 学习大数据,所需要Java基础(9)
  • taosdb快速入门
  • Docker的基本概念和优势
  • 【鸿蒙 HarmonyOS 4.0】常用组件:List/Grid/Tabs
  • 常见doc命令使用
  • 2024蓝桥杯每日一题(二分)
  • torchrun常见参数
  • 【论文阅读】ACM MM 2023 PatchBackdoor:不修改模型的深度神经网络后门攻击
  • 颜色检测python项目
  • xlsx.js读取本地文件,按行转成数组数据
  • python3.6+scrapy+mysql 爬虫实战
  • 《Javascript高级程序设计 (第三版)》第五章 引用类型
  • 【162天】黑马程序员27天视频学习笔记【Day02-上】
  • 【跃迁之路】【519天】程序员高效学习方法论探索系列(实验阶段276-2018.07.09)...
  • Docker 笔记(2):Dockerfile
  • el-input获取焦点 input输入框为空时高亮 el-input值非法时
  • JavaScript服务器推送技术之 WebSocket
  • JDK 6和JDK 7中的substring()方法
  • node和express搭建代理服务器(源码)
  • Spark VS Hadoop:两大大数据分析系统深度解读
  • vue2.0项目引入element-ui
  • 半理解系列--Promise的进化史
  • 蓝海存储开关机注意事项总结
  • 面试总结JavaScript篇
  • 爬虫模拟登陆 SegmentFault
  • 前端每日实战:61# 视频演示如何用纯 CSS 创作一只咖啡壶
  • 手写双向链表LinkedList的几个常用功能
  • 数据结构java版之冒泡排序及优化
  • 算法-插入排序
  • 原生JS动态加载JS、CSS文件及代码脚本
  • AI算硅基生命吗,为什么?
  • 阿里云服务器如何修改远程端口?
  • 没有任何编程基础可以直接学习python语言吗?学会后能够做什么? ...
  • ​LeetCode解法汇总1410. HTML 实体解析器
  • ​学习一下,什么是预包装食品?​
  • (0)Nginx 功能特性
  • (30)数组元素和与数字和的绝对差
  • (LeetCode) T14. Longest Common Prefix
  • (第二周)效能测试
  • (紀錄)[ASP.NET MVC][jQuery]-2 純手工打造屬於自己的 jQuery GridView (含完整程式碼下載)...
  • (蓝桥杯每日一题)平方末尾及补充(常用的字符串函数功能)
  • (牛客腾讯思维编程题)编码编码分组打印下标题目分析
  • (切换多语言)vantUI+vue-i18n进行国际化配置及新增没有的语言包
  • (一)kafka实战——kafka源码编译启动
  • (转)c++ std::pair 与 std::make
  • (转)甲方乙方——赵民谈找工作
  • .net6+aspose.words导出word并转pdf
  • .php文件都打不开,打不开php文件怎么办
  • @GlobalLock注解作用与原理解析
  • []error LNK2001: unresolved external symbol _m