DPC++

Table of Contents

1. DPC++

1.1. How DPC++ Compiler Works In Details

Use a Third-Party Compiler as a Host Compiler for DPC++ Code

https://github.com/compiler-explorer/compiler-explorer/issues/2244

可以直接使用 `dpcpp hello.cc` 生成最终的 elf, 但这个命令隐藏了许多细节, 实际上, dpc++ 编译时涉及几个步骤:

  1. 生成 llvm bc
  2. bc 转换为 spir
  3. spir 打包到另一个 llvm bc

使用 clang 的 `-ccc-print-phases` 和 `-ccc-print-bindings` 可以看到更详细一些:

$> dpcpp hello.cc -ccc-print-bindings
# "spir64-unknown-unknown-sycldevice" - "clang", inputs: ["hello.cc"], output: "/tmp/hello-6d6ed7.bc"
# "spir64-unknown-unknown-sycldevice" - "SYCL::Linker", inputs: ["/tmp/hello-6d6ed7.bc"], output: "/tmp/hello-14dd3c.bc"
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-crt.o"], outputs: ["/tmp/libsycl-crt-62d5d0.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-complex.o"], outputs: ["/tmp/libsycl-complex-0e4709.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-complex-fp64.o"], outputs: ["/tmp/libsycl-complex-fp64-837d7b.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-cmath.o"], outputs: ["/tmp/libsycl-cmath-ae81c5.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-cmath-fp64.o"], outputs: ["/tmp/libsycl-cmath-fp64-8d96bd.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-fallback-cassert.o"], outputs: ["/tmp/libsycl-fallback-cassert-21ff6c.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-fallback-cstring.o"], outputs: ["/tmp/libsycl-fallback-cstring-f6a3da.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-fallback-complex.o"], outputs: ["/tmp/libsycl-fallback-complex-c5e325.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-fallback-complex-fp64.o"], outputs: ["/tmp/libsycl-fallback-complex-fp64-ed36e0.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-fallback-cmath.o"], outputs: ["/tmp/libsycl-fallback-cmath-b6d5c8.o"]
# "spir64-unknown-unknown-sycldevice" - "offload bundler", inputs: ["/opt/intel/oneapi/compiler/2021.4.0/linux/bin/../lib/libsycl-fallback-cmath-fp64.o"], outputs: ["/tmp/libsycl-fallback-cmath-fp64-2ea1d0.o"]
# "spir64-unknown-unknown-sycldevice" - "SYCL::Linker", inputs: ["/tmp/hello-14dd3c.bc", "/tmp/libsycl-crt-62d5d0.o", "/tmp/libsycl-complex-0e4709.o", "/tmp/libsycl-complex-fp64-837d7b.o", "/tmp/libsycl-cmath-ae81c5.o", "/tmp/libsycl-cmath-fp64-8d96bd.o", "/tmp/libsycl-fallback-cassert-21ff6c.o", "/tmp/libsycl-fallback-cstring-f6a3da.o", "/tmp/libsycl-fallback-complex-c5e325.o", "/tmp/libsycl-fallback-complex-fp64-ed36e0.o", "/tmp/libsycl-fallback-cmath-b6d5c8.o", "/tmp/libsycl-fallback-cmath-fp64-2ea1d0.o"], output: "/tmp/hello-dc7804.bc"
# "spir64-unknown-unknown-sycldevice" - "SYCL post link", inputs: ["/tmp/hello-dc7804.bc"], output: "/tmp/hello-2ca00c.table"
# "spir64-unknown-unknown-sycldevice" - "File table transformation", inputs: ["/tmp/hello-2ca00c.table"], output: "/tmp/hello-6e9957.txt"
# "spir64-unknown-unknown-sycldevice" - "SPIR-V translator", inputs: ["/tmp/hello-6e9957.txt"], output: "/tmp/hello-90e75a.txt"
# "spir64-unknown-unknown-sycldevice" - "File table transformation", inputs: ["/tmp/hello-2ca00c.table", "/tmp/hello-90e75a.txt"], output: "/tmp/hello-f79986.table"
# "spir64-unknown-unknown-sycldevice" - "offload wrapper", inputs: ["/tmp/hello-f79986.table"], output: "/tmp/a-a631e3.o"
# "x86_64-unknown-linux-gnu" - "Append Footer to source", inputs: ["hello.cc"], output: "/tmp/hello-50ff1f.cpp"
# "x86_64-unknown-linux-gnu" - "clang", inputs: ["/tmp/hello-50ff1f.cpp", "/tmp/hello-6d6ed7.bc"], output: "/tmp/hello-c7e503.o"
# "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["/tmp/hello-c7e503.o", "/tmp/a-a631e3.o"], output: "a.out"

1.1.1. sample

#include <CL/sycl.hpp>
#include <iostream>
namespace sycl = cl::sycl;

class kernel_vector_add;
int main(int argc, char* argv[]) {
    sycl::float4 a = {1.0, 2.0, 3.0, 4.0};
    sycl::float4 b = {1.0, 2.0, 3.0, 4.0};
    sycl::float4 c = {0.0, 0.0, 0.0, 0.0};
    sycl::default_selector device_selector;
    sycl::queue queue(device_selector);
    {
        sycl::buffer<sycl::float4, 1> buff_a(&a, sycl::range<1>(1));
        sycl::buffer<sycl::float4, 1> buff_b(&b, sycl::range<1>(1));
        sycl::buffer<sycl::float4, 1> buff_c(&c, sycl::range<1>(1));
        queue.submit([&](sycl::handler& cgh) {
            auto a_acc = buff_a.get_access<sycl::access::mode::read>(cgh);
            auto b_acc = buff_b.get_access<sycl::access::mode::read>(cgh);
            auto c_acc =
                buff_c.get_access<sycl::access::mode::discard_write>(cgh);

            cgh.single_task<class kernel_vector_add>(
                [=]() { c_acc[0] = a_acc[0] + b_acc[0]; });
        });
    }

    std::cout << c.x() << "," << c.y() << "," << c.z() << "," << c.w()
              << std::endl;

    return 0;
}

1.1.2. 针对 kernel 生成 llvm bc

$> dpcpp -fsycl-device-only hello.cc
$> ls
hello-sycl-spir64-unknown-unknown-sycldevice.bc

1.1.3. 把 bc 转换成 spir

$> llvm-spirv hello-sycl-spir64-unknown-unknown-sycldevice.bc
$> ls
hello-sycl-spir64-unknown-unknown-sycldevice.spv

1.1.4. 通过 llvm-offload-wrapper 生成 fat bin

$> clang-offload-wrapper --kind=sycl --target=spir64 hello-sycl-spir64-unknown-unknown-sycldevice.spv -o device.bc
$> clang device.bc -c -fPIC
$> file device.o
device.o: ELF 64-bit LSB relocatable, x86-64, version 1 (GNU/Linux), not stripped

1.1.5. 生成 host 需要的头文件

$> dpcpp -fsycl-device-only -Xclang -fsycl-int-header=device.h hello.cc

1.1.6. 编译 host 部分

$> g++ -std=c++17 -c hello.cc -o host.o -include device.h -fPIC -I /opt/intel/oneapi/compiler/2021.4.0/linux/include/sycl

1.1.7. 链接 device 与 hosts

$> gcc host.o device.o -lsycl -lstdc++

1.2. device-sycl

device-sycl 是一个 clang 的 action, 目的是把源码中的 kernel 等编译成 bc.

例如下面的代码:

#include <CL/sycl.hpp>
#include <iostream>
namespace sycl = cl::sycl;

void bar(){};
int foo() {
    int x = 1;
    x += 1;
    return x;
}

class kernel_vector_add;
int main(int argc, char* argv[]) {
    // ...
    sycl::queue queue(device_selector);
    {
        // ...
        queue.submit([&](sycl::handler& cgh) {
            // ...
            cgh.single_task<class kernel_vector_add>([=]() {
                foo();
                c_acc[0] = a_acc[0] + b_acc[0];
            });
        });
    }
    return 0;
}

single_task 中的 lambda 以及 foo 会被 device-sycl 找到, 做为 kernel 输出到 bc 中, 但 main 和 bar 并不会.

判断函数是否是 kernel 是根据 `sycl_kernel` 这个 attribute

1.2.1. sycl 库会把 kernel 加上 sycl_kernel attribute

匿名的 lambda 会被 sycl 库封装成 `__attribute__(sycl_kernel) …` 的样子

$> dpcpp -fsycl-device-only hello.cc  -E

最终代码为:

template <typename KernelName, typename KernelType>
__attribute__((sycl_kernel)) void kernel_single_task(
    const KernelType &KernelFunc) {
    KernelFunc();
}

template <typename KernelName = detail::auto_name, typename KernelType>

kernel_single_task_wrapper(const KernelType &KernelFunc) {
    detail::CheckDeviceCopyable<KernelType>();

    kernel_single_task<KernelName>(KernelFunc);
}

void single_task(const KernelType &KernelFunc) {
    throwIfActionIsCreated();
    using NameT =
        typename detail::get_kernel_name_t<KernelName, KernelType>::name;
    kernel_single_task_wrapper<NameT>(KernelFunc);
}

1.2.2. sema 解析 attribute

clang/lib/Sema/SemaDeclAttr.cpp:

case ParsedAttr::AT_SYCLKernel:
    handleSYCLKernelAttr(S, D, AL);
    break;

1.2.3. sema 找到 sycl_kernel

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp:

static void processFunctionInstantiation(
    Sema &S, SourceLocation PointOfInstantiation, FunctionDecl *FD,
    bool DefinitionRequired, MangleContext &MC) {
    // ...
    if (S.LangOpts.SYCLIsDevice && FD->hasAttr<SYCLKernelAttr>())
        S.ConstructOpenCLKernel(FD, MC);
    // ...
}

1.2.4. 输出 kernel 到 bitcode

clang/lib/Parse/ParseAST.cpp:

if (S.getLangOpts().SYCLIsDevice) {
    for (Decl *D : S.syclDeviceDecls()) {
        Consumer->HandleTopLevelDecl(DeclGroupRef(D));
    }
}

1.4. SPIR-V

1.4.1. utils

1.4.1.1. llvm-spirv

spv 转换为 llvm bc:

$> llvm-spirv -r hello-sycl-spir64-unknown-unknown-sycldevice.spv

llvm bc 转换为 spv:

$> llvm-spirv hello-sycl-spir64-unknown-unknown-sycldevice.bc
1.4.1.2. spirv visualizer
1.4.1.3. spirv tools
  • spirv-dis
  • spirv-as
  • spirv-link

1.4.2. spir details

; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 60
; Schema: 0
               OpCapability Addresses
               OpCapability Kernel
               OpCapability Int64
               OpCapability GenericPointer
          %1 = OpExtInstImport "OpenCL.std"
               OpMemoryModel Physical64 OpenCL
               OpEntryPoint Kernel %16 "_ZTS17kernel_vector_add"
         %58 = OpString "kernel_arg_type._ZTS17kernel_vector_add.class._ZTSN2cl4sycl3vecIfLi4EEE.cl::sycl::vec*,class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range,class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range,class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id,class._ZTSN2cl4sycl3vecIfLi4EEE.cl::sycl::vec*,class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range,class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range,class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id,class._ZTSN2cl4sycl3vecIfLi4EEE.cl::sycl::vec*,class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range,class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range,class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id,"
         %59 = OpString "kernel_arg_type_qual._ZTS17kernel_vector_add.,,,,,,,,,,,,"
               OpSource OpenCL_CPP 100000
               OpName %class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec "class._ZTSN2cl4sycl3vecIfLi4EEE.cl::sycl::vec"
               OpName %class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range "class._ZTSN2cl4sycl5rangeILi1EEE.cl::sycl::range"
               OpName %class__ZTSN2cl4sycl6detail5arrayILi1EEE_cl__sycl__detail__array "class._ZTSN2cl4sycl6detail5arrayILi1EEE.cl::sycl::detail::array"
               OpName %class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id "class._ZTSN2cl4sycl2idILi1EEE.cl::sycl::id"
               OpDecorate %18 FuncParamAttr ByVal
               OpDecorate %19 FuncParamAttr ByVal
               OpDecorate %20 FuncParamAttr ByVal
               OpDecorate %22 FuncParamAttr ByVal
               OpDecorate %23 FuncParamAttr ByVal
               OpDecorate %24 FuncParamAttr ByVal
               OpDecorate %26 FuncParamAttr ByVal
               OpDecorate %27 FuncParamAttr ByVal
               OpDecorate %28 FuncParamAttr ByVal
               OpDecorate %21 FuncParamAttr NoWrite
               OpDecorate %25 FuncParamAttr NoWrite
               OpDecorate %55 FPFastMathMode Fast
      %ulong = OpTypeInt 64 0
       %uint = OpTypeInt 32 0
    %ulong_1 = OpConstant %ulong 1
    %ulong_0 = OpConstant %ulong 0
     %uint_0 = OpConstant %uint 0
       %void = OpTypeVoid
      %float = OpTypeFloat 32
    %v4float = OpTypeVector %float 4
%class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec = OpTypeStruct %v4float
%_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec = OpTypePointer CrossWorkgroup %class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec
%_arr_ulong_ulong_1 = OpTypeArray %ulong %ulong_1
%class__ZTSN2cl4sycl6detail5arrayILi1EEE_cl__sycl__detail__array = OpTypeStruct %_arr_ulong_ulong_1
%class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range = OpTypeStruct %class__ZTSN2cl4sycl6detail5arrayILi1EEE_cl__sycl__detail__array
%_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range = OpTypePointer Function %class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range
%class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id = OpTypeStruct %class__ZTSN2cl4sycl6detail5arrayILi1EEE_cl__sycl__detail__array
%_ptr_Function_class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id = OpTypePointer Function %class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id
         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range %_ptr_Function_class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range %_ptr_Function_class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range %_ptr_Function_class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id
%_ptr_Function_ulong = OpTypePointer Function %ulong
%_ptr_Generic_ulong = OpTypePointer Generic %ulong
%_ptr_Generic_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec = OpTypePointer Generic %class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec
%_ptr_Generic_v4float = OpTypePointer Generic %v4float
         %16 = OpFunction %void None %15
         %17 = OpFunctionParameter %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec
         %18 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range
         %19 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range
         %20 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id
         %21 = OpFunctionParameter %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec
         %22 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range
         %23 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range
         %24 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id
         %25 = OpFunctionParameter %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec
         %26 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range
         %27 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl5rangeILi1EEE_cl__sycl__range
         %28 = OpFunctionParameter %_ptr_Function_class__ZTSN2cl4sycl2idILi1EEE_cl__sycl__id
         %29 = OpLabel
         %34 = OpInBoundsPtrAccessChain %_ptr_Function_ulong %20 %ulong_0 %uint_0 %uint_0 %ulong_0
         %36 = OpPtrCastToGeneric %_ptr_Generic_ulong %34
         %37 = OpLoad %ulong %36 Aligned 8
         %38 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %17 %37
         %39 = OpInBoundsPtrAccessChain %_ptr_Function_ulong %24 %ulong_0 %uint_0 %uint_0 %ulong_0
         %40 = OpPtrCastToGeneric %_ptr_Generic_ulong %39
         %41 = OpLoad %ulong %40 Aligned 8
         %42 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %21 %41
         %43 = OpInBoundsPtrAccessChain %_ptr_Function_ulong %28 %ulong_0 %uint_0 %uint_0 %ulong_0
         %44 = OpPtrCastToGeneric %_ptr_Generic_ulong %43
         %45 = OpLoad %ulong %44 Aligned 8
         %46 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %25 %45
         %48 = OpPtrCastToGeneric %_ptr_Generic_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %42
         %49 = OpPtrCastToGeneric %_ptr_Generic_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %46
         %51 = OpInBoundsPtrAccessChain %_ptr_Generic_v4float %48 %ulong_0 %uint_0
         %52 = OpLoad %v4float %51 Aligned 16
         %53 = OpInBoundsPtrAccessChain %_ptr_Generic_v4float %49 %ulong_0 %uint_0
         %54 = OpLoad %v4float %53 Aligned 16
         %55 = OpFAdd %v4float %52 %54
         %56 = OpPtrCastToGeneric %_ptr_Generic_class__ZTSN2cl4sycl3vecIfLi4EEE_cl__sycl__vec %38
         %57 = OpInBoundsPtrAccessChain %_ptr_Generic_v4float %56 %ulong_0 %uint_0
               OpStore %57 %55 Aligned 16
               OpReturn
               OpFunctionEnd

Backlinks

OpenMP (OpenMP > libgomp > target): offload 过程与 DPC++, TVM BYOC Codegen 以及 ComputeCpp 类似, 以 nvptx 为例, 主要 步骤是:

Author: [email protected]
Date: 2022-08-03 Wed 14:45
Last updated: 2023-07-25 Tue 19:51

知识共享许可协议