Eigen and SIMD

Table of Contents

1. Eigen and SIMD

1.1. How eigen dispatches to SIMD

#include <Eigen/Dense>
#include <iostream>

using Eigen::Vector4f;

int main() {
    Vector4f a, b;
    a << 1, 2, 3, 4;
    b << 1, 2, 3, 4;
    a += b;
    std::cout << a << std::endl;
}

Here `a += b` ends up in eigen's dense_assignment_loop, which dispatches to different implementations depending on the Traversal.

template<typename Kernel,
         int Traversal = Kernel::AssignmentTraits::Traversal,
         int Unrolling = Kernel::AssignmentTraits::Unrolling>
struct dense_assignment_loop;
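
For readers less familiar with this pattern: the dispatch is ordinary partial specialization on a non-type template parameter. Below is a minimal standalone sketch of the same idea (illustrative names only, not Eigen's actual code):

#include <cstdio>

enum { DefaultTraversal = 0, LinearTraversal = 1, InnerVectorizedTraversal = 2 };

// Primary template: declared but never defined, only the specializations are used.
template <typename Kernel, int Traversal>
struct assignment_loop;

// Picked when Traversal == LinearTraversal.
template <typename Kernel>
struct assignment_loop<Kernel, LinearTraversal> {
    static void run(Kernel&) { std::puts("scalar loop"); }
};

// Picked when Traversal == InnerVectorizedTraversal.
template <typename Kernel>
struct assignment_loop<Kernel, InnerVectorizedTraversal> {
    static void run(Kernel&) { std::puts("packet loop"); }
};

struct DummyKernel {};

int main() {
    DummyKernel k;
    assignment_loop<DummyKernel, LinearTraversal>::run(k);           // "scalar loop"
    assignment_loop<DummyKernel, InnerVectorizedTraversal>::run(k);  // "packet loop"
}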

1.1.1. Non-SIMD implementation

When Traversal is LinearTraversal, the non-SIMD version is used:

// partial specialization of the template above
template <typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling> {
    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
        const Index size = kernel.size();
        for (Index i = 0; i < size; ++i) kernel.assignCoeff(i);
    }
};

template <typename DstScalar, typename SrcScalar>
struct add_assign_op {
    // the scalar path of the functor is a plain a += b
    void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
};

backtrace (frame #3 shows the dense_assignment_loop<..., LinearTraversal, CompleteUnrolling> specialization; the loop is fully unrolled here, but each step still ends in the scalar assignCoeff):

#0  Eigen::internal::add_assign_op<float, float>::assignCoeff (this=0x7fffffffc167, a=@0x7fffffffc1c0: 1, b=@0x7fffffffc1d0: 1) at eigen-3.4.0/Eigen/src/Core/functors/AssignmentFunctors.h:50
#1  0x0000555555557099 in Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>::assignCoeff (this=0x7fffffffc060, index=0) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:660
#2  0x0000555555556fc7 in Eigen::internal::copy_using_evaluator_LinearTraversal_CompleteUnrolling<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 0, 4>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:247
#3  0x0000555555556f3a in Eigen::internal::dense_assignment_loop<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 1, 2>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:528
#4  0x0000555555556df6 in Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., 
    func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:784
#5  0x0000555555556cda in Eigen::internal::Assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float>, Eigen::internal::Dense2Dense, void>::run (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:952
#6  0x0000555555556b99 in Eigen::internal::call_assignment_no_alias<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., 
    func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:888
#7  0x00005555555563b0 in Eigen::internal::call_assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> >(Eigen::Matrix<float, 4, 1, 0, 4, 1>&, Eigen::Matrix<float, 4, 1, 0, 4, 1> const&, Eigen::internal::add_assign_op<float, float> const&, Eigen::internal::enable_if<!Eigen::internal::evaluator_assume_aliasing<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::evaluator_traits<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::Shape>::value, void*>::type) (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:857
#8  0x0000555555555c8e in Eigen::MatrixBase<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::operator+=<Eigen::Matrix<float, 4, 1, 0, 4, 1> > (this=0x7fffffffc1c0, other=...)
    at eigen-3.4.0/Eigen/src/Core/CwiseBinaryOp.h:177
#9  0x000055555555554c in main () at matrix.cc:10
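
One way to force this scalar path in the example program is to disable Eigen's explicit vectorization; EIGEN_DONT_VECTORIZE is a documented configuration macro. A minimal sketch:

// With EIGEN_DONT_VECTORIZE defined before including Eigen, the packet path is
// disabled and `a += b` goes through assignCoeff instead of assignPacket.
#define EIGEN_DONT_VECTORIZE
#include <Eigen/Dense>
#include <iostream>

int main() {
    Eigen::Vector4f a, b;
    a << 1, 2, 3, 4;
    b << 1, 2, 3, 4;
    a += b;
    std::cout << a << std::endl;
}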

1.1.2. SIMD implementation

When Traversal is InnerVectorizedTraversal, the SIMD version is used:

template <typename Kernel>
struct dense_assignment_loop<
    Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
    EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
        typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
        copy_using_evaluator_innervec_CompleteUnrolling<
            Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
    }
};

// run eventually calls into:
template <typename DstScalar, typename SrcScalar>
struct add_assign_op {
    template <int Alignment, typename Packet>
    void assignPacket(DstScalar* a, const Packet& b) const {
        internal::pstoret<DstScalar, Packet, Alignment>(
            a, internal::padd(internal::ploadt<Packet, Alignment>(a), b));
    }
};

// internal::padd is defined by each arch:
template <>
Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
    return _mm_add_ps(a, b);
}
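
As a rough standalone equivalent of the assignPacket above (plain SSE intrinsics, outside Eigen), the whole `a += b` for a Vector4f boils down to one load/add/store sequence:

#include <xmmintrin.h>
#include <cstdio>

int main() {
    alignas(16) float a[4] = {1, 2, 3, 4};
    alignas(16) float b[4] = {1, 2, 3, 4};
    __m128 pa = _mm_load_ps(a);           // ploadt -> _mm_load_ps (aligned load)
    __m128 pb = _mm_load_ps(b);
    _mm_store_ps(a, _mm_add_ps(pa, pb));  // padd + pstoret -> _mm_add_ps + _mm_store_ps
    std::printf("%g %g %g %g\n", a[0], a[1], a[2], a[3]);  // 2 4 6 8
}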

backtrace (frame #5 shows the dense_assignment_loop<..., InnerVectorizedTraversal, CompleteUnrolling> specialization):

#0  Eigen::internal::padd<float __vector(4)>(float __vector(4) const&, float __vector(4) const&) (a=..., b=...) at eigen-3.4.0/Eigen/src/Core/arch/SSE/PacketMath.h:291
#1  0x000055555555730b in Eigen::internal::add_assign_op<float, float>::assignPacket<16, float __vector(4)>(float*, float __vector(4) const&) const (this=0x7fffffffc167, a=0x7fffffffc1c0, b=...)
at eigen-3.4.0/Eigen/src/Core/functors/AssignmentFunctors.h:54
#2  0x00005555555571f6 in Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>::assignPacket<16, 16, float __vector(4)>(long, long) (this=0x7fffffffc060, row=0, col=0) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:675
#3  0x0000555555557123 in Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>::assignPacketByOuterInner<16, 16, float __vector(4)>(long, long) (this=0x7fffffffc060, outer=0, inner=0) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:689
#4  0x0000555555557056 in Eigen::internal::copy_using_evaluator_innervec_CompleteUnrolling<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 0, 4>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:279
#5  0x0000555555556fc4 in Eigen::internal::dense_assignment_loop<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 2, 2>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:489
#6  0x0000555555556e80 in Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., 
func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:784
#7  0x0000555555556d64 in Eigen::internal::Assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float>, Eigen::internal::Dense2Dense, void>::run (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:952
#8  0x0000555555556c23 in Eigen::internal::call_assignment_no_alias<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., 
func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:888
#9  0x000055555555643a in Eigen::internal::call_assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> >(Eigen::Matrix<float, 4, 1, 0, 4, 1>&, Eigen::Matrix<float, 4, 1, 0, 4, 1> const&, Eigen::internal::add_assign_op<float, float> const&, Eigen::internal::enable_if<!Eigen::internal::evaluator_assume_aliasing<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::evaluator_traits<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::Shape>::value, void*>::type) (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:857
#10 0x0000555555555d18 in Eigen::MatrixBase<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::operator+=<Eigen::Matrix<float, 4, 1, 0, 4, 1> > (this=0x7fffffffc1c0, other=...)
at eigen-3.4.0/Eigen/src/Core/CwiseBinaryOp.h:177
#11 0x000055555555554c in main () at matrix.cc:10

1.1.3. Traversal

The value of Traversal depends on the HasXXX enum values defined in the arch.

AssignEvaluator.h:

template <
    typename DstEvaluator, typename SrcEvaluator, typename AssignFunc,
    int MaxPacketSize = -1>
struct copy_using_evaluator_traits {
   private:
    enum {
    MightVectorize = // ...
                  && bool(functor_traits<AssignFunc>::PacketAccess),
                                                    //~~~~~~~~~~~~~~ 
    MayInnerVectorize  = MightVectorize && // ...
    // ...
    };

   public:
    enum {
        Traversal = int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal)
                  : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
                  // ...
                  : int(MayLinearize)        ? int(LinearTraversal)
                                             : int(DefaultTraversal),
    };
};

AssignmentFunctors.h:

template<typename DstScalar,typename SrcScalar>
struct functor_traits<add_assign_op<DstScalar,SrcScalar> > {
  enum {
    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasAdd
                                                                                  //~~~~~~~ 
  };
};

PacketMath.h@SSE:

template <>
struct packet_traits<Eigen::half> : default_packet_traits {
    enum {
        // ...
        HasAdd = 1,
        // ...
    };
    // ...
};
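
One way to see which flags the active arch advertises is to print these enums directly. Note that packet_traits lives in Eigen's internal namespace and is not a stable API, so treat this only as a debugging sketch:

#include <Eigen/Dense>
#include <iostream>

int main() {
    // packet_traits<float> comes from the active arch backend (SSE/AVX/NEON/...).
    using Traits = Eigen::internal::packet_traits<float>;
    std::cout << "Vectorizable = " << int(Traits::Vectorizable) << "\n"
              << "size         = " << int(Traits::size) << "\n"
              << "HasAdd       = " << int(Traits::HasAdd) << std::endl;
}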

1.2. SIMD instructions eigen needs

https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html

Taking SSE as an example, the SIMD instructions eigen relies on are listed below.

1.2.1. Instructions needed by ops

An op (e.g. add_assign_op) has both a non-SIMD and a SIMD implementation, and the arch must define the HasXXX enum values to tell eigen which one to use.

1.2.1.1. assign_op

  non-simd   internal   simd
  a = b      pstoret    _mm_store_ps

1.2.1.2. (add,sub,mul,div)_assign_op

  non-simd         internal             simd
  a (+,-,*,/)= b   pstoret              _mm_store_ps
                   pload                _mm_load_ps
                   p(add,sub,mul,div)   _mm_(add,sub,mul,div)_ps

1.2.1.3. scalar_boolean_(and,or,xor)_op

  non-simd        internal        simd
  a (&&,||,^) b   p(and,or,xor)   _mm_(and,or,xor)_ps

1.2.1.4. scalar_opposite_op

  non-simd   internal   simd
  -a         pnegate    _mm_castsi128_ps
                        _mm_xor_ps

1.2.1.5. scalar_abs_op

  non-simd   internal   simd
  abs(a)     pabs       _mm_castsi128_ps
                        _mm_and_ps

1.2.1.6. scalar_shift_right_op

  non-simd   internal                  simd
  a >> N     parithmetic_shift_right   _mm_srai_epi32

1.2.1.7. scalar_shift_left_op

  non-simd   internal              simd
  a << N     plogical_shift_left   _mm_slli_epi32

1.2.1.8. scalar_exp_op

  non-simd   internal   simd
  exp(a)     pexp       part of MathFunctions; relies on an approximation formula and
                        uses pmin, pmax, pmadd, pmul, etc.

1.2.1.9. scalar_expm1_op

  non-simd   internal   simd
  exp(a)-1   pexpm1     part of MathFunctions

1.2.1.10. scalar_log_op

  non-simd   internal   simd
  log(a)     plog       part of MathFunctions

1.2.1.11. scalar_log1p_op

  non-simd   internal   simd
  log(1+a)   plog1p     part of MathFunctions

1.2.1.12. scalar_log(10,2)_op

  non-simd       internal     simd
  log(10,2)(a)   plog(10,2)   part of MathFunctions

1.2.1.13. scalar_sqrt_op

  non-simd   internal   simd
  sqrt(a)    psqrt      part of MathFunctions, but can make use of _mm_rsqrt_ps

1.2.1.14. scalar_(sin,cos,tan,asin,acos,atan,tanh,atanh,sinh,asinh,cosh,acosh)_op

1.2.1.15. scalar_(round,floor,rint,ceil)_op

  non-simd                     internal                   simd
  (round,floor,rint,ceil)(a)   p(round,floor,rint,ceil)   _mm_(round,floor,round,ceil)_ps,
                                                          or indirectly via pand, psum, _mm_cmpgt_ps

1.2.1.16. scalar_(min,max)_op

  non-simd         internal     simd
  (min,max)(a,b)   p(min,max)   _mm_(min,max)_ps
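
As a concrete illustration of one row above, the scalar_abs_op mapping (pabs via _mm_castsi128_ps and _mm_and_ps) is the usual clear-the-sign-bit trick; a standalone sketch with raw SSE2 intrinsics:

#include <emmintrin.h>
#include <cstdio>

int main() {
    __m128 v    = _mm_set_ps(-4.f, 3.f, -2.f, 1.f);
    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));  // all bits except the sign bit
    __m128 absv = _mm_and_ps(v, mask);                           // |v| lane by lane
    float out[4];
    _mm_storeu_ps(out, absv);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
}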

1.2.2. Other instructions

There are also packet primitives that do not correspond to any op; they are mainly called indirectly from other code. For example, pmadd is mostly used by the gebp kernel.

1.2.2.1. pcmp_(le,lt,eq)

_mm_cmp(le,lt,eq)_ps

1.2.2.2. peven_mask

_mm_set_epi32

1.2.2.3. pzero

_mm_setzero_ps

1.2.2.4. pselect

_mm_blendv_ps
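
For example, a pselect-style blend (together with pcmp_lt from above) can be sketched with raw SSE4.1 intrinsics; this only illustrates the instructions, it is not Eigen's code (compile with -msse4.1):

#include <smmintrin.h>   // SSE4.1 for _mm_blendv_ps
#include <cstdio>

int main() {
    __m128 a    = _mm_set_ps(8.f, 1.f, 6.f, 3.f);
    __m128 b    = _mm_set_ps(2.f, 7.f, 4.f, 5.f);
    __m128 mask = _mm_cmplt_ps(a, b);         // pcmp_lt: all-ones where a < b
    __m128 res  = _mm_blendv_ps(b, a, mask);  // pselect: a where mask is set, else b => min(a,b)
    float out[4];
    _mm_storeu_ps(out, res);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 3 4 1 2
}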

1.2.2.5. pmadd

_mm_fmadd_ps

1.2.2.6. paddsub

_mm_addsub_ps

1.2.2.7. pandnot

_mm_andnot_ps

1.3. SIMD instructions the GEBP kernel needs

GEBP (GEneral Block Panel kernel) is eigen's own implementation of GEMM (GEneral Matrix Multiply).

1.3.1. madd

madd is the `c = a*b + c` operation; gebp_kernel relies on it to speed up GEMM:

template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void madd_impl(
    const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
    RhsPacketType& tmp, const true_type&) const {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    c.v = pmadd(a, b.v, c.v);
#else
    tmp = b;
    tmp.v = pmul(a, tmp.v);
    c = padd(c, tmp);
#endif
}

If the arch supports a madd-style instruction such as FMA, it should define EIGEN_HAS_SINGLE_INSTRUCTION_MADD and provide an implementation of pmadd.
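
For reference, on x86 the single-instruction form corresponds to the FMA intrinsics; a standalone sketch (compile with -mfma to take the fused branch):

#include <immintrin.h>

// c = a*b + c: one fused instruction with FMA, otherwise a mul followed by an add.
static inline __m128 madd4f(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
    return _mm_fmadd_ps(a, b, c);
#else
    return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}

int main() {
    __m128 a = _mm_set1_ps(2.f), b = _mm_set1_ps(3.f), c = _mm_set1_ps(1.f);
    float out[4];
    _mm_storeu_ps(out, madd4f(a, b, c));  // every lane: 2*3 + 1 = 7
    return out[0] == 7.f ? 0 : 1;
}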

1.3.2. prefetch

gebp_kernel uses prefetch instructions to pre-load data; the arch needs to provide this (e.g. SSE's `_mm_prefetch`), or gcc's `__builtin_prefetch` can be used instead.
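
A standalone sketch of what such a prefetch wrapper can look like (illustrative only; Eigen's real implementations live under Eigen/src/Core/arch):

#include <xmmintrin.h>

// Hint the CPU to pull *addr into cache before it is actually accessed.
template <typename Scalar>
inline void my_prefetch(const Scalar* addr) {
#ifdef __SSE__
    _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);  // SSE prefetch instruction
#elif defined(__GNUC__)
    __builtin_prefetch(addr);                                        // gcc/clang builtin
#endif
}

int main() {
    float data[1024] = {};
    my_prefetch(data);  // purely a performance hint, no functional effect
    return 0;
}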

https://eigen.tuxfamily.org/bz/show_bug.cgi?id=1578

http://eigen.tuxfamily.org/bz/show_bug.cgi?id=953

Backlinks

GCC Prefetch (GCC Prefetch): prefetch can hide the latency of memory accesses. For example, eigen's gebp kernel uses __builtin_prefetch to improve GEMM performance.

1.4. Adding a new arch

  1. Define your own packet_traits, inheriting from default_packet_traits, and adjust it to the SIMD unit's capabilities, e.g. size, the HasXXX flags, etc.
  2. Define the mapping between Packet4f, Packet4i, etc. and the underlying intrinsic types (e.g. __m128, __m128i).
  3. Provide your own implementations of the individual pxxx primitives via C++ template specialization (a standalone sketch of these three steps follows this list). If a primitive is not provided, the generic version from GenericPacketMath is used:
    • For basic operations such as add and sub, GenericPacketMath can still reach SIMD through the gcc vector extensions.
    • For more complex operations such as pselect, GenericPacketMath falls back to basic operations like pand and por.
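
A standalone sketch of these three steps, using stand-in names (MyPacket4f, my_packet_traits, my_padd are illustrative, not Eigen's identifiers) and SSE playing the role of the "new" arch:

#include <xmmintrin.h>
#include <cstdio>

typedef __m128 MyPacket4f;  // step 2: map the packet type to the intrinsic type

struct my_default_packet_traits { enum { HasAdd = 1, size = 1 }; };

template <typename T> struct my_packet_traits : my_default_packet_traits {};
template <> struct my_packet_traits<float> : my_default_packet_traits {  // step 1
    typedef MyPacket4f type;
    enum { size = 4 };
};

// step 3: specialize the pxxx primitives for the new packet type
template <typename Packet> Packet my_padd(const Packet& a, const Packet& b);
template <> MyPacket4f my_padd<MyPacket4f>(const MyPacket4f& a, const MyPacket4f& b) {
    return _mm_add_ps(a, b);
}

int main() {
    MyPacket4f a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);
    MyPacket4f b = _mm_set1_ps(10.f);
    float out[4];
    _mm_storeu_ps(out, my_padd(a, b));
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 11 12 13 14
}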

Backlinks

Tensorflow Architecture: Parallism (Tensorflow Architecture: Parallism > kernels > eigen): By default tensorflow uses eigen. eigen is a C++ template-based linear algebra library, similar in functionality to BLAS (Basic Linear Algebra Subprograms). The features it supports include:

Tensorflow Architecture: Parallism (Tensorflow Architecture: Parallism > Parallelism > follow-up work): Use RISC-V vector or SIMD instructions to accelerate eigen's CPU device; `Eigen/src/Core/arch` already includes NEON, SSE, AVX, AVX512, etc.

Author: [email protected]
Date: 2022-11-17 Thu 15:19
Last updated: 2022-12-06 Tue 19:30

Creative Commons License