Eigen and SIMD
1. Eigen and SIMD
1.1. How Eigen dispatches to SIMD
```cpp
#include <Eigen/Dense>
#include <iostream>

using Eigen::Vector4f;

int main() {
  Vector4f a, b;
  a << 1, 2, 3, 4;
  b << 1, 2, 3, 4;
  a += b;
  std::cout << a << std::endl;
}
```
Here `a += b` ends up in Eigen's dense_assignment_loop, which dispatches to different implementations depending on the Traversal:
```cpp
template <typename Kernel,
          int Traversal = Kernel::AssignmentTraits::Traversal,
          int Unrolling = Kernel::AssignmentTraits::Unrolling>
struct dense_assignment_loop;
```
1.1.1. Non-SIMD implementation
When Traversal is LinearTraversal, the non-SIMD version is used:
```cpp
// Partial specialization of dense_assignment_loop
template <typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling> {
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
    const Index size = kernel.size();
    for (Index i = 0; i < size; ++i) kernel.assignCoeff(i);
  }
};

template <typename DstScalar, typename SrcScalar>
struct add_assign_op {
  void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; }
};
```
backtrace:
```
#0  Eigen::internal::add_assign_op<float, float>::assignCoeff (this=0x7fffffffc167, a=@0x7fffffffc1c0: 1, b=@0x7fffffffc1d0: 1) at eigen-3.4.0/Eigen/src/Core/functors/AssignmentFunctors.h:50
#1  0x0000555555557099 in Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>::assignCoeff (this=0x7fffffffc060, index=0) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:660
#2  0x0000555555556fc7 in Eigen::internal::copy_using_evaluator_LinearTraversal_CompleteUnrolling<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 0, 4>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:247
#3  0x0000555555556f3a in Eigen::internal::dense_assignment_loop<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 1, 2>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:528
#4  0x0000555555556df6 in Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:784
#5  0x0000555555556cda in Eigen::internal::Assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float>, Eigen::internal::Dense2Dense, void>::run (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:952
#6  0x0000555555556b99 in Eigen::internal::call_assignment_no_alias<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:888
#7  0x00005555555563b0 in Eigen::internal::call_assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> >(Eigen::Matrix<float, 4, 1, 0, 4, 1>&, Eigen::Matrix<float, 4, 1, 0, 4, 1> const&, Eigen::internal::add_assign_op<float, float> const&, Eigen::internal::enable_if<!Eigen::internal::evaluator_assume_aliasing<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::evaluator_traits<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::Shape>::value, void*>::type) (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:857
#8  0x0000555555555c8e in Eigen::MatrixBase<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::operator+=<Eigen::Matrix<float, 4, 1, 0, 4, 1> > (this=0x7fffffffc1c0, other=...) at eigen-3.4.0/Eigen/src/Core/CwiseBinaryOp.h:177
#9  0x000055555555554c in main () at matrix.cc:10
```
1.1.2. SIMD implementation
When Traversal is InnerVectorizedTraversal, the SIMD version is used:
```cpp
template <typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    copy_using_evaluator_innervec_CompleteUnrolling<
        Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
  }
};

// run eventually reaches:
template <typename DstScalar, typename SrcScalar>
struct add_assign_op {
  template <int Alignment, typename Packet>
  void assignPacket(DstScalar* a, const Packet& b) const {
    internal::pstoret<DstScalar, Packet, Alignment>(
        a, internal::padd(internal::ploadt<Packet, Alignment>(a), b));
  }
};

// where internal::padd is defined per arch:
template <>
Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_add_ps(a, b);
}
```
The backtrace is:
```
#0  Eigen::internal::padd<float __vector(4)>(float __vector(4) const&, float __vector(4) const&) (a=..., b=...) at eigen-3.4.0/Eigen/src/Core/arch/SSE/PacketMath.h:291
#1  0x000055555555730b in Eigen::internal::add_assign_op<float, float>::assignPacket<16, float __vector(4)>(float*, float __vector(4) const&) const (this=0x7fffffffc167, a=0x7fffffffc1c0, b=...) at eigen-3.4.0/Eigen/src/Core/functors/AssignmentFunctors.h:54
#2  0x00005555555571f6 in Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>::assignPacket<16, 16, float __vector(4)>(long, long) (this=0x7fffffffc060, row=0, col=0) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:675
#3  0x0000555555557123 in Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>::assignPacketByOuterInner<16, 16, float __vector(4)>(long, long) (this=0x7fffffffc060, outer=0, inner=0) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:689
#4  0x0000555555557056 in Eigen::internal::copy_using_evaluator_innervec_CompleteUnrolling<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 0, 4>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:279
#5  0x0000555555556fc4 in Eigen::internal::dense_assignment_loop<Eigen::internal::generic_dense_assignment_kernel<Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::evaluator<Eigen::Matrix<float, 4, 1, 0, 4, 1> >, Eigen::internal::add_assign_op<float, float>, 0>, 2, 2>::run (kernel=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:489
#6  0x0000555555556e80 in Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:784
#7  0x0000555555556d64 in Eigen::internal::Assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float>, Eigen::internal::Dense2Dense, void>::run (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:952
#8  0x0000555555556c23 in Eigen::internal::call_assignment_no_alias<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> > (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:888
#9  0x000055555555643a in Eigen::internal::call_assignment<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::add_assign_op<float, float> >(Eigen::Matrix<float, 4, 1, 0, 4, 1>&, Eigen::Matrix<float, 4, 1, 0, 4, 1> const&, Eigen::internal::add_assign_op<float, float> const&, Eigen::internal::enable_if<!Eigen::internal::evaluator_assume_aliasing<Eigen::Matrix<float, 4, 1, 0, 4, 1>, Eigen::internal::evaluator_traits<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::Shape>::value, void*>::type) (dst=..., src=..., func=...) at eigen-3.4.0/Eigen/src/Core/AssignEvaluator.h:857
#10 0x0000555555555d18 in Eigen::MatrixBase<Eigen::Matrix<float, 4, 1, 0, 4, 1> >::operator+=<Eigen::Matrix<float, 4, 1, 0, 4, 1> > (this=0x7fffffffc1c0, other=...) at eigen-3.4.0/Eigen/src/Core/CwiseBinaryOp.h:177
#11 0x000055555555554c in main () at matrix.cc:10
```
1.1.3. Traversal
The value of Traversal depends on the HasXXX enum values defined by the arch:
AssignEvaluator.h:
```cpp
template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc,
          int MaxPacketSize = -1>
struct copy_using_evaluator_traits {
 private:
  enum {
    MightVectorize = // ...
        && bool(functor_traits<AssignFunc>::PacketAccess),
                                          //~~~~~~~~~~~~~~
    MayInnerVectorize = MightVectorize && // ...
    // ...
  };

 public:
  enum {
    Traversal = int(Dst::SizeAtCompileTime) == 0 ? int(AllAtOnceTraversal)
              : int(MayInnerVectorize)           ? int(InnerVectorizedTraversal)
              // ...
              : int(MayLinearize)                ? int(LinearTraversal)
                                                 : int(DefaultTraversal),
  };
};
```
AssignmentFunctors.h:
```cpp
template <typename DstScalar, typename SrcScalar>
struct functor_traits<add_assign_op<DstScalar, SrcScalar> > {
  enum {
    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::AddCost,
    PacketAccess = is_same<DstScalar, SrcScalar>::value &&
                   packet_traits<DstScalar>::HasAdd
                                           //~~~~~~~
  };
};
```
PacketMath.h@SSE:
```cpp
template <>
struct packet_traits<float> : default_packet_traits {
  enum {
    // ...
    HasAdd = 1,
    // ...
  };
  // ...
};
```
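Which packet_traits specialization (and therefore which Traversal) actually applies depends on how Eigen was compiled. As a quick check, not part of the original walkthrough, the enabled instruction sets and the resulting float traits can be printed directly; Eigen::SimdInstructionSetsInUse() is a public Eigen helper, while packet_traits lives in the internal namespace:

```cpp
#include <Eigen/Core>
#include <iostream>

int main() {
  // Instruction sets this Eigen build was compiled with, e.g. "SSE, SSE2".
  std::cout << "SIMD in use: " << Eigen::SimdInstructionSetsInUse() << "\n";
  // Packet width and HasAdd flag that copy_using_evaluator_traits ends up seeing.
  std::cout << "float packet size: " << Eigen::internal::packet_traits<float>::size << "\n";
  std::cout << "HasAdd: " << Eigen::internal::packet_traits<float>::HasAdd << "\n";
}
```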
1.2. SIMD instructions Eigen needs
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
Taking SSE as an example, the SIMD instructions Eigen relies on are listed below.
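Before the per-op tables, here is a small sketch (not from the original notes) that calls the internal packet primitives directly. Assuming an SSE build, ploadu, padd and pstoreu map to _mm_loadu_ps, _mm_add_ps and _mm_storeu_ps, and packet_traits<float>::type is Packet4f:

```cpp
#include <Eigen/Core>
#include <iostream>

int main() {
  namespace ei = Eigen::internal;
  // Under SSE this is Packet4f (__m128); with vectorization disabled it is plain float.
  using Packet = ei::packet_traits<float>::type;

  float a[] = {1.f, 2.f, 3.f, 4.f};
  float b[] = {10.f, 20.f, 30.f, 40.f};

  Packet pa = ei::ploadu<Packet>(a);  // unaligned load -> _mm_loadu_ps
  Packet pb = ei::ploadu<Packet>(b);
  ei::pstoreu(a, ei::padd(pa, pb));   // add + store    -> _mm_add_ps, _mm_storeu_ps

  for (float x : a) std::cout << x << ' ';  // 11 22 33 44 with a 4-wide packet
  std::cout << '\n';
}
```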
1.2.1. Instructions needed by ops
An op (e.g. add_assign_op) has both a non-SIMD implementation and a SIMD implementation, and the arch must define the HasXXX enum values to tell Eigen which one to use.
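As an illustration of this two-path pattern, here is a hedged sketch with a made-up functor (scalar_twice_op is not an Eigen op). It mirrors the shape of add_assign_op's scalar/packet split, and the functor_traits specialization is what advertises PacketAccess so that a vectorized traversal can be selected:

```cpp
#include <Eigen/Dense>
#include <iostream>

// Hypothetical example functor: doubles each coefficient.
struct scalar_twice_op {
  // non-SIMD path, used coefficient by coefficient
  float operator()(const float& x) const { return x + x; }
  // SIMD path, used packet by packet (padd maps to _mm_add_ps under SSE)
  template <typename Packet>
  Packet packetOp(const Packet& x) const { return Eigen::internal::padd(x, x); }
};

namespace Eigen { namespace internal {
// Advertise the packet path; without this specialization PacketAccess
// defaults to false and only the scalar path is used.
template <> struct functor_traits<scalar_twice_op> {
  enum { Cost = NumTraits<float>::AddCost, PacketAccess = packet_traits<float>::HasAdd };
};
}}  // namespace Eigen::internal

int main() {
  Eigen::Array4f a(1, 2, 3, 4);
  std::cout << a.unaryExpr(scalar_twice_op()) << std::endl;  // 2 4 6 8
}
```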
1.2.1.1. assign_op
| non-SIMD | internal | SIMD |
|---|---|---|
| a=b | pstoret | _mm_store_ps |
1.2.1.2. (add,sub,mul,div)_assign_op
| non-SIMD | internal | SIMD |
|---|---|---|
| a(+,-,*,/)=b | pstoret | _mm_store_ps |
| | pload | _mm_load_ps |
| | p(add,sub,mul,div) | _mm_(add,sub,mul,div)_ps |
1.2.1.3. scalar_boolean_(and,or,xor)_op
| non-SIMD | internal | SIMD |
|---|---|---|
| a (&&,\|\|,^) b | p(and,or,xor) | _mm_(and,or,xor)_ps |
1.2.1.4. scalar_opposite_op
| non-SIMD | internal | SIMD |
|---|---|---|
| -a | pnegate | _mm_castsi128_ps, _mm_xor_ps |
1.2.1.5. scalar_abs_op
| non-SIMD | internal | SIMD |
|---|---|---|
| abs(a) | pabs | _mm_castsi128_ps, _mm_and_ps |
1.2.1.6. scalar_shift_right_op
| non-SIMD | internal | SIMD |
|---|---|---|
| a>>N | parithmetic_shift_right | _mm_srai_epi32 |
1.2.1.7. scalar_shift_left_op
| non-SIMD | internal | SIMD |
|---|---|---|
| a<<N | plogical_shift_left | _mm_slli_epi32 |
1.2.1.8. scalar_exp_op
| non-SIMD | internal | SIMD |
|---|---|---|
| exp(a) | pexp | part of MathFunctions; relies on an approximation formula and uses pmin, pmax, pmadd, pmul, etc. |
1.2.1.9. scalar_expm1_op
| non-SIMD | internal | SIMD |
|---|---|---|
| exp(a)-1 | pexpm1 | part of MathFunctions |
1.2.1.10. scalar_log_op
| non-SIMD | internal | SIMD |
|---|---|---|
| log(a) | plog | part of MathFunctions |
1.2.1.11. scalar_log1p_op
| non-SIMD | internal | SIMD |
|---|---|---|
| log(1+a) | plog1p | part of MathFunctions |
1.2.1.12. scalar_log(10,2)_op
| non-SIMD | internal | SIMD |
|---|---|---|
| log(10,2)(a) | plog(10,2) | part of MathFunctions |
1.2.1.13. scalar_sqrt_op
| non-SIMD | internal | SIMD |
|---|---|---|
| sqrt(a) | psqrt | part of MathFunctions, but can make use of _mm_rsqrt_ps |
1.2.1.14. scalar_(sin,cos,tan,asin,acos,atan,tanh,atanh,sinh,asinh,cosh,acosh)_op
1.2.1.15. scalar_(round,floor,rint,ceil)_op
| non-SIMD | internal | SIMD |
|---|---|---|
| (round,floor,rint,ceil)(a) | p(round,floor,rint,ceil) | _mm_(round,floor,round,ceil)_ps |
| | | or implemented indirectly with pand, psub, _mm_cmpgt_ps |
1.2.1.16. scalar_(min,max)_op
| non-SIMD | internal | SIMD |
|---|---|---|
| (min,max)(a,b) | p(min,max) | _mm_(min,max)_ps |
1.2.2. Other instructions
There are also some primitives that have no corresponding op; they are mainly called indirectly by other code. For example, pmadd is mainly called by the GEBP kernel.
1.2.2.1. pcmp_(le,lt,eq)
_mm_cmp(le,lt,eq)_ps
1.2.2.2. peven_mask
_mm_set_epi32
1.2.2.3. pzero
_mm_setzero_ps
1.2.2.4. pselect
_mm_blendv_ps
1.2.2.5. pmadd
_mm_fmadd_ps
1.2.2.6. paddsub
_mm_addsub_ps
1.2.2.7. pandnot
_mm_andnot_ps
1.3. SIMD instructions needed by the GEBP kernel
GEBP (GEneral Block Panel kernel) is Eigen's own implementation of GEMM (GEneral Matrix Multiply).
1.3.1. madd
madd is the `c = a*b + c` operation; gebp_kernel relies on it to speed up GEMM:
```cpp
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b,
                                   AccPacketType& c, RhsPacketType& tmp,
                                   const true_type&) const {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
  c.v = pmadd(a, b.v, c.v);
#else
  tmp = b;
  tmp.v = pmul(a, tmp.v);
  c = padd(c, tmp);
#endif
}
```
If the arch supports a madd-style instruction such as FMA, it should define EIGEN_HAS_SINGLE_INSTRUCTION_MADD and provide an implementation of pmadd.
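For reference, a standalone sketch (raw intrinsics, outside Eigen) of the two strategies in the snippet above: with FMA available, `c = a*b + c` is a single fused instruction, otherwise it is a multiply followed by an add.

```cpp
#include <immintrin.h>
#include <cstdio>

static __m128 madd(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
  return _mm_fmadd_ps(a, b, c);            // what pmadd maps to when FMA exists
#else
  return _mm_add_ps(_mm_mul_ps(a, b), c);  // the pmul + padd fallback
#endif
}

int main() {
  __m128 a = _mm_set1_ps(2.f), b = _mm_set1_ps(3.f), c = _mm_set1_ps(1.f);
  float out[4];
  _mm_storeu_ps(out, madd(a, b, c));
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 7 7 7 7
}
```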
1.3.2. prefetch
gebp_kernel uses prefetch instructions to pre-load data; the arch needs to provide them (e.g. SSE's `_mm_prefetch`) or fall back to GCC's `__builtin_prefetch`.
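A minimal sketch of the two routes (the helper name prefetch_float is made up for illustration): use the SSE intrinsic when available, otherwise the GCC/Clang builtin.

```cpp
#ifdef __SSE__
#include <xmmintrin.h>
#endif

// Hypothetical helper illustrating the two prefetch options mentioned above.
inline void prefetch_float(const float* addr) {
#ifdef __SSE__
  _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);  // SSE prefetch into L1
#else
  __builtin_prefetch(addr);                                        // GCC/Clang builtin fallback
#endif
}
```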
Backlinks
GCC Prefetch (GCC Prefetch): prefetch can hide memory-access latency. For example, Eigen's GEBP kernel uses __builtin_prefetch to improve GEMM performance.
1.4. Adding a new arch
- Define your own packet_traits, inheriting from default_packet_traits and adjusting it to the SIMD unit's specs, e.g. size and the HasXXX flags.
- Define the mapping between Packet4f, Packet4i, etc. and the underlying intrinsic types (e.g. __m128, __m128i).
- Provide your own implementations of the various pxxx primitives via C++ template specialization (a skeletal sketch follows this list). If none is provided, the generic version defined in GenericPacketMath is used:
  - For basic operations such as add and sub, GenericPacketMath reaches SIMD through the GCC vector extensions.
  - For more complex operations, e.g. pselect, GenericPacketMath falls back to basic operations such as pand and por.
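Roughly, a new backend's PacketMath.h has the following shape. This is an abridged, schematic sketch loosely modeled on Eigen/src/Core/arch/SSE/PacketMath.h, not a complete drop-in backend; real backends define many more trait fields and pxxx specializations.

```cpp
namespace Eigen {
namespace internal {

// Map Eigen's packet name onto the ISA's vector type.
typedef __m128 Packet4f;

// Advertise the packet type and the supported operations for float.
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasAdd = 1,  // ops with a packet implementation
    HasExp = 0,  // unsupported ops fall back to the scalar path
    // ...
  };
};

template <>
struct unpacket_traits<Packet4f> {
  typedef float type;
  enum { size = 4, alignment = Aligned16 /* ... */ };
};

// Specialize the pxxx primitives for this packet type.
template <>
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_add_ps(a, b);
}

}  // namespace internal
}  // namespace Eigen
```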
Backlinks
Tensorflow Architecture: Parallism (Tensorflow Architecture: Parallism > kernels > eigen): By default TensorFlow uses Eigen, a C++ template-based linear algebra library similar in purpose to BLAS (Basic Linear Algebra Subprograms). Its supported features include:
Tensorflow Architecture: Parallism (Tensorflow Architecture: Parallism > Parallelism > follow-up work): Use RISC-V vector or SIMD instructions to accelerate Eigen's CPU device; `Eigen/src/Core/arch` already contains NEON, SSE, AVX, AVX512, etc.