TVM Relay IR
Table of Contents
1. TVM Relay IR
tvm/docs/dev/relay_intro.rst
1.1. 编译一个简单的 function
1.1.1. 使用 Relay IR 定义一个 function
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# 2021-08-03 11:11
import tvm
from tvm import relay

# Declare the single input: a float32 tensor of shape (1, 1000) named "x".
x = relay.var("x", shape=(1, 1000), dtype="float32")

# The function body adds the input to itself.
y = relay.add(x, x)

# Wrap the expression in a Relay function whose only parameter is x.
func = relay.Function([x], y)

# Build an IRModule from the function and print its text form.
mod = tvm.IRModule.from_expr(func)
print(mod)
def @main(%x: Tensor[(1, 1000), float32]) { add(%x, %x) }
1.1.2. C Target
# Build the module for the "c" target, then print the graph description
# and the source text held by the resulting library.
c_target = "c -mcpu=skylake-avx512"
with tvm.transform.PassContext(opt_level=3):
    graph, lib, params = relay.build(mod, target=c_target, params=None)
print(graph)
print(lib.get_source())
// tvm target: c -keys=cpu -link-params=0 -mcpu=skylake-avx512
#define TVM_EXPORTS
#include "tvm/runtime/c_runtime_api.h"
#include "tvm/runtime/c_backend_api.h"
#include <math.h>
#ifdef __cplusplus
extern "C"
#endif
/*
 * Generated entry point for the fused add operator.
 *
 * args          - array of TVMValue; elements 0 and 1 are read as DLTensor handles.
 * arg_type_ids  - array of int32_t type codes, one per argument.
 * num_args, out_ret_value, out_ret_tcode, resource_handle
 *               - accepted but not used in this body.
 *
 * Stores placeholder[i] + placeholder[i] into T_add[i] for every i in
 * [0, 1000) and returns 0.
 */
TVM_DLL int32_t tvmgen_default_fused_add(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* out_ret_tcode, void* resource_handle) {
  /* Unpack argument 0: the tensor that the loop reads from. */
  void* arg0 = (((TVMValue*)args)[0].v_handle);
  int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
  /* Unpack argument 1: the tensor that the loop writes to. */
  void* arg1 = (((TVMValue*)args)[1].v_handle);
  int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
  /* Pull the data pointer and layout fields out of each DLTensor. */
  void* placeholder = (((DLTensor*)arg0)[0].data);
  void* arg0_shape = (((DLTensor*)arg0)[0].shape);
  void* arg0_strides = (((DLTensor*)arg0)[0].strides);
  int32_t dev_id = (((DLTensor*)arg0)[0].device.device_id);
  void* T_add = (((DLTensor*)arg1)[0].data);
  void* arg1_shape = (((DLTensor*)arg1)[0].shape);
  void* arg1_strides = (((DLTensor*)arg1)[0].strides);
  /*
   * Stride checks. These must compare with `==`; a single `=` would assign
   * NULL to the pointer instead of testing it. The bodies are empty in this
   * listing, so the checks have no further effect here.
   */
  if (!(arg0_strides == NULL)) {
  }
  if (!(arg1_strides == NULL)) {
  }
  /*
   * The loop is split into 63 outer steps of 16 inner steps (1008 indices);
   * the bound check skips the 8 indices at and beyond 1000.
   */
  for (int32_t ax1_outer = 0; ax1_outer < 63; ++ax1_outer) {
    for (int32_t ax1_inner_s = 0; ax1_inner_s < 16; ++ax1_inner_s) {
      if (((ax1_outer * 16) + ax1_inner_s) < 1000) {
        ((float*)T_add)[(((ax1_outer * 16) + ax1_inner_s))] = (((float*)placeholder)[(((ax1_outer * 16) + ax1_inner_s))] + ((float*)placeholder)[(((ax1_outer * 16) + ax1_inner_s))]);
      }
    }
  }
  return 0;
}
1.1.3. LLVM Target
with tvm.transform.PassContext(opt_level=3):
    graph, lib, params = relay.build(
        mod,
        target="llvm -mcpu=skylake-avx512",
        params=None,
    )
# Save the built library as an object file so it can be disassembled.
lib.save("/tmp/a.o")
/tmp/ipykernel_30923/3795460359.py:2: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage. graph, lib, params = relay.build(
objdump -d /tmp/a.o
/tmp/a.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <tvmgen_default_fused_add>: 0: 50 push %rax 1: 83 fa 02 cmp $0x2,%edx 4: 0f 85 70 01 00 00 jne 17a <tvmgen_default_fused_add+0x17a> a: 8b 06 mov (%rsi),%eax c: 83 f8 0d cmp $0xd,%eax f: 0f 87 3e 01 00 00 ja 153 <tvmgen_default_fused_add+0x153> 15: b9 98 20 00 00 mov $0x2098,%ecx 1a: 0f a3 c1 bt %eax,%ecx 1d: 0f 83 30 01 00 00 jae 153 <tvmgen_default_fused_add+0x153> 23: 8b 46 04 mov 0x4(%rsi),%eax 26: 83 f8 0d cmp $0xd,%eax 29: 0f 87 34 01 00 00 ja 163 <tvmgen_default_fused_add+0x163> 2f: b9 98 20 00 00 mov $0x2098,%ecx 34: 0f a3 c1 bt %eax,%ecx 37: 0f 83 26 01 00 00 jae 163 <tvmgen_default_fused_add+0x163> 3d: 48 8b 0f mov (%rdi),%rcx 40: 83 79 10 02 cmpl $0x2,0x10(%rcx) 44: 0f 85 40 01 00 00 jne 18a <tvmgen_default_fused_add+0x18a> 4a: 66 83 79 16 01 cmpw $0x1,0x16(%rcx) 4f: 0f 85 45 01 00 00 jne 19a <tvmgen_default_fused_add+0x19a> 55: 80 79 15 20 cmpb $0x20,0x15(%rcx) 59: 0f 85 3b 01 00 00 jne 19a <tvmgen_default_fused_add+0x19a> 5f: 80 79 14 02 cmpb $0x2,0x14(%rcx) 63: 0f 85 31 01 00 00 jne 19a <tvmgen_default_fused_add+0x19a> 69: 48 8b 41 18 mov 0x18(%rcx),%rax 6d: 83 38 01 cmpl $0x1,(%rax) 70: 0f 85 34 01 00 00 jne 1aa <tvmgen_default_fused_add+0x1aa> 76: 81 78 08 e8 03 00 00 cmpl $0x3e8,0x8(%rax) 7d: 0f 85 37 01 00 00 jne 1ba <tvmgen_default_fused_add+0x1ba> 83: 48 8b 47 08 mov 0x8(%rdi),%rax 87: 48 8b 31 mov (%rcx),%rsi 8a: 48 8b 51 20 mov 0x20(%rcx),%rdx 8e: 44 8b 41 0c mov 0xc(%rcx),%r8d 92: 48 8b 38 mov (%rax),%rdi 95: 4c 8b 48 18 mov 0x18(%rax),%r9 99: 4c 8b 50 20 mov 0x20(%rax),%r10 9d: 48 85 d2 test %rdx,%rdx a0: 74 16 je b8 <tvmgen_default_fused_add+0xb8> a2: 81 3a e8 03 00 00 cmpl $0x3e8,(%rdx) a8: 0f 85 1c 01 00 00 jne 1ca <tvmgen_default_fused_add+0x1ca> ae: 83 7a 08 01 cmpl $0x1,0x8(%rdx) b2: 0f 85 12 01 00 00 jne 1ca <tvmgen_default_fused_add+0x1ca> b8: 48 83 79 28 00 cmpq $0x0,0x28(%rcx) bd: 0f 85 17 01 00 00 jne 1da <tvmgen_default_fused_add+0x1da> c3: 83 79 08 01 cmpl $0x1,0x8(%rcx) c7: 0f 85 1d 01 00 00 jne 1ea 
<tvmgen_default_fused_add+0x1ea> cd: 83 78 10 02 cmpl $0x2,0x10(%rax) d1: 0f 85 26 01 00 00 jne 1fd <tvmgen_default_fused_add+0x1fd> d7: 66 83 78 16 01 cmpw $0x1,0x16(%rax) dc: 0f 85 2e 01 00 00 jne 210 <tvmgen_default_fused_add+0x210> e2: 80 78 15 20 cmpb $0x20,0x15(%rax) e6: 0f 85 24 01 00 00 jne 210 <tvmgen_default_fused_add+0x210> ec: 80 78 14 02 cmpb $0x2,0x14(%rax) f0: 0f 85 1a 01 00 00 jne 210 <tvmgen_default_fused_add+0x210> f6: 41 83 39 01 cmpl $0x1,(%r9) fa: 0f 85 23 01 00 00 jne 223 <tvmgen_default_fused_add+0x223> 100: 41 81 79 08 e8 03 00 cmpl $0x3e8,0x8(%r9) 107: 00 108: 0f 85 28 01 00 00 jne 236 <tvmgen_default_fused_add+0x236> 10e: 4d 85 d2 test %r10,%r10 111: 74 18 je 12b <tvmgen_default_fused_add+0x12b> 113: 41 81 3a e8 03 00 00 cmpl $0x3e8,(%r10) 11a: 0f 85 29 01 00 00 jne 249 <tvmgen_default_fused_add+0x249> 120: 41 83 7a 08 01 cmpl $0x1,0x8(%r10) 125: 0f 85 1e 01 00 00 jne 249 <tvmgen_default_fused_add+0x249> 12b: 48 83 78 28 00 cmpq $0x0,0x28(%rax) 130: 0f 85 26 01 00 00 jne 25c <tvmgen_default_fused_add+0x25c> 136: 83 78 08 01 cmpl $0x1,0x8(%rax) 13a: 0f 85 2f 01 00 00 jne 26f <tvmgen_default_fused_add+0x26f> 140: 44 3b 40 0c cmp 0xc(%rax),%r8d 144: 0f 85 38 01 00 00 jne 282 <tvmgen_default_fused_add+0x282> 14a: e8 51 01 00 00 callq 2a0 <tvmgen_default_fused_add_compute_> 14f: 31 c0 xor %eax,%eax 151: 59 pop %rcx 152: c3 retq 153: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 15a <tvmgen_default_fused_add+0x15a> 15a: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 161 <tvmgen_default_fused_add+0x161> 161: eb 0e jmp 171 <tvmgen_default_fused_add+0x171> 163: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 16a <tvmgen_default_fused_add+0x16a> 16a: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 171 <tvmgen_default_fused_add+0x171> 171: ff 10 callq *(%rax) 173: b8 ff ff ff ff mov $0xffffffff,%eax 178: 59 pop %rcx 179: c3 retq 17a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 181 <tvmgen_default_fused_add+0x181> 181: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 188 
<tvmgen_default_fused_add+0x188> 188: eb e7 jmp 171 <tvmgen_default_fused_add+0x171> 18a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 191 <tvmgen_default_fused_add+0x191> 191: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 198 <tvmgen_default_fused_add+0x198> 198: eb d7 jmp 171 <tvmgen_default_fused_add+0x171> 19a: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 1a1 <tvmgen_default_fused_add+0x1a1> 1a1: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 1a8 <tvmgen_default_fused_add+0x1a8> 1a8: eb c7 jmp 171 <tvmgen_default_fused_add+0x171> 1aa: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 1b1 <tvmgen_default_fused_add+0x1b1> 1b1: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 1b8 <tvmgen_default_fused_add+0x1b8> 1b8: eb b7 jmp 171 <tvmgen_default_fused_add+0x171> 1ba: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 1c1 <tvmgen_default_fused_add+0x1c1> 1c1: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 1c8 <tvmgen_default_fused_add+0x1c8> 1c8: eb a7 jmp 171 <tvmgen_default_fused_add+0x171> 1ca: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 1d1 <tvmgen_default_fused_add+0x1d1> 1d1: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 1d8 <tvmgen_default_fused_add+0x1d8> 1d8: eb 97 jmp 171 <tvmgen_default_fused_add+0x171> 1da: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 1e1 <tvmgen_default_fused_add+0x1e1> 1e1: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 1e8 <tvmgen_default_fused_add+0x1e8> 1e8: eb 87 jmp 171 <tvmgen_default_fused_add+0x171> 1ea: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 1f1 <tvmgen_default_fused_add+0x1f1> 1f1: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 1f8 <tvmgen_default_fused_add+0x1f8> 1f8: e9 74 ff ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 1fd: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 204 <tvmgen_default_fused_add+0x204> 204: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 20b <tvmgen_default_fused_add+0x20b> 20b: e9 61 ff ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 210: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 217 <tvmgen_default_fused_add+0x217> 217: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 21e 
<tvmgen_default_fused_add+0x21e> 21e: e9 4e ff ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 223: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 22a <tvmgen_default_fused_add+0x22a> 22a: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 231 <tvmgen_default_fused_add+0x231> 231: e9 3b ff ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 236: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 23d <tvmgen_default_fused_add+0x23d> 23d: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 244 <tvmgen_default_fused_add+0x244> 244: e9 28 ff ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 249: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 250 <tvmgen_default_fused_add+0x250> 250: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 257 <tvmgen_default_fused_add+0x257> 257: e9 15 ff ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 25c: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 263 <tvmgen_default_fused_add+0x263> 263: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 26a <tvmgen_default_fused_add+0x26a> 26a: e9 02 ff ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 26f: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 276 <tvmgen_default_fused_add+0x276> 276: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 27d <tvmgen_default_fused_add+0x27d> 27d: e9 ef fe ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 282: 48 8b 05 00 00 00 00 mov 0x0(%rip),%rax # 289 <tvmgen_default_fused_add+0x289> 289: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 290 <tvmgen_default_fused_add+0x290> 290: e9 dc fe ff ff jmpq 171 <tvmgen_default_fused_add+0x171> 295: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 29c: 00 00 00 29f: 90 nop
00000000000002a0 <tvmgen_default_fused_add_compute_>: 2a0: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 2a8 <tvmgen_default_fused_add_compute_+0x8> 2a7: 00 2a8: c5 fc 46 c8 kxnorw %k0,%k0,%k1 2ac: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 2b3: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 2b7: c5 fc 46 c8 kxnorw %k0,%k0,%k1 2bb: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 2c2: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 2ca <tvmgen_default_fused_add_compute_+0x2a> 2c9: 00 2ca: c5 fc 46 c8 kxnorw %k0,%k0,%k1 2ce: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 2d5: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 2d9: c5 fc 46 c8 kxnorw %k0,%k0,%k1 2dd: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 2e4: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 2ec <tvmgen_default_fused_add_compute_+0x4c> 2eb: 00 2ec: c5 fc 46 c8 kxnorw %k0,%k0,%k1 2f0: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 2f7: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 2fb: c5 fc 46 c8 kxnorw %k0,%k0,%k1 2ff: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 306: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 30e <tvmgen_default_fused_add_compute_+0x6e> 30d: 00 30e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 312: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 319: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 31d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 321: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 328: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 330 <tvmgen_default_fused_add_compute_+0x90> 32f: 00 330: c5 fc 46 c8 kxnorw %k0,%k0,%k1 334: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 33b: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 33f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 343: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 34a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 352 <tvmgen_default_fused_add_compute_+0xb2> 351: 00 352: c5 fc 46 c8 kxnorw %k0,%k0,%k1 356: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 35d: c5 f4 
58 c9 vaddps %ymm1,%ymm1,%ymm1 361: c5 fc 46 c8 kxnorw %k0,%k0,%k1 365: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 36c: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 374 <tvmgen_default_fused_add_compute_+0xd4> 373: 00 374: c5 fc 46 c8 kxnorw %k0,%k0,%k1 378: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 37f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 383: c5 fc 46 c8 kxnorw %k0,%k0,%k1 387: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 38e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 396 <tvmgen_default_fused_add_compute_+0xf6> 395: 00 396: c5 fc 46 c8 kxnorw %k0,%k0,%k1 39a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 3a1: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 3a5: c5 fc 46 c8 kxnorw %k0,%k0,%k1 3a9: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 3b0: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 3b8 <tvmgen_default_fused_add_compute_+0x118> 3b7: 00 3b8: c5 fc 46 c8 kxnorw %k0,%k0,%k1 3bc: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 3c3: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 3c7: c5 fc 46 c8 kxnorw %k0,%k0,%k1 3cb: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 3d2: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 3da <tvmgen_default_fused_add_compute_+0x13a> 3d9: 00 3da: c5 fc 46 c8 kxnorw %k0,%k0,%k1 3de: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 3e5: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 3e9: c5 fc 46 c8 kxnorw %k0,%k0,%k1 3ed: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 3f4: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 3fc <tvmgen_default_fused_add_compute_+0x15c> 3fb: 00 3fc: c5 fc 46 c8 kxnorw %k0,%k0,%k1 400: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 407: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 40b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 40f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 416: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 41e <tvmgen_default_fused_add_compute_+0x17e> 41d: 00 41e: c5 fc 46 c8 kxnorw 
%k0,%k0,%k1 422: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 429: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 42d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 431: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 438: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 440 <tvmgen_default_fused_add_compute_+0x1a0> 43f: 00 440: c5 fc 46 c8 kxnorw %k0,%k0,%k1 444: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 44b: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 44f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 453: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 45a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 462 <tvmgen_default_fused_add_compute_+0x1c2> 461: 00 462: c5 fc 46 c8 kxnorw %k0,%k0,%k1 466: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 46d: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 471: c5 fc 46 c8 kxnorw %k0,%k0,%k1 475: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 47c: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 484 <tvmgen_default_fused_add_compute_+0x1e4> 483: 00 484: c5 fc 46 c8 kxnorw %k0,%k0,%k1 488: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 48f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 493: c5 fc 46 c8 kxnorw %k0,%k0,%k1 497: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 49e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 4a6 <tvmgen_default_fused_add_compute_+0x206> 4a5: 00 4a6: c5 fc 46 c8 kxnorw %k0,%k0,%k1 4aa: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 4b1: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 4b5: c5 fc 46 c8 kxnorw %k0,%k0,%k1 4b9: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 4c0: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 4c8 <tvmgen_default_fused_add_compute_+0x228> 4c7: 00 4c8: c5 fc 46 c8 kxnorw %k0,%k0,%k1 4cc: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 4d3: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 4d7: c5 fc 46 c8 kxnorw %k0,%k0,%k1 4db: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 4e2: c5 fc 28 05 00 00 00 vmovaps 
0x0(%rip),%ymm0 # 4ea <tvmgen_default_fused_add_compute_+0x24a> 4e9: 00 4ea: c5 fc 46 c8 kxnorw %k0,%k0,%k1 4ee: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 4f5: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 4f9: c5 fc 46 c8 kxnorw %k0,%k0,%k1 4fd: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 504: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 50c <tvmgen_default_fused_add_compute_+0x26c> 50b: 00 50c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 510: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 517: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 51b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 51f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 526: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 52e <tvmgen_default_fused_add_compute_+0x28e> 52d: 00 52e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 532: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 539: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 53d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 541: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 548: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 550 <tvmgen_default_fused_add_compute_+0x2b0> 54f: 00 550: c5 fc 46 c8 kxnorw %k0,%k0,%k1 554: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 55b: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 55f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 563: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 56a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 572 <tvmgen_default_fused_add_compute_+0x2d2> 571: 00 572: c5 fc 46 c8 kxnorw %k0,%k0,%k1 576: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 57d: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 581: c5 fc 46 c8 kxnorw %k0,%k0,%k1 585: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 58c: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 594 <tvmgen_default_fused_add_compute_+0x2f4> 593: 00 594: c5 fc 46 c8 kxnorw %k0,%k0,%k1 598: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 59f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 5a3: c5 fc 46 c8 kxnorw %k0,%k0,%k1 5a7: 62 f2 7d 
29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 5ae: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 5b6 <tvmgen_default_fused_add_compute_+0x316> 5b5: 00 5b6: c5 fc 46 c8 kxnorw %k0,%k0,%k1 5ba: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 5c1: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 5c5: c5 fc 46 c8 kxnorw %k0,%k0,%k1 5c9: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 5d0: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 5d8 <tvmgen_default_fused_add_compute_+0x338> 5d7: 00 5d8: c5 fc 46 c8 kxnorw %k0,%k0,%k1 5dc: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 5e3: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 5e7: c5 fc 46 c8 kxnorw %k0,%k0,%k1 5eb: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 5f2: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 5fa <tvmgen_default_fused_add_compute_+0x35a> 5f9: 00 5fa: c5 fc 46 c8 kxnorw %k0,%k0,%k1 5fe: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 605: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 609: c5 fc 46 c8 kxnorw %k0,%k0,%k1 60d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 614: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 61c <tvmgen_default_fused_add_compute_+0x37c> 61b: 00 61c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 620: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 627: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 62b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 62f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 636: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 63e <tvmgen_default_fused_add_compute_+0x39e> 63d: 00 63e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 642: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 649: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 64d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 651: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 658: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 660 <tvmgen_default_fused_add_compute_+0x3c0> 65f: 00 660: c5 fc 46 c8 kxnorw %k0,%k0,%k1 664: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 66b: c5 
f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 66f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 673: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 67a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 682 <tvmgen_default_fused_add_compute_+0x3e2> 681: 00 682: c5 fc 46 c8 kxnorw %k0,%k0,%k1 686: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 68d: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 691: c5 fc 46 c8 kxnorw %k0,%k0,%k1 695: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 69c: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 6a4 <tvmgen_default_fused_add_compute_+0x404> 6a3: 00 6a4: c5 fc 46 c8 kxnorw %k0,%k0,%k1 6a8: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 6af: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 6b3: c5 fc 46 c8 kxnorw %k0,%k0,%k1 6b7: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 6be: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 6c6 <tvmgen_default_fused_add_compute_+0x426> 6c5: 00 6c6: c5 fc 46 c8 kxnorw %k0,%k0,%k1 6ca: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 6d1: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 6d5: c5 fc 46 c8 kxnorw %k0,%k0,%k1 6d9: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 6e0: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 6e8 <tvmgen_default_fused_add_compute_+0x448> 6e7: 00 6e8: c5 fc 46 c8 kxnorw %k0,%k0,%k1 6ec: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 6f3: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 6f7: c5 fc 46 c8 kxnorw %k0,%k0,%k1 6fb: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 702: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 70a <tvmgen_default_fused_add_compute_+0x46a> 709: 00 70a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 70e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 715: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 719: c5 fc 46 c8 kxnorw %k0,%k0,%k1 71d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 724: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 72c <tvmgen_default_fused_add_compute_+0x48c> 72b: 00 72c: c5 fc 46 c8 kxnorw 
%k0,%k0,%k1 730: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 737: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 73b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 73f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 746: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 74e <tvmgen_default_fused_add_compute_+0x4ae> 74d: 00 74e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 752: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 759: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 75d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 761: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 768: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 770 <tvmgen_default_fused_add_compute_+0x4d0> 76f: 00 770: c5 fc 46 c8 kxnorw %k0,%k0,%k1 774: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 77b: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 77f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 783: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 78a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 792 <tvmgen_default_fused_add_compute_+0x4f2> 791: 00 792: c5 fc 46 c8 kxnorw %k0,%k0,%k1 796: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 79d: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 7a1: c5 fc 46 c8 kxnorw %k0,%k0,%k1 7a5: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 7ac: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 7b4 <tvmgen_default_fused_add_compute_+0x514> 7b3: 00 7b4: c5 fc 46 c8 kxnorw %k0,%k0,%k1 7b8: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 7bf: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 7c3: c5 fc 46 c8 kxnorw %k0,%k0,%k1 7c7: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 7ce: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 7d6 <tvmgen_default_fused_add_compute_+0x536> 7d5: 00 7d6: c5 fc 46 c8 kxnorw %k0,%k0,%k1 7da: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 7e1: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 7e5: c5 fc 46 c8 kxnorw %k0,%k0,%k1 7e9: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 7f0: c5 fc 28 05 00 00 00 vmovaps 
0x0(%rip),%ymm0 # 7f8 <tvmgen_default_fused_add_compute_+0x558> 7f7: 00 7f8: c5 fc 46 c8 kxnorw %k0,%k0,%k1 7fc: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 803: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 807: c5 fc 46 c8 kxnorw %k0,%k0,%k1 80b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 812: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 81a <tvmgen_default_fused_add_compute_+0x57a> 819: 00 81a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 81e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 825: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 829: c5 fc 46 c8 kxnorw %k0,%k0,%k1 82d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 834: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 83c <tvmgen_default_fused_add_compute_+0x59c> 83b: 00 83c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 840: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 847: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 84b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 84f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 856: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 85e <tvmgen_default_fused_add_compute_+0x5be> 85d: 00 85e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 862: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 869: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 86d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 871: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 878: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 880 <tvmgen_default_fused_add_compute_+0x5e0> 87f: 00 880: c5 fc 46 c8 kxnorw %k0,%k0,%k1 884: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 88b: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 88f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 893: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 89a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 8a2 <tvmgen_default_fused_add_compute_+0x602> 8a1: 00 8a2: c5 fc 46 c8 kxnorw %k0,%k0,%k1 8a6: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 8ad: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 8b1: c5 fc 46 c8 kxnorw %k0,%k0,%k1 8b5: 62 f2 7d 
29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 8bc: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 8c4 <tvmgen_default_fused_add_compute_+0x624> 8c3: 00 8c4: c5 fc 46 c8 kxnorw %k0,%k0,%k1 8c8: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 8cf: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 8d3: c5 fc 46 c8 kxnorw %k0,%k0,%k1 8d7: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 8de: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 8e6 <tvmgen_default_fused_add_compute_+0x646> 8e5: 00 8e6: c5 fc 46 c8 kxnorw %k0,%k0,%k1 8ea: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 8f1: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 8f5: c5 fc 46 c8 kxnorw %k0,%k0,%k1 8f9: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 900: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 908 <tvmgen_default_fused_add_compute_+0x668> 907: 00 908: c5 fc 46 c8 kxnorw %k0,%k0,%k1 90c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 913: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 917: c5 fc 46 c8 kxnorw %k0,%k0,%k1 91b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 922: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 92a <tvmgen_default_fused_add_compute_+0x68a> 929: 00 92a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 92e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 935: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 939: c5 fc 46 c8 kxnorw %k0,%k0,%k1 93d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 944: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 94c <tvmgen_default_fused_add_compute_+0x6ac> 94b: 00 94c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 950: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 957: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 95b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 95f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 966: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 96e <tvmgen_default_fused_add_compute_+0x6ce> 96d: 00 96e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 972: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 979: c5 
f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 97d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 981: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 988: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 990 <tvmgen_default_fused_add_compute_+0x6f0> 98f: 00 990: c5 fc 46 c8 kxnorw %k0,%k0,%k1 994: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 99b: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 99f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 9a3: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 9aa: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 9b2 <tvmgen_default_fused_add_compute_+0x712> 9b1: 00 9b2: c5 fc 46 c8 kxnorw %k0,%k0,%k1 9b6: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 9bd: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 9c1: c5 fc 46 c8 kxnorw %k0,%k0,%k1 9c5: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 9cc: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 9d4 <tvmgen_default_fused_add_compute_+0x734> 9d3: 00 9d4: c5 fc 46 c8 kxnorw %k0,%k0,%k1 9d8: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 9df: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 9e3: c5 fc 46 c8 kxnorw %k0,%k0,%k1 9e7: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 9ee: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 9f6 <tvmgen_default_fused_add_compute_+0x756> 9f5: 00 9f6: c5 fc 46 c8 kxnorw %k0,%k0,%k1 9fa: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} a01: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 a05: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a09: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} a10: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # a18 <tvmgen_default_fused_add_compute_+0x778> a17: 00 a18: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a1c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} a23: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 a27: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a2b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} a32: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # a3a <tvmgen_default_fused_add_compute_+0x79a> a39: 00 a3a: c5 fc 46 c8 kxnorw 
%k0,%k0,%k1 a3e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} a45: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 a49: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a4d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} a54: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # a5c <tvmgen_default_fused_add_compute_+0x7bc> a5b: 00 a5c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a60: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} a67: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 a6b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a6f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} a76: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # a7e <tvmgen_default_fused_add_compute_+0x7de> a7d: 00 a7e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a82: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} a89: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 a8d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 a91: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} a98: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # aa0 <tvmgen_default_fused_add_compute_+0x800> a9f: 00 aa0: c5 fc 46 c8 kxnorw %k0,%k0,%k1 aa4: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} aab: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 aaf: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ab3: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} aba: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # ac2 <tvmgen_default_fused_add_compute_+0x822> ac1: 00 ac2: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ac6: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} acd: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 ad1: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ad5: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} adc: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # ae4 <tvmgen_default_fused_add_compute_+0x844> ae3: 00 ae4: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ae8: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} aef: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 af3: c5 fc 46 c8 kxnorw %k0,%k0,%k1 af7: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} afe: c5 fc 28 05 00 00 00 vmovaps 
0x0(%rip),%ymm0 # b06 <tvmgen_default_fused_add_compute_+0x866> b05: 00 b06: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b0a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} b11: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 b15: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b19: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} b20: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # b28 <tvmgen_default_fused_add_compute_+0x888> b27: 00 b28: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b2c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} b33: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 b37: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b3b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} b42: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # b4a <tvmgen_default_fused_add_compute_+0x8aa> b49: 00 b4a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b4e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} b55: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 b59: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b5d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} b64: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # b6c <tvmgen_default_fused_add_compute_+0x8cc> b6b: 00 b6c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b70: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} b77: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 b7b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b7f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} b86: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # b8e <tvmgen_default_fused_add_compute_+0x8ee> b8d: 00 b8e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 b92: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} b99: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 b9d: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ba1: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} ba8: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # bb0 <tvmgen_default_fused_add_compute_+0x910> baf: 00 bb0: c5 fc 46 c8 kxnorw %k0,%k0,%k1 bb4: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} bbb: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 bbf: c5 fc 46 c8 kxnorw %k0,%k0,%k1 bc3: 62 f2 7d 
29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} bca: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # bd2 <tvmgen_default_fused_add_compute_+0x932> bd1: 00 bd2: c5 fc 46 c8 kxnorw %k0,%k0,%k1 bd6: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} bdd: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 be1: c5 fc 46 c8 kxnorw %k0,%k0,%k1 be5: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} bec: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # bf4 <tvmgen_default_fused_add_compute_+0x954> bf3: 00 bf4: c5 fc 46 c8 kxnorw %k0,%k0,%k1 bf8: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} bff: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 c03: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c07: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} c0e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # c16 <tvmgen_default_fused_add_compute_+0x976> c15: 00 c16: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c1a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} c21: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 c25: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c29: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} c30: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # c38 <tvmgen_default_fused_add_compute_+0x998> c37: 00 c38: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c3c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} c43: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 c47: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c4b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} c52: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # c5a <tvmgen_default_fused_add_compute_+0x9ba> c59: 00 c5a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c5e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} c65: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 c69: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c6d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} c74: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # c7c <tvmgen_default_fused_add_compute_+0x9dc> c7b: 00 c7c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c80: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} c87: c5 
f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 c8b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 c8f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} c96: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # c9e <tvmgen_default_fused_add_compute_+0x9fe> c9d: 00 c9e: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ca2: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} ca9: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 cad: c5 fc 46 c8 kxnorw %k0,%k0,%k1 cb1: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} cb8: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # cc0 <tvmgen_default_fused_add_compute_+0xa20> cbf: 00 cc0: c5 fc 46 c8 kxnorw %k0,%k0,%k1 cc4: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} ccb: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 ccf: c5 fc 46 c8 kxnorw %k0,%k0,%k1 cd3: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} cda: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # ce2 <tvmgen_default_fused_add_compute_+0xa42> ce1: 00 ce2: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ce6: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} ced: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 cf1: c5 fc 46 c8 kxnorw %k0,%k0,%k1 cf5: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} cfc: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # d04 <tvmgen_default_fused_add_compute_+0xa64> d03: 00 d04: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d08: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} d0f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 d13: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d17: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} d1e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # d26 <tvmgen_default_fused_add_compute_+0xa86> d25: 00 d26: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d2a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} d31: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 d35: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d39: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} d40: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # d48 <tvmgen_default_fused_add_compute_+0xaa8> d47: 00 d48: c5 fc 46 c8 kxnorw 
%k0,%k0,%k1 d4c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} d53: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 d57: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d5b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} d62: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # d6a <tvmgen_default_fused_add_compute_+0xaca> d69: 00 d6a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d6e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} d75: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 d79: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d7d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} d84: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # d8c <tvmgen_default_fused_add_compute_+0xaec> d8b: 00 d8c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d90: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} d97: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 d9b: c5 fc 46 c8 kxnorw %k0,%k0,%k1 d9f: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} da6: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # dae <tvmgen_default_fused_add_compute_+0xb0e> dad: 00 dae: c5 fc 46 c8 kxnorw %k0,%k0,%k1 db2: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} db9: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 dbd: c5 fc 46 c8 kxnorw %k0,%k0,%k1 dc1: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} dc8: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # dd0 <tvmgen_default_fused_add_compute_+0xb30> dcf: 00 dd0: c5 fc 46 c8 kxnorw %k0,%k0,%k1 dd4: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} ddb: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 ddf: c5 fc 46 c8 kxnorw %k0,%k0,%k1 de3: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} dea: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # df2 <tvmgen_default_fused_add_compute_+0xb52> df1: 00 df2: c5 fc 46 c8 kxnorw %k0,%k0,%k1 df6: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} dfd: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 e01: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e05: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} e0c: c5 fc 28 05 00 00 00 vmovaps 
0x0(%rip),%ymm0 # e14 <tvmgen_default_fused_add_compute_+0xb74> e13: 00 e14: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e18: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} e1f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 e23: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e27: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} e2e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # e36 <tvmgen_default_fused_add_compute_+0xb96> e35: 00 e36: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e3a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} e41: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 e45: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e49: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} e50: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # e58 <tvmgen_default_fused_add_compute_+0xbb8> e57: 00 e58: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e5c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} e63: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 e67: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e6b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} e72: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # e7a <tvmgen_default_fused_add_compute_+0xbda> e79: 00 e7a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e7e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} e85: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 e89: c5 fc 46 c8 kxnorw %k0,%k0,%k1 e8d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} e94: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # e9c <tvmgen_default_fused_add_compute_+0xbfc> e9b: 00 e9c: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ea0: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} ea7: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 eab: c5 fc 46 c8 kxnorw %k0,%k0,%k1 eaf: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} eb6: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # ebe <tvmgen_default_fused_add_compute_+0xc1e> ebd: 00 ebe: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ec2: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} ec9: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 ecd: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ed1: 62 f2 7d 
29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} ed8: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # ee0 <tvmgen_default_fused_add_compute_+0xc40> edf: 00 ee0: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ee4: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} eeb: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 eef: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ef3: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} efa: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # f02 <tvmgen_default_fused_add_compute_+0xc62> f01: 00 f02: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f06: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} f0d: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 f11: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f15: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} f1c: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # f24 <tvmgen_default_fused_add_compute_+0xc84> f23: 00 f24: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f28: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} f2f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 f33: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f37: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} f3e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # f46 <tvmgen_default_fused_add_compute_+0xca6> f45: 00 f46: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f4a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} f51: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 f55: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f59: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} f60: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # f68 <tvmgen_default_fused_add_compute_+0xcc8> f67: 00 f68: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f6c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} f73: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 f77: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f7b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} f82: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # f8a <tvmgen_default_fused_add_compute_+0xcea> f89: 00 f8a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f8e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} f95: c5 
f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 f99: c5 fc 46 c8 kxnorw %k0,%k0,%k1 f9d: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} fa4: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # fac <tvmgen_default_fused_add_compute_+0xd0c> fab: 00 fac: c5 fc 46 c8 kxnorw %k0,%k0,%k1 fb0: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} fb7: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 fbb: c5 fc 46 c8 kxnorw %k0,%k0,%k1 fbf: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} fc6: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # fce <tvmgen_default_fused_add_compute_+0xd2e> fcd: 00 fce: c5 fc 46 c8 kxnorw %k0,%k0,%k1 fd2: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} fd9: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 fdd: c5 fc 46 c8 kxnorw %k0,%k0,%k1 fe1: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} fe8: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # ff0 <tvmgen_default_fused_add_compute_+0xd50> fef: 00 ff0: c5 fc 46 c8 kxnorw %k0,%k0,%k1 ff4: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} ffb: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 fff: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1003: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 100a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1012 <tvmgen_default_fused_add_compute_+0xd72> 1011: 00 1012: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1016: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 101d: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 1021: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1025: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 102c: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1034 <tvmgen_default_fused_add_compute_+0xd94> 1033: 00 1034: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1038: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 103f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 1043: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1047: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 104e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1056 <tvmgen_default_fused_add_compute_+0xdb6> 1055: 00 
1056: c5 fc 46 c8 kxnorw %k0,%k0,%k1 105a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 1061: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 1065: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1069: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 1070: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1078 <tvmgen_default_fused_add_compute_+0xdd8> 1077: 00 1078: c5 fc 46 c8 kxnorw %k0,%k0,%k1 107c: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 1083: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 1087: c5 fc 46 c8 kxnorw %k0,%k0,%k1 108b: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 1092: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 109a <tvmgen_default_fused_add_compute_+0xdfa> 1099: 00 109a: c5 fc 46 c8 kxnorw %k0,%k0,%k1 109e: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 10a5: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 10a9: c5 fc 46 c8 kxnorw %k0,%k0,%k1 10ad: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 10b4: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 10bc <tvmgen_default_fused_add_compute_+0xe1c> 10bb: 00 10bc: c5 fc 46 c8 kxnorw %k0,%k0,%k1 10c0: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 10c7: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 10cb: c5 fc 46 c8 kxnorw %k0,%k0,%k1 10cf: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 10d6: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 10de <tvmgen_default_fused_add_compute_+0xe3e> 10dd: 00 10de: c5 fc 46 c8 kxnorw %k0,%k0,%k1 10e2: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 10e9: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 10ed: c5 fc 46 c8 kxnorw %k0,%k0,%k1 10f1: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 10f8: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1100 <tvmgen_default_fused_add_compute_+0xe60> 10ff: 00 1100: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1104: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 110b: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 110f: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1113: 62 f2 7d 29 a2 0c 87 vscatterdps 
%ymm1,(%rdi,%ymm0,4){%k1} 111a: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1122 <tvmgen_default_fused_add_compute_+0xe82> 1121: 00 1122: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1126: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 112d: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 1131: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1135: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 113c: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1144 <tvmgen_default_fused_add_compute_+0xea4> 1143: 00 1144: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1148: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 114f: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 1153: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1157: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 115e: c5 fc 28 05 00 00 00 vmovaps 0x0(%rip),%ymm0 # 1166 <tvmgen_default_fused_add_compute_+0xec6> 1165: 00 1166: c5 fc 46 c8 kxnorw %k0,%k0,%k1 116a: 62 f2 7d 29 92 0c 86 vgatherdps (%rsi,%ymm0,4),%ymm1{%k1} 1171: c5 fc 46 c8 kxnorw %k0,%k0,%k1 1175: c5 f4 58 c9 vaddps %ymm1,%ymm1,%ymm1 1179: 62 f2 7d 29 a2 0c 87 vscatterdps %ymm1,(%rdi,%ymm0,4){%k1} 1180: c5 fc 28 86 00 0e 00 vmovaps 0xe00(%rsi),%ymm0 1187: 00 1188: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 118c: c5 fc 29 87 00 0e 00 vmovaps %ymm0,0xe00(%rdi) 1193: 00 1194: c5 fa 10 86 20 0e 00 vmovss 0xe20(%rsi),%xmm0 119b: 00 119c: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 11a0: c5 fa 11 87 20 0e 00 vmovss %xmm0,0xe20(%rdi) 11a7: 00 11a8: c5 fa 10 86 24 0e 00 vmovss 0xe24(%rsi),%xmm0 11af: 00 11b0: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 11b4: c5 fa 11 87 24 0e 00 vmovss %xmm0,0xe24(%rdi) 11bb: 00 11bc: c5 fa 10 86 28 0e 00 vmovss 0xe28(%rsi),%xmm0 11c3: 00 11c4: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 11c8: c5 fa 11 87 28 0e 00 vmovss %xmm0,0xe28(%rdi) 11cf: 00 11d0: c5 fa 10 86 2c 0e 00 vmovss 0xe2c(%rsi),%xmm0 11d7: 00 11d8: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 11dc: c5 fa 11 87 2c 0e 00 vmovss %xmm0,0xe2c(%rdi) 11e3: 00 11e4: c5 fa 10 86 30 0e 00 vmovss 0xe30(%rsi),%xmm0 11eb: 00 11ec: c5 fa 58 c0 
vaddss %xmm0,%xmm0,%xmm0 11f0: c5 fa 11 87 30 0e 00 vmovss %xmm0,0xe30(%rdi) 11f7: 00 11f8: c5 fa 10 86 34 0e 00 vmovss 0xe34(%rsi),%xmm0 11ff: 00 1200: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1204: c5 fa 11 87 34 0e 00 vmovss %xmm0,0xe34(%rdi) 120b: 00 120c: c5 fa 10 86 38 0e 00 vmovss 0xe38(%rsi),%xmm0 1213: 00 1214: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1218: c5 fa 11 87 38 0e 00 vmovss %xmm0,0xe38(%rdi) 121f: 00 1220: c5 fa 10 86 3c 0e 00 vmovss 0xe3c(%rsi),%xmm0 1227: 00 1228: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 122c: c5 fa 11 87 3c 0e 00 vmovss %xmm0,0xe3c(%rdi) 1233: 00 1234: c5 fc 28 86 40 0e 00 vmovaps 0xe40(%rsi),%ymm0 123b: 00 123c: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 1240: c5 fc 29 87 40 0e 00 vmovaps %ymm0,0xe40(%rdi) 1247: 00 1248: c5 fa 10 86 60 0e 00 vmovss 0xe60(%rsi),%xmm0 124f: 00 1250: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1254: c5 fa 11 87 60 0e 00 vmovss %xmm0,0xe60(%rdi) 125b: 00 125c: c5 fa 10 86 64 0e 00 vmovss 0xe64(%rsi),%xmm0 1263: 00 1264: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1268: c5 fa 11 87 64 0e 00 vmovss %xmm0,0xe64(%rdi) 126f: 00 1270: c5 fa 10 86 68 0e 00 vmovss 0xe68(%rsi),%xmm0 1277: 00 1278: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 127c: c5 fa 11 87 68 0e 00 vmovss %xmm0,0xe68(%rdi) 1283: 00 1284: c5 fa 10 86 6c 0e 00 vmovss 0xe6c(%rsi),%xmm0 128b: 00 128c: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1290: c5 fa 11 87 6c 0e 00 vmovss %xmm0,0xe6c(%rdi) 1297: 00 1298: c5 fa 10 86 70 0e 00 vmovss 0xe70(%rsi),%xmm0 129f: 00 12a0: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 12a4: c5 fa 11 87 70 0e 00 vmovss %xmm0,0xe70(%rdi) 12ab: 00 12ac: c5 fa 10 86 74 0e 00 vmovss 0xe74(%rsi),%xmm0 12b3: 00 12b4: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 12b8: c5 fa 11 87 74 0e 00 vmovss %xmm0,0xe74(%rdi) 12bf: 00 12c0: c5 fa 10 86 78 0e 00 vmovss 0xe78(%rsi),%xmm0 12c7: 00 12c8: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 12cc: c5 fa 11 87 78 0e 00 vmovss %xmm0,0xe78(%rdi) 12d3: 00 12d4: c5 fa 10 86 7c 0e 00 vmovss 0xe7c(%rsi),%xmm0 12db: 00 12dc: c5 fa 58 c0 vaddss 
%xmm0,%xmm0,%xmm0 12e0: c5 fa 11 87 7c 0e 00 vmovss %xmm0,0xe7c(%rdi) 12e7: 00 12e8: c5 fc 28 86 80 0e 00 vmovaps 0xe80(%rsi),%ymm0 12ef: 00 12f0: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 12f4: c5 fc 29 87 80 0e 00 vmovaps %ymm0,0xe80(%rdi) 12fb: 00 12fc: c5 fa 10 86 a0 0e 00 vmovss 0xea0(%rsi),%xmm0 1303: 00 1304: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1308: c5 fa 11 87 a0 0e 00 vmovss %xmm0,0xea0(%rdi) 130f: 00 1310: c5 fa 10 86 a4 0e 00 vmovss 0xea4(%rsi),%xmm0 1317: 00 1318: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 131c: c5 fa 11 87 a4 0e 00 vmovss %xmm0,0xea4(%rdi) 1323: 00 1324: c5 fa 10 86 a8 0e 00 vmovss 0xea8(%rsi),%xmm0 132b: 00 132c: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1330: c5 fa 11 87 a8 0e 00 vmovss %xmm0,0xea8(%rdi) 1337: 00 1338: c5 fa 10 86 ac 0e 00 vmovss 0xeac(%rsi),%xmm0 133f: 00 1340: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1344: c5 fa 11 87 ac 0e 00 vmovss %xmm0,0xeac(%rdi) 134b: 00 134c: c5 fa 10 86 b0 0e 00 vmovss 0xeb0(%rsi),%xmm0 1353: 00 1354: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1358: c5 fa 11 87 b0 0e 00 vmovss %xmm0,0xeb0(%rdi) 135f: 00 1360: c5 fa 10 86 b4 0e 00 vmovss 0xeb4(%rsi),%xmm0 1367: 00 1368: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 136c: c5 fa 11 87 b4 0e 00 vmovss %xmm0,0xeb4(%rdi) 1373: 00 1374: c5 fa 10 86 b8 0e 00 vmovss 0xeb8(%rsi),%xmm0 137b: 00 137c: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1380: c5 fa 11 87 b8 0e 00 vmovss %xmm0,0xeb8(%rdi) 1387: 00 1388: c5 fa 10 86 bc 0e 00 vmovss 0xebc(%rsi),%xmm0 138f: 00 1390: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1394: c5 fa 11 87 bc 0e 00 vmovss %xmm0,0xebc(%rdi) 139b: 00 139c: c5 fc 28 86 c0 0e 00 vmovaps 0xec0(%rsi),%ymm0 13a3: 00 13a4: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 13a8: c5 fc 29 87 c0 0e 00 vmovaps %ymm0,0xec0(%rdi) 13af: 00 13b0: c5 fa 10 86 e0 0e 00 vmovss 0xee0(%rsi),%xmm0 13b7: 00 13b8: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 13bc: c5 fa 11 87 e0 0e 00 vmovss %xmm0,0xee0(%rdi) 13c3: 00 13c4: c5 fa 10 86 e4 0e 00 vmovss 0xee4(%rsi),%xmm0 13cb: 00 13cc: c5 fa 58 c0 vaddss 
%xmm0,%xmm0,%xmm0 13d0: c5 fa 11 87 e4 0e 00 vmovss %xmm0,0xee4(%rdi) 13d7: 00 13d8: c5 fa 10 86 e8 0e 00 vmovss 0xee8(%rsi),%xmm0 13df: 00 13e0: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 13e4: c5 fa 11 87 e8 0e 00 vmovss %xmm0,0xee8(%rdi) 13eb: 00 13ec: c5 fa 10 86 ec 0e 00 vmovss 0xeec(%rsi),%xmm0 13f3: 00 13f4: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 13f8: c5 fa 11 87 ec 0e 00 vmovss %xmm0,0xeec(%rdi) 13ff: 00 1400: c5 fa 10 86 f0 0e 00 vmovss 0xef0(%rsi),%xmm0 1407: 00 1408: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 140c: c5 fa 11 87 f0 0e 00 vmovss %xmm0,0xef0(%rdi) 1413: 00 1414: c5 fa 10 86 f4 0e 00 vmovss 0xef4(%rsi),%xmm0 141b: 00 141c: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1420: c5 fa 11 87 f4 0e 00 vmovss %xmm0,0xef4(%rdi) 1427: 00 1428: c5 fa 10 86 f8 0e 00 vmovss 0xef8(%rsi),%xmm0 142f: 00 1430: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1434: c5 fa 11 87 f8 0e 00 vmovss %xmm0,0xef8(%rdi) 143b: 00 143c: c5 fa 10 86 fc 0e 00 vmovss 0xefc(%rsi),%xmm0 1443: 00 1444: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1448: c5 fa 11 87 fc 0e 00 vmovss %xmm0,0xefc(%rdi) 144f: 00 1450: c5 fc 28 86 00 0f 00 vmovaps 0xf00(%rsi),%ymm0 1457: 00 1458: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 145c: c5 fc 29 87 00 0f 00 vmovaps %ymm0,0xf00(%rdi) 1463: 00 1464: c5 fa 10 86 20 0f 00 vmovss 0xf20(%rsi),%xmm0 146b: 00 146c: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1470: c5 fa 11 87 20 0f 00 vmovss %xmm0,0xf20(%rdi) 1477: 00 1478: c5 fa 10 86 24 0f 00 vmovss 0xf24(%rsi),%xmm0 147f: 00 1480: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1484: c5 fa 11 87 24 0f 00 vmovss %xmm0,0xf24(%rdi) 148b: 00 148c: c5 fa 10 86 28 0f 00 vmovss 0xf28(%rsi),%xmm0 1493: 00 1494: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1498: c5 fa 11 87 28 0f 00 vmovss %xmm0,0xf28(%rdi) 149f: 00 14a0: c5 fa 10 86 2c 0f 00 vmovss 0xf2c(%rsi),%xmm0 14a7: 00 14a8: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 14ac: c5 fa 11 87 2c 0f 00 vmovss %xmm0,0xf2c(%rdi) 14b3: 00 14b4: c5 fa 10 86 30 0f 00 vmovss 0xf30(%rsi),%xmm0 14bb: 00 14bc: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 
14c0: c5 fa 11 87 30 0f 00 vmovss %xmm0,0xf30(%rdi) 14c7: 00 14c8: c5 fa 10 86 34 0f 00 vmovss 0xf34(%rsi),%xmm0 14cf: 00 14d0: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 14d4: c5 fa 11 87 34 0f 00 vmovss %xmm0,0xf34(%rdi) 14db: 00 14dc: c5 fa 10 86 38 0f 00 vmovss 0xf38(%rsi),%xmm0 14e3: 00 14e4: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 14e8: c5 fa 11 87 38 0f 00 vmovss %xmm0,0xf38(%rdi) 14ef: 00 14f0: c5 fa 10 86 3c 0f 00 vmovss 0xf3c(%rsi),%xmm0 14f7: 00 14f8: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 14fc: c5 fa 11 87 3c 0f 00 vmovss %xmm0,0xf3c(%rdi) 1503: 00 1504: c5 fc 28 86 40 0f 00 vmovaps 0xf40(%rsi),%ymm0 150b: 00 150c: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 1510: c5 fc 29 87 40 0f 00 vmovaps %ymm0,0xf40(%rdi) 1517: 00 1518: c5 fa 10 86 60 0f 00 vmovss 0xf60(%rsi),%xmm0 151f: 00 1520: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1524: c5 fa 11 87 60 0f 00 vmovss %xmm0,0xf60(%rdi) 152b: 00 152c: c5 fa 10 86 64 0f 00 vmovss 0xf64(%rsi),%xmm0 1533: 00 1534: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1538: c5 fa 11 87 64 0f 00 vmovss %xmm0,0xf64(%rdi) 153f: 00 1540: c5 fa 10 86 68 0f 00 vmovss 0xf68(%rsi),%xmm0 1547: 00 1548: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 154c: c5 fa 11 87 68 0f 00 vmovss %xmm0,0xf68(%rdi) 1553: 00 1554: c5 fa 10 86 6c 0f 00 vmovss 0xf6c(%rsi),%xmm0 155b: 00 155c: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1560: c5 fa 11 87 6c 0f 00 vmovss %xmm0,0xf6c(%rdi) 1567: 00 1568: c5 fa 10 86 70 0f 00 vmovss 0xf70(%rsi),%xmm0 156f: 00 1570: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1574: c5 fa 11 87 70 0f 00 vmovss %xmm0,0xf70(%rdi) 157b: 00 157c: c5 fa 10 86 74 0f 00 vmovss 0xf74(%rsi),%xmm0 1583: 00 1584: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 1588: c5 fa 11 87 74 0f 00 vmovss %xmm0,0xf74(%rdi) 158f: 00 1590: c5 fa 10 86 78 0f 00 vmovss 0xf78(%rsi),%xmm0 1597: 00 1598: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 159c: c5 fa 11 87 78 0f 00 vmovss %xmm0,0xf78(%rdi) 15a3: 00 15a4: c5 fa 10 86 7c 0f 00 vmovss 0xf7c(%rsi),%xmm0 15ab: 00 15ac: c5 fa 58 c0 vaddss %xmm0,%xmm0,%xmm0 15b0: c5 fa 11 87 
7c 0f 00 vmovss %xmm0,0xf7c(%rdi) 15b7: 00 15b8: c5 fc 28 86 80 0f 00 vmovaps 0xf80(%rsi),%ymm0 15bf: 00 15c0: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 15c4: c5 fc 29 87 80 0f 00 vmovaps %ymm0,0xf80(%rdi) 15cb: 00 15cc: c5 f8 77 vzeroupper 15cf: c3 retq
1.1.4. OpenCL Target
with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, target="opencl", params=None, target_host= "llvm") print(lib.imported_modules[0].get_source()) lib.export_library("/tmp/a.elf")
// Function: tvmgen_default_fused_add_kernel0 __kernel void tvmgen_default_fused_add_kernel0(__global float* restrict T_add, __global float* restrict placeholder) { if (((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) < 1000) { T_add[(((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))))] = (placeholder[(((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))))] + placeholder[(((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))))]); } }
readelf -p .rodata /tmp/a.elf|grep Function
[ ab7] // Function: tvmgen_default_fused_add_kernel0^J__kernel void tvmgen_default_fused_add_kernel0(__global float* restrict T_add, __global float* restrict placeholder) {^J if (((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))) < 1000) {^J T_add[(((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))))] = (placeholder[(((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))))] + placeholder[(((((int)get_group_id(0)) * 256) + ((int)get_local_id(0))))]);^J }^J}^J^J^L
1.2. 编译一个 onnx 模型
import onnx import numpy as np import tvm from tvm import te import tvm.relay as relay from tvm.contrib.download import download_testdata onnx_model = onnx.load( download_testdata( "https://gist.github.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/93672b029103648953c4e5ad3ac3aadf346a4cdc/super_resolution_0.2.onnx", "super_resolution.onnx", module="onnx", ) ) shape_dict = {"1": (1, 1, 224, 224)} mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, target="llvm", params=params) # graph 是保存图的结构 graph_json_path = "/tmp/a.json" with open(graph_json_path, "w") as f: f.write(graph) # lib 是图上 operator 对应的实现 param_path = "/tmp/a.params" with open(param_path, "wb") as f: f.write(relay.save_param_dict(params)) # params 是图的参数 lib.export_library("/tmp/liba.so")
/tmp/ipykernel_30923/1147443566.py:21: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage. graph, lib, params = relay.build(mod, target="llvm", params=params)
echo "------function implemented in lib" objdump -d /tmp/liba.so |grep " <tvmgen.*:"|grep -v compute echo "------function used in graph" grep "func_name" /tmp/a.json
------function implemented in lib 0000000000001110 <tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu>: 0000000000005360 <tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add>: 0000000000007230 <tvmgen_default_fused_layout_transform_2>: 00000000000076e0 <tvmgen_default_fused_layout_transform_reshape_transpose_reshape>: 0000000000008330 <tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_2>: 000000000000c4b0 <tvmgen_default_fused_layout_transform>: 000000000000cbe0 <tvmgen_default_fused_layout_transform_3>: 000000000000d080 <tvmgen_default_fused_layout_transform_1>: 000000000000d6a0 <tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1>: ------function used in graph "func_name": "tvmgen_default_fused_layout_transform", "func_name": "tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu" "func_name": "tvmgen_default_fused_layout_transform_1", "func_name": "tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1" "func_name": "tvmgen_default_fused_layout_transform_2", "func_name": "tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_2" "func_name": "tvmgen_default_fused_layout_transform_3", "func_name": "tvmgen_default_fused_nn_contrib_conv2d_NCHWc_add", "func_name": "tvmgen_default_fused_layout_transform_reshape_transpose_reshape",
所以, tvm 的 frontend 并不是把 nn 网络作为整体编译成单个 function, 而是以 operator 为单位编译成多个 function, 在设备上运行时需要借助 tvm runtime 载入 graph, params 后才能运行
1.3. Relay IR 列表
1.3.1. tvm.relay
1.3.2. tvm.relay.annotation
通过 relay.annotation.on_device, 可以指定 relay ir 在哪个 device 上执行, 而不再由 relay.build 时的 target 指定, 例如:
ir1 = relay.annotation.on_device(ir, "cpu") ir2 = relay.annotation.on_device(ir, "dpu") # 此时 relay.build 的 target 需要是一个 mapping 而不是单独一个 target relay.build(xxx, target = {"cpu": "llvm", "dpu": "ext_dev -keys=vta,cpu -device=vta -model=sim_1x16_i8w8a32_15_15_18_17"})
1.3.3. tvm.relay.nn
1.3.4. tvm.relay.image
1.3.5. tvm.relay.vision
1.4. Relay IR Transform
1.5. Relay IR Frontend
1.6. Add Relay OP
tvm/docs/dev/relay_add_op.rst
https://github.com/sunwayforever/tvm/tree/my_add
TVM 中 Relay OP 注册的核心机制是 OpRegistry, 无论是 C++ 的 TVM_REGISTER_OP 宏还是 Python 的 register_compute, 都是通过 OpRegistry 完成注册的. 通过 OpRegistry, C++ 和 Python 可以透明地读写 Op 的属性:
- type relation
- compute
- schedule
- …
1.6.1. Registry
Relay OP 转换为 tvm::relay::Call
以 relay.add() 为例, 它最终会返回 Call(Op::Get("add"), {lhs, rhs}, Attrs(), {}) 这个 expr, build 时会通过 OpRegistry 查找 `add` op 对应的信息, 例如 compute, schedule 等
y = relay.add(x, x) return _make.add(lhs, rhs) # _make.add 最终会返回 Call(Op::Get("add"), {lhs, rhs}, Attrs(), {}) Call(Op::Get("add"), {lhs, rhs}, Attrs(), {}) # 关于 _make 模块 tvm._ffi._init_api("relay.op._make", __name__) _init_api_prefix(target_module_name, namespace) for name in list_global_func_names(): # 找到所有的 relay.op._make.xxx 函数 if not name.startswith(prefix): continue fname = name[len(prefix) + 1 :] target_module = module f = get_global_func(name) ff = _get_api(f) ff.__name__ = fname ff.__doc__ = "TVM PackedFunc %s. " % fname # 这里导致 .make.{sort, add...} 可以访问了 setattr(target_module, fname, ff) # relay.op._make.add 不是直接定义的, 而是通过 RELAY_REGISTER_BINARY_OP 定义的 #define RELAY_REGISTER_BINARY_OP(OpName) \ TVM_REGISTER_GLOBAL("relay.op._make." OpName).set_body_typed([](Expr lhs, Expr rhs) { \ static const Op& op = Op::Get(OpName); \ return Call(op, {lhs, rhs}, Attrs(), {}); \ }); \ RELAY_REGISTER_OP(OpName) \ .set_num_inputs(2) \ .add_argument("lhs", "Tensor", "The left hand side tensor.") \ .add_argument("rhs", "Tensor", "The right hand side tensor.") \ .add_type_rel("Broadcast", BroadcastRel) \ .set_attr<TOpPattern>("TOpPattern", kBroadcast) \ .set_attr<TOpIsStateful>("TOpIsStateful", false) \ .set_attr<FInferCorrectLayout>("FInferCorrectLayout", BinaryBroadcastLayout) # 这里导致 relay.op._make.add() 会返回 Call(Op::Get("add"), {lhs, rhs}, Attrs(), # {}), 另外, add 的 compute 是在 c++ 里注册的, 而不是通过 python 的 # register_compute RELAY_REGISTER_BINARY_OP("add") .describe("Elementwise add with broadcasting") .set_support_level(1) .set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::add));
1.6.2. TypeRelation
TypeRelation 的目的是为了支持 TVM 的 InferType 功能
RELAY_REGISTER_BINARY_OP 时通过 add_type_rel 指定 op 的 TypeRelation
以 Conv2D 为例:
RELAY_REGISTER_OP("nn.conv2d") .add_type_rel("Conv2D", Conv2DRel<Conv2DAttrs>) template <typename AttrType> bool Conv2DRel( const Array<Type>& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // ... Array<IndexExpr> oshape({dshape_nchw[0], channels, 0, 0}); IndexExpr pad_h, pad_w; GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); // ((I+2P-K)/S) + 1 oshape.Set(2,indexdiv(dshape_nchw[2] + pad_h - dilated_ksize_y, param->strides[0]) +1); oshape.Set(3,indexdiv(dshape_nchw[3] + pad_w - dilated_ksize_x, param->strides[1]) +1); oshape = trans_out_layout.BackwardShape(oshape); // assign output type reporter->Assign(types[2], TensorType(oshape, out_dtype)); return true; // ... }
1.6.3. Relay Operator Strategy
tvm/docs/dev/relay_op_strategy.rst
relay.build 的第一步是把 Relay IR 转换为 TE, 例如:
relay.exp 会转换成
te.compute(x.shape, lambda *i: te.exp(x(*i)))
relay.log_softmax 会转换成
m, n = x.shape k = te.reduce_axis((0, n), name="k") max_elem = te.compute((m,), lambda i: tvm.te.max(x[i, k], axis=k)) k = te.reduce_axis((0, n), name="k") expsum = te.compute((m,), lambda i: te.sum(te.exp(x[i, k] - max_elem[i]), axis=k)) return te.compute(x.shape, lambda i, j: x[i, j] - max_elem[i] - te.log(expsum[i]))
这些具体的 TE 实现是在 TOPI (TVM Operator Inventory) 中定义的, TVM 通过 relay operator strategy 把 operator 与 TOPI 中定义的 `Compute` 和 `Schedule` 关联起来.
tvm/docs/dev/relay_op_strategy.rst
1.6.3.1. 定义 strategy
tvm/python/tvm/relay/op/strategy/generic.py::@override_native_generic_func("softmax_strategy")
strategy 包含两个数据:
- compute, 例如 topi.nn.softmax
- schedule, 例如 topi.generic.schedule_softmax
# Quoted from tvm/python/tvm/relay/op/strategy/generic.py.
# The decorator registers this function under the name "softmax_strategy";
# versions for specific targets can then be attached with
# `softmax_strategy.register(...)`.
@override_native_generic_func("softmax_strategy")
def softmax_strategy(attrs, inputs, out_type, target):
    """softmax generic strategy"""
    # None of the four arguments is read in this body: the same single
    # implementation is returned regardless of attrs/inputs/out_type/target.
    strategy = _op.OpStrategy()
    # One implementation = one (compute, schedule) pair plus a name.
    strategy.add_implementation(
        wrap_compute_softmax(topi.nn.softmax),  # compute
        wrap_topi_schedule(topi.generic.schedule_softmax),  # schedule
        name="softmax.generic",
    )
    return strategy
通过 override_native_generic_func, softmax_strategy 变成一个 generic_func, 这个 generic_func 可以通过 xxx.register 添加另一个适用于不同 target 的版本
# Attach a version of softmax_strategy under the "cpu" key.
@softmax_strategy.register("cpu")
def softmax_strategy_cpu(attrs, inputs, out_type, target):
    """softmax x86 strategy"""
    # None of the four arguments is read in this body.
    strategy = _op.OpStrategy()
    # Same compute as the generic strategy (topi.nn.softmax); only the
    # schedule (topi.x86.schedule_softmax) and the implementation name differ.
    strategy.add_implementation(
        wrap_compute_softmax(topi.nn.softmax),  # compute
        wrap_topi_schedule(topi.x86.schedule_softmax),  # schedule
        name="softmax.x86",
    )
    return strategy
可见当 target 为 cpu 时, 针对 softmax 会使用不同的 schedule
1.6.3.2. 绑定 strategy 与 op
tvm/python/tvm/relay/op/nn/_nn.py
reg.register_strategy("nn.softmax", strategy.softmax_strategy) tvm.ir.register_op_attr(op_name, "FTVMStrategy", fstrategy, level)
1.6.3.3. relay.build 时查找 strategy
tvm/python/tvm/relay/backend/compile_engine.py::def lower_call(call, inputs, target):
def lower_call(call, inputs, target): best_impl, outputs = select_implementation( op, call.attrs, inputs, ret_type, target, use_autotvm=False ) def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) for impl in all_impls: outs = impl.compute(attrs, inputs, out_type) # ... def get_valid_implementations(op, attrs, inputs, out_type, target): fstrategy = op.get_attr("FTVMStrategy") # ...
1.7. Let Expression
Let Expression 可以明确地表示变量的作用域, 更有利于编译器对代码的分析. 例如:
// 不使用 let 的代码 a=1 b=2 c=a+b # 此处 a, b 是否还是 live? // 使用 let 的代码 let (a=1, b=2) { c=a+b; c } # 这里明确知道 a,b 不再 live
TVM 支持 let 形式的 Relay 表达式, 但目前看来并不能正常工作…
#!/usr/bin/env python3 # -*- coding: utf-8 -*- # 2021-10-11 11:00 import tvm from tvm import relay import numpy as np def get_demo_mod(): a = relay.var("a", shape=(1, 10), dtype="float32") b = relay.var("b", shape=(1, 10), dtype="float32") c = relay.var("c", shape=(1, 10), dtype="float32") cond = relay.var("cond", shape=(), dtype="bool") sb = relay.ScopeBuilder() out1 = sb.let("out1", relay.add(a, b)) out2 = sb.let("out2", relay.add(out1, c)) sb.ret(out2) sb2 = relay.ScopeBuilder() out = sb2.let("out", relay.multiply(sb.get(), c)) with sb2.if_scope(cond): sb2.ret(c) with sb2.else_scope(): sb2.ret(out) func = relay.Function([a, b, c, cond], sb2.get()) mod = tvm.IRModule.from_expr(func) mod = relay.transform.InferType()(mod) return mod mod = get_demo_mod() print(mod)
def @main(%a: Tensor[(1, 10), float32], %b: Tensor[(1, 10), float32], %c: Tensor[(1, 10), float32], %cond: bool) -> Tensor[(1, 10), float32] { %0 = ( let %out1: Tensor[(1, 10), float32] = add(%a, %b) /* ty=Tensor[(1, 10), float32] */; let %out2: Tensor[(1, 10), float32] = add(%out1, %c) /* ty=Tensor[(1, 10), float32] */; %out2 ); let %out: Tensor[(1, 10), float32] = multiply(%0, %c) /* ty=Tensor[(1, 10), float32] */; if (%cond) { %c } else { %out } }