OpenMP
Table of Contents
1. OpenMP
1.1. Overview
OpenMP 主要包含两部分内容:
compiler directive
openmp 的 api 主要是 compiler directive 的形式, 例如 `#pragma omp parallel`, 编译器需要:
- 识别这些 directive
- 生成 parallel block 对应的匿名函数
- 调用 libgomp 中相应 api
libgomp
libgomp 中有少量函数例如 `omp_get_thread_num` 可以由用户代码直接调用, 其它的大部分的调用都是由编译器产生的, 例如 GOMP_parallel, GOMP_critical_start, GOMP_loop_nonmonotonic_dynamic_next 等
1.3. libgomp
1.3.1. parallel
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { int id = omp_get_thread_num(); printf("hello world %d\n", id); } printf("------\n"); }
$> riscv64-linux-gnu-gcc test.c -O0 -g -fopenmp
$> readelf -a ./a.out|grep NEEDED
0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1]
0x0000000000000001 (NEEDED) Shared library: [libc.so.6]
$> riscv64-linux-gnu-objdump -d -r ./a.out
000000000000076a <main>:
76a: 1101 addi sp,sp,-32
76c: ec06 sd ra,24(sp)
76e: e822 sd s0,16(sp)
770: 1000 addi s0,sp,32
772: 87aa mv a5,a0
774: feb43023 sd a1,-32(s0)
778: fef42623 sw a5,-20(s0)
77c: 4681 li a3,0
77e: 4609 li a2,2
~~~~~~~~
780: 4581 li a1,0
782: 00000517 auipc a0,0x0
786: 02450513 addi a0,a0,36 # 7a6 <main._omp_fn.0>
~~~~~~~~~~~~~~
78a: f07ff0ef jal ra,690 <GOMP_parallel@plt>
~~~~~~~~~~~~~~~~~
78e: 00000517 auipc a0,0x0
792: 0a250513 addi a0,a0,162 # 830 <__libc_csu_fini+0x2>
796: edbff0ef jal ra,670 <puts@plt>
79a: 4781 li a5,0
79c: 853e mv a0,a5
79e: 60e2 ld ra,24(sp)
7a0: 6442 ld s0,16(sp)
7a2: 6105 addi sp,sp,32
7a4: 8082 ret
00000000000007a6 <main._omp_fn.0>:
7a6: 7179 addi sp,sp,-48
7a8: f406 sd ra,40(sp)
7aa: f022 sd s0,32(sp)
7ac: 1800 addi s0,sp,48
7ae: fca43c23 sd a0,-40(s0)
7b2: eafff0ef jal ra,660 <omp_get_thread_num@plt>
7b6: 87aa mv a5,a0
7b8: fef42623 sw a5,-20(s0)
7bc: fec42783 lw a5,-20(s0)
7c0: 85be mv a1,a5
7c2: 00000517 auipc a0,0x0
7c6: 07650513 addi a0,a0,118 # 838 <__libc_csu_fini+0xa>
7ca: eb7ff0ef jal ra,680 <printf@plt>
7ce: 70a2 ld ra,40(sp)
7d0: 7402 ld s0,32(sp)
7d2: 6145 addi sp,sp,48
7d4: 8082 ret
- gcc 会把 `omp parallel` 对应的 block `{int id=….;printf(…)}` 编译成一个独立的函数 `main._omp_fn.0`
- `omp parallel` 对应 `GOMP_parallel` 函数, 它的实现在 libgomp 中, 并且调用时传递了 `2` 作为参数, 即需要两个线程的并行
1.3.1.1. GOMP_parallel
void GOMP_parallel( void (*fn)(void *), void *data, unsigned num_threads, unsigned int flags) { num_threads = gomp_resolve_num_threads(num_threads, 0); gomp_team_start( fn, data, num_threads, flags, gomp_new_team(num_threads), NULL); /* NOTE: fn 即 main._omp_fn.0, 它在主线程会直接执行一次, 所以即使指定 * num_threads 为 0, 也会执行一次 */ fn(data); ialias_call(GOMP_parallel_end)(); } gomp_team_start: /* NOTE: gomp_team_start 的 num_threads 是从 1 开始计数的, 因为主线程已经执行了一次了 */ if (nthreads == 1) return; i = 1; for (; i < nthreads; ++i): start_data->fn = fn; start_data->fn_data = data; // ... pthread_create (&start_data->handle, attr, gomp_thread_start, start_data); gomp_thread_start: // ... local_fn = data->fn; local_data = data->fn_data; local_fn (local_data); // ...
1.3.2. sync
1.3.2.1. atomic
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel num_threads(2) { #pragma omp atomic sum += 1; } printf("%d\n", sum); }
$> riscv64-linux-gnu-gcc test.c -O0 -g -fopenmp -c -fno-stack-protector
$> riscv64-linux-gnu-objdump -d -r ./test.o
000000000000005a <main._omp_fn.0>:
5a: 1101 addi sp,sp,-32
5c: ec22 sd s0,24(sp)
5e: 1000 addi s0,sp,32
60: fea43423 sd a0,-24(s0)
64: fe843783 ld a5,-24(s0)
68: 639c ld a5,0(a5)
6a: 4705 li a4,1
6c: 00e7a02f amoadd.w zero,a4,(a5)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
70: 6462 ld s0,24(sp)
72: 6105 addi sp,sp,32
74: 8082 ret
`omp atomic` 在 riscv 上实际对应的是 AMO 指令
1.3.2.2. critical
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel num_threads(2) { #pragma omp critical sum += 1; } printf("%d\n", sum); }
0000000000000062 <main._omp_fn.0>:
62: 1101 addi sp,sp,-32
64: ec06 sd ra,24(sp)
66: e822 sd s0,16(sp)
68: 1000 addi s0,sp,32
6a: fea43423 sd a0,-24(s0)
6e: 00000097 auipc ra,0x0
6e: R_RISCV_CALL_PLT GOMP_critical_start
6e: R_RISCV_RELAX *ABS*
72: 000080e7 jalr ra # 6e <main._omp_fn.0+0xc>
76: fe843783 ld a5,-24(s0)
7a: 439c lw a5,0(a5)
7c: 2785 addiw a5,a5,1
7e: 0007871b sext.w a4,a5
82: fe843783 ld a5,-24(s0)
86: c398 sw a4,0(a5)
88: 00000097 auipc ra,0x0
88: R_RISCV_CALL_PLT GOMP_critical_end
88: R_RISCV_RELAX *ABS*
8c: 000080e7 jalr ra # 88 <main._omp_fn.0+0x26>
90: 60e2 ld ra,24(sp)
92: 6442 ld s0,16(sp)
94: 6105 addi sp,sp,32
96: 8082 ret
`omp critical` 对应 `GOMP_critical_start` 和 `GOMP_critical_end`, 前者对应 posix 的 `pthread_mutex_lock` 或者 linux 的 `futex_wait`, 后者对应 `pthread_mutex_unlock` 或者 `futex_wake`
1.3.2.3. barrier
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { printf("%d\n", omp_get_thread_num()); #pragma omp barrier printf("%d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1141 addi sp,sp,-16
2: e406 sd ra,8(sp)
4: e022 sd s0,0(sp)
6: 00000097 auipc ra,0x0
6: R_RISCV_CALL_PLT omp_get_thread_num
6: R_RISCV_RELAX *ABS*
a: 000080e7 jalr ra # 6 <main._omp_fn.0+0x6>
e: 862a mv a2,a0
10: 842a mv s0,a0
12: 00000597 auipc a1,0x0
12: R_RISCV_PCREL_HI20 .LC0
12: R_RISCV_RELAX *ABS*
16: 00058593 mv a1,a1
16: R_RISCV_PCREL_LO12_I .L0
16: R_RISCV_RELAX *ABS*
1a: 4505 li a0,1
1c: 00000097 auipc ra,0x0
1c: R_RISCV_CALL_PLT __printf_chk
1c: R_RISCV_RELAX *ABS*
20: 000080e7 jalr ra # 1c <main._omp_fn.0+0x1c>
24: 00000097 auipc ra,0x0
24: R_RISCV_CALL_PLT GOMP_barrier
~~~~~~~~~~~~
24: R_RISCV_RELAX *ABS*
28: 000080e7 jalr ra # 24 <main._omp_fn.0+0x24>
2c: 8622 mv a2,s0
2e: 6402 ld s0,0(sp)
30: 60a2 ld ra,8(sp)
32: 00000597 auipc a1,0x0
32: R_RISCV_PCREL_HI20 .LC0
32: R_RISCV_RELAX *ABS*
36: 00058593 mv a1,a1
36: R_RISCV_PCREL_LO12_I .L0
36: R_RISCV_RELAX *ABS*
3a: 4505 li a0,1
3c: 0141 addi sp,sp,16
3e: 00000317 auipc t1,0x0
3e: R_RISCV_CALL_PLT __printf_chk
3e: R_RISCV_RELAX *ABS*
42: 00030067 jr t1 # 3e <main._omp_fn.0+0x3e>
1.3.2.3.1. GOMP_barrier
GOMP_barrier 需要保证:
- 插入一个 memory fence
- 已经执行到 barrier 的线程需要等待其它尚未执行到 barrier 的线程
void GOMP_barrier(): gomp_team_barrier_wait(&team->barrier); gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); gomp_barrier_state_t gomp_barrier_wait_start (gomp_barrier_t *bar): /* NOTE: __atomic_xx 后的 MEMMODEL 相当于 memory fence */ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ret &= -BAR_INCR | BAR_CANCELLED; /* NOTE: 当 awaited 减小到 0 时, 意味着最后一个线程到达 barrier, 这个带有 * BAR_WAS_LAST 标记的线程会负责唤醒前面等待的线程 */ if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0): ret |= BAR_WAS_LAST; return ret; void gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state): unsigned int generation, gen; /* NOTE: if (__builtin_expect (x, 0)) {xxx} 相当于 if (x) {xxx}, 其中的 0 只是给 * compiler 的 hint, 告诉它 x 为 0 的概率较大, 可以让 compiler 产生有利于 branch * prediction 的代码 */ if (__builtin_expect (state & BAR_WAS_LAST, 0)): bar->awaited = bar->total; team->work_share_cancelled = 0; __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); /* NOTE: 如果当前线程是 BAR_WAS_LAST, 则它会 wake 其它线程 */ futex_wake ((int *) &bar->generation, INT_MAX); return; generation = state; state &= ~BAR_CANCELLED; do: /* NOTE: 非 BAR_WAS_LAST 的线程会等待 BAR_WAS_LAST 线程的 wake */ do_wait ((int *) &bar->generation, generation); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); if (__builtin_expect (gen & BAR_TASK_PENDING, 0)): gomp_barrier_handle_tasks (state); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); generation |= gen & BAR_WAITING_FOR_TASK; while (gen != state + BAR_INCR);
1.3.2.4. single
single 要求只有一个线程能执行相应代码, 但并没有要求是哪一个线程
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp single { printf("single %d\n", omp_get_thread_num()); } printf("single %d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1101 addi sp,sp,-32
2: ec06 sd ra,24(sp)
4: e822 sd s0,16(sp)
6: e426 sd s1,8(sp)
8: 00000097 auipc ra,0x0
8: R_RISCV_CALL_PLT GOMP_single_start
~~~~~~~~~~~~~~~~~
8: R_RISCV_RELAX *ABS*
c: 000080e7 jalr ra # 8 <main._omp_fn.0+0x8>
10: 84aa mv s1,a0
12: 00000097 auipc ra,0x0
12: R_RISCV_CALL_PLT omp_get_thread_num
12: R_RISCV_RELAX *ABS*
16: 000080e7 jalr ra # 12 <main._omp_fn.0+0x12>
1a: 842a mv s0,a0
1c: e09d bnez s1,42 <.L2>
1c: R_RISCV_RVC_BRANCH .L2
000000000000001e <.L3>:
1e: 00000097 auipc ra,0x0
1e: R_RISCV_CALL_PLT GOMP_barrier
~~~~~~~~~~~~
single 后有一个隐式的 barrier
1e: R_RISCV_RELAX *ABS*
22: 000080e7 jalr ra # 1e <.L3>
26: 8622 mv a2,s0
28: 6442 ld s0,16(sp)
2a: 60e2 ld ra,24(sp)
2c: 64a2 ld s1,8(sp)
2e: 00000597 auipc a1,0x0
2e: R_RISCV_PCREL_HI20 .LC0
2e: R_RISCV_RELAX *ABS*
32: 00058593 mv a1,a1
32: R_RISCV_PCREL_LO12_I .L0
32: R_RISCV_RELAX *ABS*
36: 4505 li a0,1
38: 6105 addi sp,sp,32
3a: 00000317 auipc t1,0x0
3a: R_RISCV_CALL_PLT __printf_chk
3a: R_RISCV_RELAX *ABS*
3e: 00030067 jr t1 # 3a <.L3+0x1c>
0000000000000042 <.L2>:
42: 862a mv a2,a0
44: 00000597 auipc a1,0x0
44: R_RISCV_PCREL_HI20 .LC0
44: R_RISCV_RELAX *ABS*
48: 00058593 mv a1,a1
48: R_RISCV_PCREL_LO12_I .L0
48: R_RISCV_RELAX *ABS*
4c: 4505 li a0,1
4e: 00000097 auipc ra,0x0
4e: R_RISCV_CALL_PLT __printf_chk
4e: R_RISCV_RELAX *ABS*
52: 000080e7 jalr ra # 4e <.L2+0xc>
56: b7e1 j 1e <.L3>
56: R_RISCV_RVC_JUMP .L3
1.3.2.4.1. GOMP_single_start
bool GOMP_single_start(void): struct gomp_thread *thr = gomp_thread(); struct gomp_team *team = thr->ts.team; unsigned long single_count; if (__builtin_expect(team == NULL, 0)) return true; single_count = thr->ts.single_count++; /* NOTE: __sync_bool_compare_and_swap (a,b,c) 的操作是: 比较 a, b, 若相等, 则 * 把 c 赋给 a, 并返回 true, 且整个过程是原子操作. * * N 个线程竞争, 只有一个会胜出. 但不论是否胜出, 所有线程的 single_count 以及 * team 的 single_count 都会加 1, 以便能共同竞争下一个 single, 所以 single 不 * 会固定给某一个线程执行 */ return __sync_bool_compare_and_swap( &team->single_count, single_count, single_count + 1L);
1.3.2.5. master
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp master { printf("%d\n", omp_get_thread_num()); } printf("%d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1141 addi sp,sp,-16
2: e022 sd s0,0(sp)
4: e406 sd ra,8(sp)
6: 00000097 auipc ra,0x0
6: R_RISCV_CALL_PLT omp_get_thread_num
6: R_RISCV_RELAX *ABS*
a: 000080e7 jalr ra # 6 <main._omp_fn.0+0x6>
e: 842a mv s0,a0
10: cd11 beqz a0,2c <.L2>
~~~~~~~~~~~~~~~~~~~~
由于 master 只能由主线程执行, 所以这里简单判断 omp_get_thread_num 是否
为 0 即可, 不需要像 single 那样有复杂的同步操作
10: R_RISCV_RVC_BRANCH .L2
12: 8622 mv a2,s0
14: 6402 ld s0,0(sp)
16: 60a2 ld ra,8(sp)
18: 00000597 auipc a1,0x0
18: R_RISCV_PCREL_HI20 .LC0
18: R_RISCV_RELAX *ABS*
1c: 00058593 mv a1,a1
1c: R_RISCV_PCREL_LO12_I .L0
1c: R_RISCV_RELAX *ABS*
20: 4505 li a0,1
22: 0141 addi sp,sp,16
24: 00000317 auipc t1,0x0
24: R_RISCV_CALL_PLT __printf_chk
24: R_RISCV_RELAX *ABS*
28: 00030067 jr t1 # 24 <main._omp_fn.0+0x24>
000000000000002c <.L2>:
2c: 4601 li a2,0
2e: 00000597 auipc a1,0x0
2e: R_RISCV_PCREL_HI20 .LC0
2e: R_RISCV_RELAX *ABS*
32: 00058593 mv a1,a1
32: R_RISCV_PCREL_LO12_I .L0
32: R_RISCV_RELAX *ABS*
36: 4505 li a0,1
38: 00000097 auipc ra,0x0
38: R_RISCV_CALL_PLT __printf_chk
38: R_RISCV_RELAX *ABS*
3c: 000080e7 jalr ra # 38 <.L2+0xc>
40: 8622 mv a2,s0
42: 6402 ld s0,0(sp)
44: 60a2 ld ra,8(sp)
46: 00000597 auipc a1,0x0
46: R_RISCV_PCREL_HI20 .LC0
46: R_RISCV_RELAX *ABS*
4a: 00058593 mv a1,a1
4a: R_RISCV_PCREL_LO12_I .L0
4a: R_RISCV_RELAX *ABS*
4e: 4505 li a0,1
50: 0141 addi sp,sp,16
52: 00000317 auipc t1,0x0
52: R_RISCV_CALL_PLT __printf_chk
52: R_RISCV_RELAX *ABS*
56: 00030067 jr t1 # 52 <.L2+0x26>
1.3.3. for
1.3.3.1. static schedule
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for num_threads(2) for (int i = 0; i < 10; i++) { printf("%d\n", i); } printf("%d\n", sum); }
默认情况下 `omp for` 使用 `schedule (static, N/num_threads)`, work sharing 是静态的, 由编译器直接完成
$> riscv64-linux-gnu-gcc test.c -O2 -fopenmp -c -fno-stack-protector
$> riscv64-linux-gnu-objdump -d -r ./test.o
0000000000000000 <main._omp_fn.0>:
0: 1101 addi sp,sp,-32
2: ec06 sd ra,24(sp)
4: e822 sd s0,16(sp)
6: e426 sd s1,8(sp)
8: e04a sd s2,0(sp)
a: 00000097 auipc ra,0x0
a: R_RISCV_CALL omp_get_num_threads
a: R_RISCV_RELAX *ABS*
e: 000080e7 jalr ra # a <main._omp_fn.0+0xa>
12: 842a mv s0,a0
14: 00000097 auipc ra,0x0
14: R_RISCV_CALL omp_get_thread_num
14: R_RISCV_RELAX *ABS*
18: 000080e7 jalr ra # 14 <main._omp_fn.0+0x14>
1c: 47a9 li a5,10
1e: 0287e73b remw a4,a5,s0
22: 0287c6bb divw a3,a5,s0
~~~~~~~~~~~~~~~~
a3=10/(omp_get_num_threads()), a3 表示 schedule(static, step) 中的 step
26: 02e54c63 blt a0,a4,5e <.L2>
~~~~~~~~~~~~~~~~~~
如果 step 不能整除, 会把多余的部分平摊到前面的 thread, 例如 10%4=2, 则 4 线程分到的任务为 3,3,2,2
26: R_RISCV_BRANCH .L2
000000000000002a <.L5>:
2a: 02a684bb mulw s1,a3,a0
~~~~~~~~~~~~~~~~
a0 是 omp_get_thread_num(), s1 表示 begin=omp_get_thread_num()*step
2e: 00e4843b addw s0,s1,a4
32: 00d404bb addw s1,s0,a3
~~~~~~~~~~~~~~~~
s0 是 begin, s1 是 begin+step, 即 end, 当前 thread 处理的范围是半开区间 [s0, s1)
36: 00945e63 bge s0,s1,52 <.L1>
36: R_RISCV_BRANCH .L1
3a: 00000937 lui s2,0x0
3a: R_RISCV_HI20 .LC0
3a: R_RISCV_RELAX *ABS*
000000000000003e <.L4>:
3e: 85a2 mv a1,s0
40: 00090513 mv a0,s2
40: R_RISCV_LO12_I .LC0
40: R_RISCV_RELAX *ABS*
44: 2405 addiw s0,s0,1
46: 00000097 auipc ra,0x0
46: R_RISCV_CALL printf
46: R_RISCV_RELAX *ABS*
4a: 000080e7 jalr ra # 46 <.L4+0x8>
4e: fe8498e3 bne s1,s0,3e <.L4>
4e: R_RISCV_BRANCH .L4
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.L4 对应的代码是 for (i=start; i<end; i++) {printf("%d\n",i);}
0000000000000052 <.L1>:
52: 60e2 ld ra,24(sp)
54: 6442 ld s0,16(sp)
56: 64a2 ld s1,8(sp)
58: 6902 ld s2,0(sp)
5a: 6105 addi sp,sp,32
5c: 8082 ret
000000000000005e <.L2>:
5e: 2685 addiw a3,a3,1
60: 4701 li a4,0
62: b7e1 j 2a <.L5>
62: R_RISCV_RVC_JUMP .L5
1.3.3.2. dynamic schedule
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for num_threads(2) schedule(dynamic, 1) for (int i = 0; i < 10; i++) { printf("%d\n", i); } }
0000000000000000 <main>:
0: 00000537 lui a0,0x0
0: R_RISCV_HI20 main._omp_fn.0
0: R_RISCV_RELAX *ABS*
4: 1141 addi sp,sp,-16
6: 4881 li a7,0
8: 4805 li a6,1
a: 4785 li a5,1
c: 4729 li a4,10
e: 4681 li a3,0
10: 4609 li a2,2
12: 4581 li a1,0
14: 00050513 mv a0,a0
14: R_RISCV_LO12_I main._omp_fn.0
14: R_RISCV_RELAX *ABS*
18: e406 sd ra,8(sp)
1a: 00000097 auipc ra,0x0
1a: R_RISCV_CALL GOMP_parallel_loop_nonmonotonic_dynamic
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
设置 dynamic schedule 的参数, 例如 chunk size (1)
1a: R_RISCV_RELAX *ABS*
1e: 000080e7 jalr ra # 1a <main+0x1a>
0000000000000000 <main._omp_fn.0>:
0: 7179 addi sp,sp,-48
2: 002c addi a1,sp,8
4: 850a mv a0,sp
6: f406 sd ra,40(sp)
8: f022 sd s0,32(sp)
a: ec26 sd s1,24(sp)
c: e84a sd s2,16(sp)
e: 00000097 auipc ra,0x0
e: R_RISCV_CALL GOMP_loop_nonmonotonic_dynamic_next
e: R_RISCV_RELAX *ABS*
12: 000080e7 jalr ra # e <main._omp_fn.0+0xe>
16: c515 beqz a0,42 <.L2>
16: R_RISCV_RVC_BRANCH .L2
18: 00000937 lui s2,0x0
18: R_RISCV_HI20 .LC0
18: R_RISCV_RELAX *ABS*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.L4, .L3 的代码相当于:
while (GOMP_loop_nonmonotonic_dynamic_next(*start,*end)) {
for (i=start;i<end;i++) {
printf("%d\n",i);
}
}
000000000000001c <.L4>:
1c: 4402 lw s0,0(sp)
1e: 44a2 lw s1,8(sp)
~~~~~~~~~~~~
s0 是 start, s1 是 end, 由于 GOMP_loop_nonmonotonic_dynamic_next 的参数是指针 (long *istart, long *iend),
所以调用方通过栈上变量的地址来传递参数
0000000000000020 <.L3>:
20: 85a2 mv a1,s0
22: 00090513 mv a0,s2
22: R_RISCV_LO12_I .LC0
22: R_RISCV_RELAX *ABS*
26: 2405 addiw s0,s0,1
28: 00000097 auipc ra,0x0
28: R_RISCV_CALL printf
28: R_RISCV_RELAX *ABS*
2c: 000080e7 jalr ra # 28 <.L3+0x8>
30: fe9448e3 blt s0,s1,20 <.L3>
~~~~~~~~~~~~~~~~~~~
.L3 对应内层的 for (i=start;i<end;i++) {xxx} 循环
30: R_RISCV_BRANCH .L3
34: 002c addi a1,sp,8
~~~~~~~~~~~~~~~ end 的地址
36: 850a mv a0,sp
~~~~~~~~~ start 的地址
38: 00000097 auipc ra,0x0
38: R_RISCV_CALL GOMP_loop_nonmonotonic_dynamic_next
38: R_RISCV_RELAX *ABS*
3c: 000080e7 jalr ra # 38 <.L3+0x18>
40: fd71 bnez a0,1c <.L4>
40: R_RISCV_RVC_BRANCH .L4
~~~~~
.L4 对应外层的 while (GOMP_loop_nonmonotonic_dynamic_next(*start,*end)) 循环
可见 dynamic schedule 有额外的运行时开销
1.3.3.2.1. GOMP_loop_nonmonotonic_dynamic_next
bool GOMP_loop_nonmonotonic_dynamic_next(long *istart, long *iend) return gomp_loop_dynamic_next (istart, iend); return gomp_iter_dynamic_next (istart, iend); long tmp = __sync_fetch_and_add (&ws->next, chunk); if (tmp >= end) return false; nend = tmp + chunk; if (nend > end) nend = end; *pstart = tmp; *pend = nend; return true;
1.3.3.3. reduction
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for reduction(+ : sum) num_threads(2) for (int i = 0; i < 10; i++) { sum += i; } printf("%d\n", sum); return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1101 addi sp,sp,-32
2: ec06 sd ra,24(sp)
4: e822 sd s0,16(sp)
6: e426 sd s1,8(sp)
8: 842a mv s0,a0
a: 00000097 auipc ra,0x0
a: R_RISCV_CALL_PLT omp_get_num_threads
a: R_RISCV_RELAX *ABS*
e: 000080e7 jalr ra # a <main._omp_fn.0+0xa>
12: 84aa mv s1,a0
14: 00000097 auipc ra,0x0
14: R_RISCV_CALL_PLT omp_get_thread_num
14: R_RISCV_RELAX *ABS*
18: 000080e7 jalr ra # 14 <main._omp_fn.0+0x14>
1c: 46a9 li a3,10
1e: 0296e63b remw a2,a3,s1
22: 87aa mv a5,a0
24: 0296c5bb divw a1,a3,s1
28: 02c54e63 blt a0,a2,64 <.L2>
28: R_RISCV_BRANCH .L2
000000000000002c <.L5>:
2c: 4681 li a3,0
2e: 02f5873b mulw a4,a1,a5
32: 00c707bb addw a5,a4,a2
36: 00b7873b addw a4,a5,a1
~~~~~~~~~~~~~~~~
当前线程要处理的范围是半开区间 [a5, a4)
3a: 00e7c963 blt a5,a4,4c <.L4>
3a: R_RISCV_BRANCH .L4
3e: 00d4202f amoadd.w zero,a3,(s0)
~~~~~~~~~~~~~~~~~~~~~~~~
线程处理完以后把 a3 (当前线程的 partial sum) 累加到 (s0)
42: 60e2 ld ra,24(sp)
44: 6442 ld s0,16(sp)
46: 64a2 ld s1,8(sp)
48: 6105 addi sp,sp,32
4a: 8082 ret
000000000000004c <.L4>:
4c: 9ebd addw a3,a3,a5
4e: 2785 addiw a5,a5,1
~~~~~~~~~~~~~~~~
a3 是 partial sum, 不断的累加 a5 (i)
50: fef71ee3 bne a4,a5,4c <.L4>
50: R_RISCV_BRANCH .L4
54: 2681 sext.w a3,a3
56: 00d4202f amoadd.w zero,a3,(s0)
5a: 60e2 ld ra,24(sp)
5c: 6442 ld s0,16(sp)
5e: 64a2 ld s1,8(sp)
60: 6105 addi sp,sp,32
62: 8082 ret
0000000000000064 <.L2>:
64: 2585 addiw a1,a1,1
66: 4601 li a2,0
68: b7d1 j 2c <.L5>
68: R_RISCV_RVC_JUMP .L5
reduction 的代码是由编译器直接生成的, 不需要调用 libgomp. 生成的代码和手写的不用 reduction 的代码 (例如 hello_openmp 中的 pi_parallel_for) 基本相同
1.3.4. task
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp single { #pragma omp task printf("%d\n", omp_get_thread_num()); #pragma omp task printf("%d\n", omp_get_thread_num()); } } return 0; }
0000000000000014 <.L6>:
14: 4881 li a7,0
16: 4801 li a6,0
18: 4785 li a5,1
1a: 4705 li a4,1
1c: 4681 li a3,0
1e: 4601 li a2,0
20: 4581 li a1,0
22: 00000517 auipc a0,0x0
22: R_RISCV_PCREL_HI20 main._omp_fn.1
22: R_RISCV_RELAX *ABS*
26: 00050513 mv a0,a0
26: R_RISCV_PCREL_LO12_I .L0
26: R_RISCV_RELAX *ABS*
2a: e002 sd zero,0(sp)
2c: 00000097 auipc ra,0x0
2c: R_RISCV_CALL_PLT GOMP_task
~~~~~~~~~
GOMP_task(main._omp_fn.1)
2c: R_RISCV_RELAX *ABS*
30: 000080e7 jalr ra # 2c <.L6+0x18>
34: 4881 li a7,0
36: e002 sd zero,0(sp)
38: 4801 li a6,0
3a: 4785 li a5,1
3c: 4705 li a4,1
3e: 4681 li a3,0
40: 4601 li a2,0
42: 4581 li a1,0
44: 00000517 auipc a0,0x0
44: R_RISCV_PCREL_HI20 main._omp_fn.2
44: R_RISCV_RELAX *ABS*
48: 00050513 mv a0,a0
48: R_RISCV_PCREL_LO12_I .L0
48: R_RISCV_RELAX *ABS*
4c: 00000097 auipc ra,0x0
4c: R_RISCV_CALL_PLT GOMP_task
~~~~~~~~~
GOMP_task(main._omp_fn.2)
4c: R_RISCV_RELAX *ABS*
50: 000080e7 jalr ra # 4c <.L6+0x38>
54: 60e2 ld ra,24(sp)
56: 6105 addi sp,sp,32
58: 8082 ret
000000000000005a <main._omp_fn.1>:
5a: 1141 addi sp,sp,-16
5c: e406 sd ra,8(sp)
5e: 00000097 auipc ra,0x0
5e: R_RISCV_CALL_PLT omp_get_thread_num
5e: R_RISCV_RELAX *ABS*
62: 000080e7 jalr ra # 5e <main._omp_fn.1+0x4>
66: 60a2 ld ra,8(sp)
68: 862a mv a2,a0
6a: 00000597 auipc a1,0x0
6a: R_RISCV_PCREL_HI20 .LC0
6a: R_RISCV_RELAX *ABS*
6e: 00058593 mv a1,a1
6e: R_RISCV_PCREL_LO12_I .L0
6e: R_RISCV_RELAX *ABS*
72: 4505 li a0,1
74: 0141 addi sp,sp,16
76: 00000317 auipc t1,0x0
76: R_RISCV_CALL_PLT __printf_chk
76: R_RISCV_RELAX *ABS*
7a: 00030067 jr t1 # 76 <main._omp_fn.1+0x1c>
000000000000007e <main._omp_fn.2>:
7e: 00000317 auipc t1,0x0
7e: R_RISCV_CALL main._omp_fn.1
7e: R_RISCV_RELAX *ABS*
82: 00030067 jr t1 # 7e <main._omp_fn.2>
每个 task 对应一个单独的 main._omp_fn.x, 通过 GOMP_task 放在某个线程里执行
1.3.4.1. GOMP_task
void GOMP_task( void (*fn)(void *), void *data, void (*cpyfn)(void *, void *), long arg_size, long arg_align, bool if_clause, unsigned flags, void **depend, int priority_arg, void *detach): priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority, PRIORITY_INSERT_END, /*adjust_parent_depends_on=*/false, task->parent_depends_on); gomp_team_barrier_set_task_pending (&team->barrier); do_wake = team->task_running_count + !parent->in_tied_task < team->nthreads; if (do_wake) gomp_team_barrier_wake (&team->barrier, 1);
1.3.5. target
#include <omp.h> #include <stdio.h> int main() { int x = 1; #pragma omp target map(tofrom : x) for (int i = 0; i < 10; i++) { x += i; } printf("x = %d\n", x); return 0; }
openmp 从 4.0 开始支持 offload 到不同的 target, 例如 nvptx 和 amd gcn.
offload 过程与 DPC++, TVM BYOC Codegen 以及 ComputeCpp 类似, 以 nvptx 为例, 主要步骤是:
- gcc 编译出 block 对应的 ptx 汇编
- 使用 mkoffload 生成一个头文件, 它会包含 ptx 汇编, 用于注册的初始化函数以及一些 mapping 信息, 确保运行时 gomp 可以找到对应的 ptx 并调用 cuda 去编译和执行它. 这个头文件与 ComputeCpp 生成的 sycl 文件类似
1.3.5.1. GOMP_offload_register_ver
GOMP_offload_register_ver 负责向 runtime 注册 offload 相关的信息
通过 `--save-temps` 可以看到这个生成的 offload 头文件:
$> gcc test.c -O0 -fopenmp -fcf-protection=none -foffload=-misa=sm_35 -fno-stack-protector --save-temps
$> cat ccuL4e4U.i
static const char ptx_code_0[] =
"// BEGIN PREAMBLE\n"
".version 3.1\n"
".target sm_35\n"
".address_size 64\n"
"// BEGIN FUNCTION DECL: main$_omp_fn$0$impl\n"
".func main$_omp_fn$0$impl (.param .u64 %in_ar0);\n"
"// BEGIN GLOBAL FUNCTION DECL: gomp_nvptx_main\n"
".extern .func gomp_nvptx_main (.param .u64 %in_ar1, .param .u64 %in_ar2);\n"
"// BEGIN GLOBAL VAR DECL: __nvptx_stacks\n"
".extern .shared .u64 __nvptx_stacks[32];\n"
"// BEGIN GLOBAL VAR DECL: __nvptx_uni\n"
".extern .shared .u32 __nvptx_uni[32];\n"
"// END PREAMBLE\n"
".visible .entry main$_omp_fn$0 (.param .u64 %arg, .param .u64 %stack, .param .u64 %sz)\n"
"{\n"
".reg .u32 %r<3>;\n"
".reg .u64 %R<4>;\n"
/* ... */
"mov.u32 %r0,%tid.y;\n"
/* ... */
static const char ptx_code_19[] = "" /* ... */
static const struct ptx_obj {
const char *code;
long unsigned int size;
} ptx_objs[] = {
{ptx_code_0, sizeof (ptx_code_0)},
{ptx_code_1, sizeof (ptx_code_1)},
{ptx_code_2, sizeof (ptx_code_2)},
{ptx_code_3, sizeof (ptx_code_3)},
{ptx_code_4, sizeof (ptx_code_4)},
{ptx_code_5, sizeof (ptx_code_5)},
{ptx_code_6, sizeof (ptx_code_6)},
{ptx_code_7, sizeof (ptx_code_7)},
{ptx_code_8, sizeof (ptx_code_8)},
{ptx_code_9, sizeof (ptx_code_9)},
{ptx_code_10, sizeof (ptx_code_10)},
{ptx_code_11, sizeof (ptx_code_11)},
{ptx_code_12, sizeof (ptx_code_12)},
{ptx_code_13, sizeof (ptx_code_13)},
{ptx_code_14, sizeof (ptx_code_14)},
{ptx_code_15, sizeof (ptx_code_15)},
{ptx_code_16, sizeof (ptx_code_16)},
{ptx_code_17, sizeof (ptx_code_17)},
{ptx_code_18, sizeof (ptx_code_18)},
{ptx_code_19, sizeof (ptx_code_19)}
};
static const char *const var_mappings[] = {
};
static const struct nvptx_fn {
const char *name;
unsigned short dim[3];
} func_mappings[] = {
{"main$_omp_fn$0"}
};
static const struct nvptx_tdata {
const struct ptx_obj *ptx_objs;
unsigned ptx_num;
const char *const *var_names;
unsigned var_num;
const struct nvptx_fn *fn_names;
unsigned fn_num;
} target_data = {
ptx_objs, sizeof (ptx_objs) / sizeof (ptx_objs[0]),
var_mappings, sizeof (var_mappings) / sizeof (var_mappings[0]),
func_mappings, sizeof (func_mappings) / sizeof (func_mappings[0])
};
extern void GOMP_offload_register_ver (unsigned, const void *, int, const void *);
extern void GOMP_offload_unregister_ver (unsigned, const void *, int, const void *);
extern const void *const __OFFLOAD_TABLE__[];
static __attribute__((constructor)) void init (void)
{
GOMP_offload_register_ver (0x10001, __OFFLOAD_TABLE__, 5 , &target_data);
};
static __attribute__((destructor)) void fini (void)
{
GOMP_offload_unregister_ver (0x10001, __OFFLOAD_TABLE__, 5 , &target_data);
};
通过注册的 target_data 中的 func_mappings 和 ptx_objs, 运行时可以找到函数 (例如 main._omp_fn.0) 对应的 ptx code (例如 ptx_code_0)
1.3.5.2. GOMP_target_ext
0000000000001169 <main>:
# ...
11a7: 6a 00 pushq $0x0
11a9: 4c 8d 0d 68 ee 06 00 lea 0x6ee68(%rip),%r9 # 70018 <.omp_data_kinds.6.2657>
11b0: 4c 8d 05 59 ee 06 00 lea 0x6ee59(%rip),%r8 # 70010 <.omp_data_sizes.5.2656>
11b7: 48 89 c1 mov %rax,%rcx
11ba: ba 01 00 00 00 mov $0x1,%edx
11bf: 48 8d 35 2b 00 00 00 lea 0x2b(%rip),%rsi # 11f1 <main._omp_fn.0>
~~~~~~~~~~~~~~~~~~~~~~~
11c6: bf ff ff ff ff mov $0xffffffff,%edi
11cb: e8 60 fe ff ff callq 1030 <GOMP_target_ext@plt>
~~~~~~~~~~~~~~~~~~~~
GOMP_target_ext 会负责调用 ptx runtime 相关代码去找 main._omp_fn.0
对应的 ptx code 并执行, 但如果没有成功, 可以 fallback 到 cpu 代码
所以下面有 main._omp_fn.0 完整的 cpu 实现
11d0: 48 83 c4 20 add $0x20,%rsp
11d4: 8b 45 d4 mov -0x2c(%rbp),%eax
11d7: 89 c6 mov %eax,%esi
11d9: 48 8d 3d 24 0e 00 00 lea 0xe24(%rip),%rdi # 2004 <_IO_stdin_used+0x4>
11e0: b8 00 00 00 00 mov $0x0,%eax
11e5: e8 66 fe ff ff callq 1050 <printf@plt>
11ea: b8 00 00 00 00 mov $0x0,%eax
11ef: c9 leaveq
11f0: c3 retq
00000000000011f1 <main._omp_fn.0>:
11f1: 55 push %rbp
11f2: 48 89 e5 mov %rsp,%rbp
11f5: 48 89 7d e8 mov %rdi,-0x18(%rbp)
11f9: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
1200: 83 7d fc 09 cmpl $0x9,-0x4(%rbp)
~~~~~~~~~~~~~~~~~~~~~~ 循环 10 次
1204: 7f 1d jg 1223 <main._omp_fn.0+0x32>
1206: 48 8b 45 e8 mov -0x18(%rbp),%rax
120a: 48 8b 00 mov (%rax),%rax
120d: 8b 10 mov (%rax),%edx
120f: 8b 45 fc mov -0x4(%rbp),%eax
1212: 01 c2 add %eax,%edx
1214: 48 8b 45 e8 mov -0x18(%rbp),%rax
1218: 48 8b 00 mov (%rax),%rax
121b: 89 10 mov %edx,(%rax)
121d: 83 45 fc 01 addl $0x1,-0x4(%rbp)
1221: eb dd jmp 1200 <main._omp_fn.0+0xf>
1223: 5d pop %rbp
1224: c3 retq
1225: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
122c: 00 00 00
122f: 90 nop
1.3.6. simd
openmp 支持通过 `#pragma omp simd` 给编译器一个提示, 以便对循环进行向量化.
1.4. Compiler Directive
gcc 对 openmp 的支持主要在两个 tree pass:
- omplower
- ompexp
1.4.1. Example
以下面的代码为例:
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel for num_threads(2) for (int i = 0; i < 10; i++) { printf("%d\n", omp_get_thread_num()); } return 0; }
$> riscv64-linux-gnu-gcc test.c -O2 -fopenmp -c -fno-stack-protector -fdump-tree-all
test.c.007t.omplower:
;; Function main (main, funcdef_no=23, decl_uid=2425, cgraph_uid=24, symbol_order=23)
Introduced new external node (main._omp_fn.0/24).
main (int argc, char * * argv)
{
int D.2430;
{
{
#pragma omp parallel num_threads(2) [child fn: main._omp_fn.0 (???)]
{
int i;
{
#pragma omp for nowait
for (i = 0; i < 10; i = i + 1)
D.2429 = omp_get_thread_num ();
printf ("%d\n", D.2429);
#pragma omp continue (i, i)
#pragma omp return(nowait)
}
}
#pragma omp return
}
D.2430 = 0;
return D.2430;
}
D.2430 = 0;
return D.2430;
}
test.c.013t.ompexp:
;; Function main._omp_fn.0 (main._omp_fn.0, funcdef_no=24, decl_uid=2432, cgraph_uid=25, symbol_order=24)
main._omp_fn.0 (void * .omp_data_i)
{
int D.2451;
int i;
int D.2449;
int D.2448;
int D.2447;
int tt.2;
int q.1;
int D.2444;
int D.2443;
<bb 11> :
<bb 3> :
D.2443 = __builtin_omp_get_num_threads ();
D.2444 = __builtin_omp_get_thread_num ();
q.1 = 10 / D.2443;
tt.2 = 10 % D.2443;
if (D.2444 < tt.2)
goto <bb 9>; [25.00%]
else
goto <bb 10>; [75.00%]
<bb 10> :
D.2447 = q.1 * D.2444;
D.2448 = D.2447 + tt.2;
D.2449 = D.2448 + q.1;
if (D.2448 >= D.2449)
goto <bb 5>; [INV]
else
goto <bb 8>; [INV]
<bb 8> :
i = D.2448;
<bb 4> :
D.2451 = __builtin_omp_get_thread_num ();
printf ("%d\n", D.2451);
i = i + 1;
if (i < D.2449)
goto <bb 4>; [INV]
else
goto <bb 5>; [INV]
<bb 5> :
<bb 6> :
return;
<bb 9> :
tt.2 = 0;
q.1 = q.1 + 1;
goto <bb 10>; [100.00%]
}
;; Function main (main, funcdef_no=23, decl_uid=2425, cgraph_uid=24, symbol_order=23)
Merging blocks 2 and 12
Merging blocks 2 and 7
main (int argc, char * * argv)
{
int D.2442;
int D.2440;
int D.2441;
int tt.2;
int q.1;
int D.2437;
int D.2436;
int i;
int D.2430;
<bb 2> :
__builtin_GOMP_parallel (main._omp_fn.0, 0B, 2, 0);
D.2430 = 0;
return D.2430;
}
1.4.2. omplower
TBD
1.4.3. ompexp
TBD
1.4.4. omp-builtins
生成代码中对 libgomp 函数的调用是以 builtin 的形式, 例如 `__builtin_GOMP_parallel (main._omp_fn.0, 0B, 2, 0)`, 这些 omp builtins 不需要定义 optab 和 rtl insn, 它们会在 rtl expand 时变成对 libgomp 函数的调用.
所有的 omp builtin 定义在 `omp-builtins.def`, `expand_builtin` 负责把它们变成函数调用.
rtx expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, int ignore) { tree fndecl = get_callee_fndecl (exp); machine_mode target_mode = TYPE_MODE (TREE_TYPE (exp)); int flags; switch (fcode) { CASE_FLT_FN (BUILT_IN_FABS): CASE_FLT_FN_FLOATN_NX (BUILT_IN_FABS): case BUILT_IN_FABSD32: case BUILT_IN_FABSD64: case BUILT_IN_FABSD128: target = expand_builtin_fabs (exp, target, subtarget); if (target) return target; break; ... default: /* just do library call, if unknown builtin */ break; } // NOTE: 这里生成对 libgomp 的调用 return expand_call(exp, target, ignore); }
