OpenMP
Table of Contents
1. OpenMP
1.1. Overview
OpenMP 主要包含两部分内容:
compiler directive
openmp 的 api 主要是 compiler directive 的形式, 例如 `#pragma omp parallel`, 编译器需要:
- 识别这些 directive
- 生成 parallel block 对应的匿名函数
- 调用 libgomp 中相应 api
libgomp
libgomp 中有少量函数例如 `omp_get_thread_num` 可以由用户代码直接调用, 其它的大部分的调用都是由编译器产生的, 例如 GOMP_parallel, GOMP_critical_start, GOMP_loop_nonmonotonic_dynamic_next 等
1.3. libgomp
1.3.1. parallel
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { int id = omp_get_thread_num(); printf("hello world %d\n", id); } printf("------\n"); }
$> riscv64-linux-gnu-gcc test.c -O0 -g -fopenmp $> readelf -a ./a.out|grep NEEDED 0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1] 0x0000000000000001 (NEEDED) Shared library: [libc.so.6] $> riscv64-linux-gnu-objdump -d -r ./a.out 000000000000076a <main>: 76a: 1101 addi sp,sp,-32 76c: ec06 sd ra,24(sp) 76e: e822 sd s0,16(sp) 770: 1000 addi s0,sp,32 772: 87aa mv a5,a0 774: feb43023 sd a1,-32(s0) 778: fef42623 sw a5,-20(s0) 77c: 4681 li a3,0 77e: 4609 li a2,2 ~~~~~~~~ 780: 4581 li a1,0 782: 00000517 auipc a0,0x0 786: 02450513 addi a0,a0,36 # 7a6 <main._omp_fn.0> ~~~~~~~~~~~~~~ 78a: f07ff0ef jal ra,690 <GOMP_parallel@plt> ~~~~~~~~~~~~~~~~~ 78e: 00000517 auipc a0,0x0 792: 0a250513 addi a0,a0,162 # 830 <__libc_csu_fini+0x2> 796: edbff0ef jal ra,670 <puts@plt> 79a: 4781 li a5,0 79c: 853e mv a0,a5 79e: 60e2 ld ra,24(sp) 7a0: 6442 ld s0,16(sp) 7a2: 6105 addi sp,sp,32 7a4: 8082 ret 00000000000007a6 <main._omp_fn.0>: 7a6: 7179 addi sp,sp,-48 7a8: f406 sd ra,40(sp) 7aa: f022 sd s0,32(sp) 7ac: 1800 addi s0,sp,48 7ae: fca43c23 sd a0,-40(s0) 7b2: eafff0ef jal ra,660 <omp_get_thread_num@plt> 7b6: 87aa mv a5,a0 7b8: fef42623 sw a5,-20(s0) 7bc: fec42783 lw a5,-20(s0) 7c0: 85be mv a1,a5 7c2: 00000517 auipc a0,0x0 7c6: 07650513 addi a0,a0,118 # 838 <__libc_csu_fini+0xa> 7ca: eb7ff0ef jal ra,680 <printf@plt> 7ce: 70a2 ld ra,40(sp) 7d0: 7402 ld s0,32(sp) 7d2: 6145 addi sp,sp,48 7d4: 8082 ret
- gcc 会把 `omp parallel` 对应的 block `{int id=….;printf(…)}` 编译成一个独立的函数 `main._omp_fn.0`
- `omp parallel` 对应 `GOMP_parallel` 函数, 它的实现在 libgomp 中, 并且调用时传递了 `2` 作为参数, 即需要两个线程的并行
1.3.1.1. GOMP_parallel
void GOMP_parallel( void (*fn)(void *), void *data, unsigned num_threads, unsigned int flags) { num_threads = gomp_resolve_num_threads(num_threads, 0); gomp_team_start( fn, data, num_threads, flags, gomp_new_team(num_threads), NULL); /* NOTE: fn 即 main._omp_fn.0, 它在主线程会直接执行一次, 所以即使指定 * num_threads 为 0, 也会执行一次 */ fn(data); ialias_call(GOMP_parallel_end)(); } gomp_team_start: /* NOTE: gomp_team_start 的 num_threads 是从 1 开始计数的, 因为主线程已经执行了一次了 */ if (nthreads == 1) return; i = 1; for (; i < nthreads; ++i): start_data->fn = fn; start_data->fn_data = data; // ... pthread_create (&start_data->handle, attr, gomp_thread_start, start_data); gomp_thread_start: // ... local_fn = data->fn; local_data = data->fn_data; local_fn (local_data); // ...
1.3.2. sync
1.3.2.1. atomic
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel num_threads(2) { #pragma omp atomic sum += 1; } printf("%d\n", sum); }
$> riscv64-linux-gnu-gcc test.c -O0 -g -fopenmp -c -fno-stack-protector $> riscv64-linux-gnu-objdump -d -r ./test.o 000000000000005a <main._omp_fn.0>: 5a: 1101 addi sp,sp,-32 5c: ec22 sd s0,24(sp) 5e: 1000 addi s0,sp,32 60: fea43423 sd a0,-24(s0) 64: fe843783 ld a5,-24(s0) 68: 639c ld a5,0(a5) 6a: 4705 li a4,1 6c: 00e7a02f amoadd.w zero,a4,(a5) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 70: 6462 ld s0,24(sp) 72: 6105 addi sp,sp,32 74: 8082 ret
`omp atomic` 在 riscv 上实际对应的是 AMO 指令
1.3.2.2. critical
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel num_threads(2) { #pragma omp critical sum += 1; } printf("%d\n", sum); }
0000000000000062 <main._omp_fn.0>: 62: 1101 addi sp,sp,-32 64: ec06 sd ra,24(sp) 66: e822 sd s0,16(sp) 68: 1000 addi s0,sp,32 6a: fea43423 sd a0,-24(s0) 6e: 00000097 auipc ra,0x0 6e: R_RISCV_CALL_PLT GOMP_critical_start 6e: R_RISCV_RELAX *ABS* 72: 000080e7 jalr ra # 6e <main._omp_fn.0+0xc> 76: fe843783 ld a5,-24(s0) 7a: 439c lw a5,0(a5) 7c: 2785 addiw a5,a5,1 7e: 0007871b sext.w a4,a5 82: fe843783 ld a5,-24(s0) 86: c398 sw a4,0(a5) 88: 00000097 auipc ra,0x0 88: R_RISCV_CALL_PLT GOMP_critical_end 88: R_RISCV_RELAX *ABS* 8c: 000080e7 jalr ra # 88 <main._omp_fn.0+0x26> 90: 60e2 ld ra,24(sp) 92: 6442 ld s0,16(sp) 94: 6105 addi sp,sp,32 96: 8082 ret
`omp critical` 对应 `GOMP_critical_start` 和 `GOMP_critical_end`, 前者对应 posix 的 `pthread_mutex_lock` 或者 linux 的 `futex_wait`, 后者对应 `pthread_mutex_unlock` 或者 `futex_wake`
1.3.2.3. barrier
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { printf("%d\n", omp_get_thread_num()); #pragma omp barrier printf("%d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>: 0: 1141 addi sp,sp,-16 2: e406 sd ra,8(sp) 4: e022 sd s0,0(sp) 6: 00000097 auipc ra,0x0 6: R_RISCV_CALL_PLT omp_get_thread_num 6: R_RISCV_RELAX *ABS* a: 000080e7 jalr ra # 6 <main._omp_fn.0+0x6> e: 862a mv a2,a0 10: 842a mv s0,a0 12: 00000597 auipc a1,0x0 12: R_RISCV_PCREL_HI20 .LC0 12: R_RISCV_RELAX *ABS* 16: 00058593 mv a1,a1 16: R_RISCV_PCREL_LO12_I .L0 16: R_RISCV_RELAX *ABS* 1a: 4505 li a0,1 1c: 00000097 auipc ra,0x0 1c: R_RISCV_CALL_PLT __printf_chk 1c: R_RISCV_RELAX *ABS* 20: 000080e7 jalr ra # 1c <main._omp_fn.0+0x1c> 24: 00000097 auipc ra,0x0 24: R_RISCV_CALL_PLT GOMP_barrier ~~~~~~~~~~~~ 24: R_RISCV_RELAX *ABS* 28: 000080e7 jalr ra # 24 <main._omp_fn.0+0x24> 2c: 8622 mv a2,s0 2e: 6402 ld s0,0(sp) 30: 60a2 ld ra,8(sp) 32: 00000597 auipc a1,0x0 32: R_RISCV_PCREL_HI20 .LC0 32: R_RISCV_RELAX *ABS* 36: 00058593 mv a1,a1 36: R_RISCV_PCREL_LO12_I .L0 36: R_RISCV_RELAX *ABS* 3a: 4505 li a0,1 3c: 0141 addi sp,sp,16 3e: 00000317 auipc t1,0x0 3e: R_RISCV_CALL_PLT __printf_chk 3e: R_RISCV_RELAX *ABS* 42: 00030067 jr t1 # 3e <main._omp_fn.0+0x3e>
1.3.2.3.1. GOMP_barrier
GOMP_barrier 需要保证:
- 插入一个 memory fence
- 已经执行到 barrier 的线程需要等待其它尚未执行到 barrier 的线程
void GOMP_barrier(): gomp_team_barrier_wait(&team->barrier); gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); gomp_barrier_state_t gomp_barrier_wait_start (gomp_barrier_t *bar): /* NOTE: __atomic_xx 后的 MEMMODEL 相当于 memory fence */ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ret &= -BAR_INCR | BAR_CANCELLED; /* NOTE: 当 awaited 减小到 0 时, 意味着最后一个线程到达 barrier, 这个带有 * BAR_WAS_LAST 标记的线程会负责唤醒前面等待的线程 */ if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0): ret |= BAR_WAS_LAST; return ret; void gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state): unsigned int generation, gen; /* NOTE: if (__builtin_expect (x, 0)) {xxx} 相当于 if (x) {xxx}, 其中的 0 只是给 * compiler 的 hint, 告诉它 x 为 0 的概率较大, 可以让 compiler 产生有利于 branch * prediction 的代码 */ if (__builtin_expect (state & BAR_WAS_LAST, 0)): bar->awaited = bar->total; team->work_share_cancelled = 0; __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); /* NOTE: 如果当前线程是 BAR_WAS_LAST, 则它会 wake 其它线程 */ futex_wake ((int *) &bar->generation, INT_MAX); return; generation = state; state &= ~BAR_CANCELLED; do: /* NOTE: 非 BAR_WAS_LAST 的线程会等待 BAR_WAS_LAST 线程的 wake */ do_wait ((int *) &bar->generation, generation); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); if (__builtin_expect (gen & BAR_TASK_PENDING, 0)): gomp_barrier_handle_tasks (state); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); generation |= gen & BAR_WAITING_FOR_TASK; while (gen != state + BAR_INCR);
1.3.2.4. single
single 要求只有一个线程能执行相应代码, 但并没有要求是哪一个线程
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp single { printf("single %d\n", omp_get_thread_num()); } printf("single %d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>: 0: 1101 addi sp,sp,-32 2: ec06 sd ra,24(sp) 4: e822 sd s0,16(sp) 6: e426 sd s1,8(sp) 8: 00000097 auipc ra,0x0 8: R_RISCV_CALL_PLT GOMP_single_start ~~~~~~~~~~~~~~~~~ 8: R_RISCV_RELAX *ABS* c: 000080e7 jalr ra # 8 <main._omp_fn.0+0x8> 10: 84aa mv s1,a0 12: 00000097 auipc ra,0x0 12: R_RISCV_CALL_PLT omp_get_thread_num 12: R_RISCV_RELAX *ABS* 16: 000080e7 jalr ra # 12 <main._omp_fn.0+0x12> 1a: 842a mv s0,a0 1c: e09d bnez s1,42 <.L2> 1c: R_RISCV_RVC_BRANCH .L2 000000000000001e <.L3>: 1e: 00000097 auipc ra,0x0 1e: R_RISCV_CALL_PLT GOMP_barrier ~~~~~~~~~~~~ single 后有一个隐式的 barrier 1e: R_RISCV_RELAX *ABS* 22: 000080e7 jalr ra # 1e <.L3> 26: 8622 mv a2,s0 28: 6442 ld s0,16(sp) 2a: 60e2 ld ra,24(sp) 2c: 64a2 ld s1,8(sp) 2e: 00000597 auipc a1,0x0 2e: R_RISCV_PCREL_HI20 .LC0 2e: R_RISCV_RELAX *ABS* 32: 00058593 mv a1,a1 32: R_RISCV_PCREL_LO12_I .L0 32: R_RISCV_RELAX *ABS* 36: 4505 li a0,1 38: 6105 addi sp,sp,32 3a: 00000317 auipc t1,0x0 3a: R_RISCV_CALL_PLT __printf_chk 3a: R_RISCV_RELAX *ABS* 3e: 00030067 jr t1 # 3a <.L3+0x1c> 0000000000000042 <.L2>: 42: 862a mv a2,a0 44: 00000597 auipc a1,0x0 44: R_RISCV_PCREL_HI20 .LC0 44: R_RISCV_RELAX *ABS* 48: 00058593 mv a1,a1 48: R_RISCV_PCREL_LO12_I .L0 48: R_RISCV_RELAX *ABS* 4c: 4505 li a0,1 4e: 00000097 auipc ra,0x0 4e: R_RISCV_CALL_PLT __printf_chk 4e: R_RISCV_RELAX *ABS* 52: 000080e7 jalr ra # 4e <.L2+0xc> 56: b7e1 j 1e <.L3> 56: R_RISCV_RVC_JUMP .L3
1.3.2.4.1. GOMP_single_start
bool GOMP_single_start(void): struct gomp_thread *thr = gomp_thread(); struct gomp_team *team = thr->ts.team; unsigned long single_count; if (__builtin_expect(team == NULL, 0)) return true; single_count = thr->ts.single_count++; /* NOTE: __sync_bool_compare_and_swap (a,b,c) 的操作是: 比较 *a 与 b, 若相等, 则 * 把 c 赋给 *a, 并返回 true, 且整个过程是原子操作. * * N 个线程竞争, 只有一个会胜出. 但不论是否胜出, 所有线程的 single_count 以及 * team 的 single_count 都会加 1, 以便能共同竞争下一个 single, 所以 single 不 * 会固定给某一个线程执行 */ return __sync_bool_compare_and_swap( &team->single_count, single_count, single_count + 1L);
1.3.2.5. master
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp master { printf("%d\n", omp_get_thread_num()); } printf("%d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>: 0: 1141 addi sp,sp,-16 2: e022 sd s0,0(sp) 4: e406 sd ra,8(sp) 6: 00000097 auipc ra,0x0 6: R_RISCV_CALL_PLT omp_get_thread_num 6: R_RISCV_RELAX *ABS* a: 000080e7 jalr ra # 6 <main._omp_fn.0+0x6> e: 842a mv s0,a0 10: cd11 beqz a0,2c <.L2> ~~~~~~~~~~~~~~~~~~~~ 由于 master 只能由主线程执行, 所以这里简单判断 omp_get_thread_num 是否 为 0 即可, 不需要像 single 那样有复杂的同步操作 10: R_RISCV_RVC_BRANCH .L2 12: 8622 mv a2,s0 14: 6402 ld s0,0(sp) 16: 60a2 ld ra,8(sp) 18: 00000597 auipc a1,0x0 18: R_RISCV_PCREL_HI20 .LC0 18: R_RISCV_RELAX *ABS* 1c: 00058593 mv a1,a1 1c: R_RISCV_PCREL_LO12_I .L0 1c: R_RISCV_RELAX *ABS* 20: 4505 li a0,1 22: 0141 addi sp,sp,16 24: 00000317 auipc t1,0x0 24: R_RISCV_CALL_PLT __printf_chk 24: R_RISCV_RELAX *ABS* 28: 00030067 jr t1 # 24 <main._omp_fn.0+0x24> 000000000000002c <.L2>: 2c: 4601 li a2,0 2e: 00000597 auipc a1,0x0 2e: R_RISCV_PCREL_HI20 .LC0 2e: R_RISCV_RELAX *ABS* 32: 00058593 mv a1,a1 32: R_RISCV_PCREL_LO12_I .L0 32: R_RISCV_RELAX *ABS* 36: 4505 li a0,1 38: 00000097 auipc ra,0x0 38: R_RISCV_CALL_PLT __printf_chk 38: R_RISCV_RELAX *ABS* 3c: 000080e7 jalr ra # 38 <.L2+0xc> 40: 8622 mv a2,s0 42: 6402 ld s0,0(sp) 44: 60a2 ld ra,8(sp) 46: 00000597 auipc a1,0x0 46: R_RISCV_PCREL_HI20 .LC0 46: R_RISCV_RELAX *ABS* 4a: 00058593 mv a1,a1 4a: R_RISCV_PCREL_LO12_I .L0 4a: R_RISCV_RELAX *ABS* 4e: 4505 li a0,1 50: 0141 addi sp,sp,16 52: 00000317 auipc t1,0x0 52: R_RISCV_CALL_PLT __printf_chk 52: R_RISCV_RELAX *ABS* 56: 00030067 jr t1 # 52 <.L2+0x26>
1.3.3. for
1.3.3.1. static schedule
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for num_threads(2) for (int i = 0; i < 10; i++) { printf("%d\n", i); } printf("%d\n", sum); }
默认情况下 `omp for` 使用 `schedule (static, N/num_threads)`, work sharing 是静态的, 由编译器直接完成
$> riscv64-linux-gnu-gcc test.c -O2 -fopenmp -c -fno-stack-protector $> riscv64-linux-gnu-objdump -d -r ./test.o 0000000000000000 <main._omp_fn.0>: 0: 1101 addi sp,sp,-32 2: ec06 sd ra,24(sp) 4: e822 sd s0,16(sp) 6: e426 sd s1,8(sp) 8: e04a sd s2,0(sp) a: 00000097 auipc ra,0x0 a: R_RISCV_CALL omp_get_num_threads a: R_RISCV_RELAX *ABS* e: 000080e7 jalr ra # a <main._omp_fn.0+0xa> 12: 842a mv s0,a0 14: 00000097 auipc ra,0x0 14: R_RISCV_CALL omp_get_thread_num 14: R_RISCV_RELAX *ABS* 18: 000080e7 jalr ra # 14 <main._omp_fn.0+0x14> 1c: 47a9 li a5,10 1e: 0287e73b remw a4,a5,s0 22: 0287c6bb divw a3,a5,s0 ~~~~~~~~~~~~~~~~ a3=10/(omp_get_num_threads()), a3 表示 schedcule(static, step) 中的 step 26: 02e54c63 blt a0,a4,5e <.L2> ~~~~~~~~~~~~~~~~~~ 如果 step 不能整除, 会把多余的部分平摊到前面的 thread, 例如 10%4=2, 则 4 线程分到的任务为 3,3,2,2 26: R_RISCV_BRANCH .L2 000000000000002a <.L5>: 2a: 02a684bb mulw s1,a3,a0 ~~~~~~~~~~~~~~~~ a0 是 omp_get_thread_num(), s1 表示 begin=omp_get_thread_num()*step 2e: 00e4843b addw s0,s1,a4 32: 00d404bb addw s1,s0,a3 ~~~~~~~~~~~~~~~~ s0 是 begin, s1 是 begin+step, 即 end, 当前 thread 处理的范围是 [s0, s1] 36: 00945e63 bge s0,s1,52 <.L1> 36: R_RISCV_BRANCH .L1 3a: 00000937 lui s2,0x0 3a: R_RISCV_HI20 .LC0 3a: R_RISCV_RELAX *ABS* 000000000000003e <.L4>: 3e: 85a2 mv a1,s0 40: 00090513 mv a0,s2 40: R_RISCV_LO12_I .LC0 40: R_RISCV_RELAX *ABS* 44: 2405 addiw s0,s0,1 46: 00000097 auipc ra,0x0 46: R_RISCV_CALL printf 46: R_RISCV_RELAX *ABS* 4a: 000080e7 jalr ra # 46 <.L4+0x8> 4e: fe8498e3 bne s1,s0,3e <.L4> 4e: R_RISCV_BRANCH .L4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .L4 对应的代码是 for (i=start; i<end; i++) {printf("%d\n",i);} 0000000000000052 <.L1>: 52: 60e2 ld ra,24(sp) 54: 6442 ld s0,16(sp) 56: 64a2 ld s1,8(sp) 58: 6902 ld s2,0(sp) 5a: 6105 addi sp,sp,32 5c: 8082 ret 000000000000005e <.L2>: 5e: 2685 addiw a3,a3,1 60: 4701 li a4,0 62: b7e1 j 2a <.L5> 62: R_RISCV_RVC_JUMP .L5
1.3.3.2. dynamic schedule
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for num_threads(2) schedule(dynamic, 1) for (int i = 0; i < 10; i++) { printf("%d\n", i); } }
0000000000000000 <main>: 0: 00000537 lui a0,0x0 0: R_RISCV_HI20 main._omp_fn.0 0: R_RISCV_RELAX *ABS* 4: 1141 addi sp,sp,-16 6: 4881 li a7,0 8: 4805 li a6,1 a: 4785 li a5,1 c: 4729 li a4,10 e: 4681 li a3,0 10: 4609 li a2,2 12: 4581 li a1,0 14: 00050513 mv a0,a0 14: R_RISCV_LO12_I main._omp_fn.0 14: R_RISCV_RELAX *ABS* 18: e406 sd ra,8(sp) 1a: 00000097 auipc ra,0x0 1a: R_RISCV_CALL GOMP_parallel_loop_nonmonotonic_dynamic ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 设置 dynmic schedule 的参数, 例如 chunk size (1) 1a: R_RISCV_RELAX *ABS* 1e: 000080e7 jalr ra # 1a <main+0x1a> 0000000000000000 <main._omp_fn.0>: 0: 7179 addi sp,sp,-48 2: 002c addi a1,sp,8 4: 850a mv a0,sp 6: f406 sd ra,40(sp) 8: f022 sd s0,32(sp) a: ec26 sd s1,24(sp) c: e84a sd s2,16(sp) e: 00000097 auipc ra,0x0 e: R_RISCV_CALL GOMP_loop_nonmonotonic_dynamic_next e: R_RISCV_RELAX *ABS* 12: 000080e7 jalr ra # e <main._omp_fn.0+0xe> 16: c515 beqz a0,42 <.L2> 16: R_RISCV_RVC_BRANCH .L2 18: 00000937 lui s2,0x0 18: R_RISCV_HI20 .LC0 18: R_RISCV_RELAX *ABS* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .L4, .L3 的代码相当于: while (GOMP_loop_nonmonotonic_dynamic_next(*start,*end)) { for (i=start;i<end;i++) { printf("%d\n",i); } } 000000000000001c <.L4>: 1c: 4402 lw s0,0(sp) 1e: 44a2 lw s1,8(sp) ~~~~~~~~~~~~ s0 是 start, s1 是 end, 由于 GOMP_loop_nonmonotonic_dynamic_next 的参数是 (*start,*end) 所以它使用栈来传递参数 0000000000000020 <.L3>: 20: 85a2 mv a1,s0 22: 00090513 mv a0,s2 22: R_RISCV_LO12_I .LC0 22: R_RISCV_RELAX *ABS* 26: 2405 addiw s0,s0,1 28: 00000097 auipc ra,0x0 28: R_RISCV_CALL printf 28: R_RISCV_RELAX *ABS* 2c: 000080e7 jalr ra # 28 <.L3+0x8> 30: fe9448e3 blt s0,s1,20 <.L3> ~~~~~~~~~~~~~~~~~~~ .L3 对应内层的 for (i=start;i<end;i++) {xxx} 循环 30: R_RISCV_BRANCH .L3 34: 002c addi a1,sp,8 ~~~~~~~~~~~~~~~ end 的地址 36: 850a mv a0,sp ~~~~~~~~~ start 的地址 38: 00000097 auipc ra,0x0 38: R_RISCV_CALL GOMP_loop_nonmonotonic_dynamic_next 38: R_RISCV_RELAX *ABS* 3c: 000080e7 jalr ra # 38 <.L3+0x18> 40: fd71 bnez a0,1c <.L4> 40: R_RISCV_RVC_BRANCH .L4 
~~~~~ .L4 对应外层的 while (GOMP_loop_nonmonotonic_dynamic_next(*start,*end)) 循环
可见 dynamic schedule 有额外的运行时开销
1.3.3.2.1. GOMP_loop_nonmonotonic_dynamic_next
bool GOMP_loop_nonmonotonic_dynamic_next(long *istart, long *iend) return gomp_loop_dynamic_next (istart, iend); return gomp_iter_dynamic_next (istart, iend); long tmp = __sync_fetch_and_add (&ws->next, chunk); if (tmp >= end) return false; nend = tmp + chunk; if (nend > end) nend = end; *pstart = tmp; *pend = nend; return true;
1.3.3.3. reduction
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for reduction(+ : sum) num_threads(2) for (int i = 0; i < 10; i++) { sum += i; } printf("%d\n", sum); return 0; }
0000000000000000 <main._omp_fn.0>: 0: 1101 addi sp,sp,-32 2: ec06 sd ra,24(sp) 4: e822 sd s0,16(sp) 6: e426 sd s1,8(sp) 8: 842a mv s0,a0 a: 00000097 auipc ra,0x0 a: R_RISCV_CALL_PLT omp_get_num_threads a: R_RISCV_RELAX *ABS* e: 000080e7 jalr ra # a <main._omp_fn.0+0xa> 12: 84aa mv s1,a0 14: 00000097 auipc ra,0x0 14: R_RISCV_CALL_PLT omp_get_thread_num 14: R_RISCV_RELAX *ABS* 18: 000080e7 jalr ra # 14 <main._omp_fn.0+0x14> 1c: 46a9 li a3,10 1e: 0296e63b remw a2,a3,s1 22: 87aa mv a5,a0 24: 0296c5bb divw a1,a3,s1 28: 02c54e63 blt a0,a2,64 <.L2> 28: R_RISCV_BRANCH .L2 000000000000002c <.L5>: 2c: 4681 li a3,0 2e: 02f5873b mulw a4,a1,a5 32: 00c707bb addw a5,a4,a2 36: 00b7873b addw a4,a5,a1 ~~~~~~~~~~~~~~~~ 当前线程要处理的范围是 [a5, a4] 3a: 00e7c963 blt a5,a4,4c <.L4> 3a: R_RISCV_BRANCH .L4 3e: 00d4202f amoadd.w zero,a3,(s0) ~~~~~~~~~~~~~~~~~~~~~~~~ 线程处理完以后把 a3 (当前线程的 partial sum 累加到 (s0) 42: 60e2 ld ra,24(sp) 44: 6442 ld s0,16(sp) 46: 64a2 ld s1,8(sp) 48: 6105 addi sp,sp,32 4a: 8082 ret 000000000000004c <.L4>: 4c: 9ebd addw a3,a3,a5 4e: 2785 addiw a5,a5,1 ~~~~~~~~~~~~~~~~ a3 是 partial sum, 不断的累加 a5 (i) 50: fef71ee3 bne a4,a5,4c <.L4> 50: R_RISCV_BRANCH .L4 54: 2681 sext.w a3,a3 56: 00d4202f amoadd.w zero,a3,(s0) 5a: 60e2 ld ra,24(sp) 5c: 6442 ld s0,16(sp) 5e: 64a2 ld s1,8(sp) 60: 6105 addi sp,sp,32 62: 8082 ret 0000000000000064 <.L2>: 64: 2585 addiw a1,a1,1 66: 4601 li a2,0 68: b7d1 j 2c <.L5> 68: R_RISCV_RVC_JUMP .L5
reduction 的代码是由编译器直接生成的, 不需要调用 libgomp. 生成的代码和手写的不用 reduction 的代码 (例如 hello_openmp 中的 pi_parallel_for) 基本相同
1.3.4. task
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp single { #pragma omp task printf("%d\n", omp_get_thread_num()); #pragma omp task printf("%d\n", omp_get_thread_num()); } } return 0; }
0000000000000014 <.L6>: 14: 4881 li a7,0 16: 4801 li a6,0 18: 4785 li a5,1 1a: 4705 li a4,1 1c: 4681 li a3,0 1e: 4601 li a2,0 20: 4581 li a1,0 22: 00000517 auipc a0,0x0 22: R_RISCV_PCREL_HI20 main._omp_fn.1 22: R_RISCV_RELAX *ABS* 26: 00050513 mv a0,a0 26: R_RISCV_PCREL_LO12_I .L0 26: R_RISCV_RELAX *ABS* 2a: e002 sd zero,0(sp) 2c: 00000097 auipc ra,0x0 2c: R_RISCV_CALL_PLT GOMP_task ~~~~~~~~~ GOMP_task(main._omp_fn.1) 2c: R_RISCV_RELAX *ABS* 30: 000080e7 jalr ra # 2c <.L6+0x18> 34: 4881 li a7,0 36: e002 sd zero,0(sp) 38: 4801 li a6,0 3a: 4785 li a5,1 3c: 4705 li a4,1 3e: 4681 li a3,0 40: 4601 li a2,0 42: 4581 li a1,0 44: 00000517 auipc a0,0x0 44: R_RISCV_PCREL_HI20 main._omp_fn.2 44: R_RISCV_RELAX *ABS* 48: 00050513 mv a0,a0 48: R_RISCV_PCREL_LO12_I .L0 48: R_RISCV_RELAX *ABS* 4c: 00000097 auipc ra,0x0 4c: R_RISCV_CALL_PLT GOMP_task ~~~~~~~~~ GOMP_task(main._omp_fn.1) 4c: R_RISCV_RELAX *ABS* 50: 000080e7 jalr ra # 4c <.L6+0x38> 54: 60e2 ld ra,24(sp) 56: 6105 addi sp,sp,32 58: 8082 ret 000000000000005a <main._omp_fn.1>: 5a: 1141 addi sp,sp,-16 5c: e406 sd ra,8(sp) 5e: 00000097 auipc ra,0x0 5e: R_RISCV_CALL_PLT omp_get_thread_num 5e: R_RISCV_RELAX *ABS* 62: 000080e7 jalr ra # 5e <main._omp_fn.1+0x4> 66: 60a2 ld ra,8(sp) 68: 862a mv a2,a0 6a: 00000597 auipc a1,0x0 6a: R_RISCV_PCREL_HI20 .LC0 6a: R_RISCV_RELAX *ABS* 6e: 00058593 mv a1,a1 6e: R_RISCV_PCREL_LO12_I .L0 6e: R_RISCV_RELAX *ABS* 72: 4505 li a0,1 74: 0141 addi sp,sp,16 76: 00000317 auipc t1,0x0 76: R_RISCV_CALL_PLT __printf_chk 76: R_RISCV_RELAX *ABS* 7a: 00030067 jr t1 # 76 <main._omp_fn.1+0x1c> 000000000000007e <main._omp_fn.2>: 7e: 00000317 auipc t1,0x0 7e: R_RISCV_CALL main._omp_fn.1 7e: R_RISCV_RELAX *ABS* 82: 00030067 jr t1 # 7e <main._omp_fn.2>
每个 task 对应一个单独的 main._omp_fn.x, 通过 GOMP_task 放在某个线程里执行
1.3.4.1. GOMP_task
void GOMP_task( void (*fn)(void *), void *data, void (*cpyfn)(void *, void *), long arg_size, long arg_align, bool if_clause, unsigned flags, void **depend, int priority_arg, void *detach): priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority, PRIORITY_INSERT_END, /*adjust_parent_depends_on=*/false, task->parent_depends_on); gomp_team_barrier_set_task_pending (&team->barrier); do_wake = team->task_running_count + !parent->in_tied_task < team->nthreads; if (do_wake) gomp_team_barrier_wake (&team->barrier, 1);
1.3.5. target
#include <omp.h> #include <stdio.h> int main() { int x = 1; #pragma omp target map(tofrom : x) for (int i = 0; i < 10; i++) { x += i; } printf("x = %d\n", x); return 0; }
openmp 从 4.0 开始支持 offload 到不同的 target, 例如 nvptx 和 amd gcn.
offload 过程与 DPC++, TVM BYOC Codegen 以及 ComputeCpp 类似, 以 nvptx 为例, 主要步骤是:
- gcc 编译出 block 对应的 ptx 汇编
- 使用 mkoffload 生成一个头文件, 它会包含 ptx 汇编, 用于注册的初始化函数以及一些 mapping 信息, 确保运行时 gomp 可以找到对应的 ptx 并调用 cuda 去编译和执行它. 这个头文件与 ComputeCpp 生成的 sycl 文件类似
1.3.5.1. GOMP_offload_register_ver
GOMP_offload_register_ver 负责向 runtime 注册 offload 相关的信息
通过 `--save-temps` 可以看到这个生成的 offload 头文件:
$> gcc test.c -O0 -fopenmp -fcf-protection=none -foffload=-misa=sm_35 -fno-stack-protector --save-temps $> cat ccuL4e4U.i static const char ptx_code_0[] = "// BEGIN PREAMBLE\n" ".version 3.1\n" ".target sm_35\n" ".address_size 64\n" "// BEGIN FUNCTION DECL: main$_omp_fn$0$impl\n" ".func main$_omp_fn$0$impl (.param .u64 %in_ar0);\n" "// BEGIN GLOBAL FUNCTION DECL: gomp_nvptx_main\n" ".extern .func gomp_nvptx_main (.param .u64 %in_ar1, .param .u64 %in_ar2);\n" "// BEGIN GLOBAL VAR DECL: __nvptx_stacks\n" ".extern .shared .u64 __nvptx_stacks[32];\n" "// BEGIN GLOBAL VAR DECL: __nvptx_uni\n" ".extern .shared .u32 __nvptx_uni[32];\n" "// END PREAMBLE\n" ".visible .entry main$_omp_fn$0 (.param .u64 %arg, .param .u64 %stack, .param .u64 %sz)\n" "{\n" ".reg .u32 %r<3>;\n" ".reg .u64 %R<4>;\n" /* ... */ "mov.u32 %r0,%tid.y;\n" /* ... */ static const char ptx_code_19[] = "" /* ... */ static const struct ptx_obj { const char *code; long unsigned int size; } ptx_objs[] = { {ptx_code_0, sizeof (ptx_code_0)}, {ptx_code_1, sizeof (ptx_code_1)}, {ptx_code_2, sizeof (ptx_code_2)}, {ptx_code_3, sizeof (ptx_code_3)}, {ptx_code_4, sizeof (ptx_code_4)}, {ptx_code_5, sizeof (ptx_code_5)}, {ptx_code_6, sizeof (ptx_code_6)}, {ptx_code_7, sizeof (ptx_code_7)}, {ptx_code_8, sizeof (ptx_code_8)}, {ptx_code_9, sizeof (ptx_code_9)}, {ptx_code_10, sizeof (ptx_code_10)}, {ptx_code_11, sizeof (ptx_code_11)}, {ptx_code_12, sizeof (ptx_code_12)}, {ptx_code_13, sizeof (ptx_code_13)}, {ptx_code_14, sizeof (ptx_code_14)}, {ptx_code_15, sizeof (ptx_code_15)}, {ptx_code_16, sizeof (ptx_code_16)}, {ptx_code_17, sizeof (ptx_code_17)}, {ptx_code_18, sizeof (ptx_code_18)}, {ptx_code_19, sizeof (ptx_code_19)} }; static const char *const var_mappings[] = { }; static const struct nvptx_fn { const char *name; unsigned short dim[3]; } func_mappings[] = { {"main$_omp_fn$0"} }; static const struct nvptx_tdata { const struct ptx_obj *ptx_objs; unsigned ptx_num; const char *const *var_names; unsigned var_num; const 
struct nvptx_fn *fn_names; unsigned fn_num; } target_data = { ptx_objs, sizeof (ptx_objs) / sizeof (ptx_objs[0]), var_mappings, sizeof (var_mappings) / sizeof (var_mappings[0]), func_mappings, sizeof (func_mappings) / sizeof (func_mappings[0]) }; extern void GOMP_offload_register_ver (unsigned, const void *, int, const void *); extern void GOMP_offload_unregister_ver (unsigned, const void *, int, const void *); extern const void *const __OFFLOAD_TABLE__[]; static __attribute__((constructor)) void init (void) { GOMP_offload_register_ver (0x10001, __OFFLOAD_TABLE__, 5 , &target_data); }; static __attribute__((destructor)) void fini (void) { GOMP_offload_unregister_ver (0x10001, __OFFLOAD_TABLE__, 5 , &target_data); };
通过注册的 target_data 中的 func_mappings 和 ptx_objs, 运行时可以找到函数 (例如 main._omp_fn.0) 对应的 ptx code (例如 ptx_code_0)
1.3.5.2. GOMP_target_ext
0000000000001169 <main>: # ... 11a7: 6a 00 pushq $0x0 11a9: 4c 8d 0d 68 ee 06 00 lea 0x6ee68(%rip),%r9 # 70018 <.omp_data_kinds.6.2657> 11b0: 4c 8d 05 59 ee 06 00 lea 0x6ee59(%rip),%r8 # 70010 <.omp_data_sizes.5.2656> 11b7: 48 89 c1 mov %rax,%rcx 11ba: ba 01 00 00 00 mov $0x1,%edx 11bf: 48 8d 35 2b 00 00 00 lea 0x2b(%rip),%rsi # 11f1 <main._omp_fn.0> ~~~~~~~~~~~~~~~~~~~~~~~ 11c6: bf ff ff ff ff mov $0xffffffff,%edi 11cb: e8 60 fe ff ff callq 1030 <GOMP_target_ext@plt> ~~~~~~~~~~~~~~~~~~~~ GOMP_target_ext 会负责调用 ptx runtime 相关代码去找 main._omp_fn.0 对应的 ptx code 并执行, 但如果没有成功, 可以 fallback 到 cpu 代码 所以下面有 main._omp_fn.0 完整的 cpu 实现 11d0: 48 83 c4 20 add $0x20,%rsp 11d4: 8b 45 d4 mov -0x2c(%rbp),%eax 11d7: 89 c6 mov %eax,%esi 11d9: 48 8d 3d 24 0e 00 00 lea 0xe24(%rip),%rdi # 2004 <_IO_stdin_used+0x4> 11e0: b8 00 00 00 00 mov $0x0,%eax 11e5: e8 66 fe ff ff callq 1050 <printf@plt> 11ea: b8 00 00 00 00 mov $0x0,%eax 11ef: c9 leaveq 11f0: c3 retq 00000000000011f1 <main._omp_fn.0>: 11f1: 55 push %rbp 11f2: 48 89 e5 mov %rsp,%rbp 11f5: 48 89 7d e8 mov %rdi,-0x18(%rbp) 11f9: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp) 1200: 83 7d fc 09 cmpl $0x9,-0x4(%rbp) ~~~~~~~~~~~~~~~~~~~~~~ 循环 10 次 1204: 7f 1d jg 1223 <main._omp_fn.0+0x32> 1206: 48 8b 45 e8 mov -0x18(%rbp),%rax 120a: 48 8b 00 mov (%rax),%rax 120d: 8b 10 mov (%rax),%edx 120f: 8b 45 fc mov -0x4(%rbp),%eax 1212: 01 c2 add %eax,%edx 1214: 48 8b 45 e8 mov -0x18(%rbp),%rax 1218: 48 8b 00 mov (%rax),%rax 121b: 89 10 mov %edx,(%rax) 121d: 83 45 fc 01 addl $0x1,-0x4(%rbp) 1221: eb dd jmp 1200 <main._omp_fn.0+0xf> 1223: 5d pop %rbp 1224: c3 retq 1225: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) 122c: 00 00 00 122f: 90 nop
1.3.6. simd
openmp 支持通过 `#pragma omp simd` 给编译器一个提示, 以便对循环进行向量化.
1.4. Compiler Directive
gcc 对 openmp 的支持主要在两个 tree pass:
- omplower
- ompexp
1.4.1. Example
以下面的代码为例:
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel for num_threads(2) for (int i = 0; i < 10; i++) { printf("%d\n", omp_get_thread_num()); } return 0; }
$> riscv64-linux-gnu-gcc test.c -O2 -fopenmp -c -fno-stack-protector -fdump-tree-all
test.c.007t.omplower:
;; Function main (main, funcdef_no=23, decl_uid=2425, cgraph_uid=24, symbol_order=23) Introduced new external node (main._omp_fn.0/24). main (int argc, char * * argv) { int D.2430; { { #pragma omp parallel num_threads(2) [child fn: main._omp_fn.0 (???)] { int i; { #pragma omp for nowait for (i = 0; i < 10; i = i + 1) D.2429 = omp_get_thread_num (); printf ("%d\n", D.2429); #pragma omp continue (i, i) #pragma omp return(nowait) } } #pragma omp return } D.2430 = 0; return D.2430; } D.2430 = 0; return D.2430; }
test.c.013t.ompexp:
;; Function main._omp_fn.0 (main._omp_fn.0, funcdef_no=24, decl_uid=2432, cgraph_uid=25, symbol_order=24) main._omp_fn.0 (void * .omp_data_i) { int D.2451; int i; int D.2449; int D.2448; int D.2447; int tt.2; int q.1; int D.2444; int D.2443; <bb 11> : <bb 3> : D.2443 = __builtin_omp_get_num_threads (); D.2444 = __builtin_omp_get_thread_num (); q.1 = 10 / D.2443; tt.2 = 10 % D.2443; if (D.2444 < tt.2) goto <bb 9>; [25.00%] else goto <bb 10>; [75.00%] <bb 10> : D.2447 = q.1 * D.2444; D.2448 = D.2447 + tt.2; D.2449 = D.2448 + q.1; if (D.2448 >= D.2449) goto <bb 5>; [INV] else goto <bb 8>; [INV] <bb 8> : i = D.2448; <bb 4> : D.2451 = __builtin_omp_get_thread_num (); printf ("%d\n", D.2451); i = i + 1; if (i < D.2449) goto <bb 4>; [INV] else goto <bb 5>; [INV] <bb 5> : <bb 6> : return; <bb 9> : tt.2 = 0; q.1 = q.1 + 1; goto <bb 10>; [100.00%] } ;; Function main (main, funcdef_no=23, decl_uid=2425, cgraph_uid=24, symbol_order=23) Merging blocks 2 and 12 Merging blocks 2 and 7 main (int argc, char * * argv) { int D.2442; int D.2440; int D.2441; int tt.2; int q.1; int D.2437; int D.2436; int i; int D.2430; <bb 2> : __builtin_GOMP_parallel (main._omp_fn.0, 0B, 2, 0); D.2430 = 0; return D.2430; }
1.4.2. omplower
TBD
1.4.3. ompexp
TBD
1.4.4. omp-builtins
生成代码中对 libgomp 函数的调用是以 builtin 的形式, 例如 `__builtin_GOMP_parallel (main._omp_fn.0, 0B, 2, 0)`, 这些 omp builtins 不需要定义 optab 和 rtl insn, 它们会在 rtl expand 时变成对 libgomp 函数的调用.
所有的 omp builtin 定义在 `omp-builtins.def`, `expand_builtin` 负责把它们变成函数调用.
rtx expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, int ignore) { tree fndecl = get_callee_fndecl (exp); machine_mode target_mode = TYPE_MODE (TREE_TYPE (exp)); int flags; switch (fcode) { CASE_FLT_FN (BUILT_IN_FABS): CASE_FLT_FN_FLOATN_NX (BUILT_IN_FABS): case BUILT_IN_FABSD32: case BUILT_IN_FABSD64: case BUILT_IN_FABSD128: target = expand_builtin_fabs (exp, target, subtarget); if (target) return target; break; ... default: /* just do library call, if unknown builtin */ break; } // NOTE: 这里生成对 libgomp 的调用 return expand_call(exp, target, ignore); }