OpenMP
Table of Contents
1. OpenMP
1.1. Overview
OpenMP 主要包含两部分内容:
compiler directive
openmp 的 api 主要是 compiler directive 的形式, 例如 `#pragma omp parallel`, 编译器需要:
- 识别这些 directive
- 生成 parallel block 对应的匿名函数
- 调用 libgomp 中相应 api
libgomp
libgomp 中有少量函数例如 `omp_get_thread_num` 可以由用户代码直接调用, 其它的大部分的调用都是由编译器产生的, 例如 GOMP_parallel, GOMP_critical_start, GOMP_loop_nonmonotonic_dynamic_next 等
1.3. libgomp
1.3.1. parallel
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { int id = omp_get_thread_num(); printf("hello world %d\n", id); } printf("------\n"); }
$> riscv64-linux-gnu-gcc test.c -O0 -g -fopenmp
$> readelf -a ./a.out|grep NEEDED
0x0000000000000001 (NEEDED) Shared library: [libgomp.so.1]
0x0000000000000001 (NEEDED) Shared library: [libc.so.6]
$> riscv64-linux-gnu-objdump -d -r ./a.out
000000000000076a <main>:
76a: 1101 addi sp,sp,-32
76c: ec06 sd ra,24(sp)
76e: e822 sd s0,16(sp)
770: 1000 addi s0,sp,32
772: 87aa mv a5,a0
774: feb43023 sd a1,-32(s0)
778: fef42623 sw a5,-20(s0)
77c: 4681 li a3,0
77e: 4609 li a2,2
~~~~~~~~
780: 4581 li a1,0
782: 00000517 auipc a0,0x0
786: 02450513 addi a0,a0,36 # 7a6 <main._omp_fn.0>
~~~~~~~~~~~~~~
78a: f07ff0ef jal ra,690 <GOMP_parallel@plt>
~~~~~~~~~~~~~~~~~
78e: 00000517 auipc a0,0x0
792: 0a250513 addi a0,a0,162 # 830 <__libc_csu_fini+0x2>
796: edbff0ef jal ra,670 <puts@plt>
79a: 4781 li a5,0
79c: 853e mv a0,a5
79e: 60e2 ld ra,24(sp)
7a0: 6442 ld s0,16(sp)
7a2: 6105 addi sp,sp,32
7a4: 8082 ret
00000000000007a6 <main._omp_fn.0>:
7a6: 7179 addi sp,sp,-48
7a8: f406 sd ra,40(sp)
7aa: f022 sd s0,32(sp)
7ac: 1800 addi s0,sp,48
7ae: fca43c23 sd a0,-40(s0)
7b2: eafff0ef jal ra,660 <omp_get_thread_num@plt>
7b6: 87aa mv a5,a0
7b8: fef42623 sw a5,-20(s0)
7bc: fec42783 lw a5,-20(s0)
7c0: 85be mv a1,a5
7c2: 00000517 auipc a0,0x0
7c6: 07650513 addi a0,a0,118 # 838 <__libc_csu_fini+0xa>
7ca: eb7ff0ef jal ra,680 <printf@plt>
7ce: 70a2 ld ra,40(sp)
7d0: 7402 ld s0,32(sp)
7d2: 6145 addi sp,sp,48
7d4: 8082 ret
- gcc 会把 `omp parallel` 对应的 block `{int id=….;printf(…)}` 编译成一个独立的函数 `main._omp_fn.0`
- `omp parallel` 对应 `GOMP_parallel` 函数, 它的实现在 libgomp 中, 并且调用时传递了 `2` 作为参数, 即需要两个线程的并行
1.3.1.1. GOMP_parallel
void GOMP_parallel( void (*fn)(void *), void *data, unsigned num_threads, unsigned int flags) { num_threads = gomp_resolve_num_threads(num_threads, 0); gomp_team_start( fn, data, num_threads, flags, gomp_new_team(num_threads), NULL); /* NOTE: fn 即 main._omp_fn.0, 它在主线程会直接执行一次, 所以即使指定 * num_threads 为 0, 也会执行一次 */ fn(data); ialias_call(GOMP_parallel_end)(); } gomp_team_start: /* NOTE: gomp_team_start 的 num_threads 是从 1 开始计数的, 因为主线程已经执行了一次了 */ if (nthreads == 1) return; i = 1; for (; i < nthreads; ++i): start_data->fn = fn; start_data->fn_data = data; // ... pthread_create (&start_data->handle, attr, gomp_thread_start, start_data); gomp_thread_start: // ... local_fn = data->fn; local_data = data->fn_data; local_fn (local_data); // ...
1.3.2. sync
1.3.2.1. atomic
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel num_threads(2) { #pragma omp atomic sum += 1; } printf("%d\n", sum); }
$> riscv64-linux-gnu-gcc test.c -O0 -g -fopenmp -c -fno-stack-protector
$> riscv64-linux-gnu-objdump -d -r ./test.o
000000000000005a <main._omp_fn.0>:
5a: 1101 addi sp,sp,-32
5c: ec22 sd s0,24(sp)
5e: 1000 addi s0,sp,32
60: fea43423 sd a0,-24(s0)
64: fe843783 ld a5,-24(s0)
68: 639c ld a5,0(a5)
6a: 4705 li a4,1
6c: 00e7a02f amoadd.w zero,a4,(a5)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
70: 6462 ld s0,24(sp)
72: 6105 addi sp,sp,32
74: 8082 ret
`omp atomic` 在 riscv 上实际对应的是 AMO 指令
1.3.2.2. critical
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel num_threads(2) { #pragma omp critical sum += 1; } printf("%d\n", sum); }
0000000000000062 <main._omp_fn.0>:
62: 1101 addi sp,sp,-32
64: ec06 sd ra,24(sp)
66: e822 sd s0,16(sp)
68: 1000 addi s0,sp,32
6a: fea43423 sd a0,-24(s0)
6e: 00000097 auipc ra,0x0
6e: R_RISCV_CALL_PLT GOMP_critical_start
6e: R_RISCV_RELAX *ABS*
72: 000080e7 jalr ra # 6e <main._omp_fn.0+0xc>
76: fe843783 ld a5,-24(s0)
7a: 439c lw a5,0(a5)
7c: 2785 addiw a5,a5,1
7e: 0007871b sext.w a4,a5
82: fe843783 ld a5,-24(s0)
86: c398 sw a4,0(a5)
88: 00000097 auipc ra,0x0
88: R_RISCV_CALL_PLT GOMP_critical_end
88: R_RISCV_RELAX *ABS*
8c: 000080e7 jalr ra # 88 <main._omp_fn.0+0x26>
90: 60e2 ld ra,24(sp)
92: 6442 ld s0,16(sp)
94: 6105 addi sp,sp,32
96: 8082 ret
`omp critical` 对应 `GOMP_critical_start` 和 `GOMP_critical_end`, 前者对应 posix 的 `pthread_mutex_lock` 或者 linux 的 `futex_wait`, 后者对应 `pthread_mutex_unlock` 或者 `futex_wake`
1.3.2.3. barrier
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { printf("%d\n", omp_get_thread_num()); #pragma omp barrier printf("%d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1141 addi sp,sp,-16
2: e406 sd ra,8(sp)
4: e022 sd s0,0(sp)
6: 00000097 auipc ra,0x0
6: R_RISCV_CALL_PLT omp_get_thread_num
6: R_RISCV_RELAX *ABS*
a: 000080e7 jalr ra # 6 <main._omp_fn.0+0x6>
e: 862a mv a2,a0
10: 842a mv s0,a0
12: 00000597 auipc a1,0x0
12: R_RISCV_PCREL_HI20 .LC0
12: R_RISCV_RELAX *ABS*
16: 00058593 mv a1,a1
16: R_RISCV_PCREL_LO12_I .L0
16: R_RISCV_RELAX *ABS*
1a: 4505 li a0,1
1c: 00000097 auipc ra,0x0
1c: R_RISCV_CALL_PLT __printf_chk
1c: R_RISCV_RELAX *ABS*
20: 000080e7 jalr ra # 1c <main._omp_fn.0+0x1c>
24: 00000097 auipc ra,0x0
24: R_RISCV_CALL_PLT GOMP_barrier
~~~~~~~~~~~~
24: R_RISCV_RELAX *ABS*
28: 000080e7 jalr ra # 24 <main._omp_fn.0+0x24>
2c: 8622 mv a2,s0
2e: 6402 ld s0,0(sp)
30: 60a2 ld ra,8(sp)
32: 00000597 auipc a1,0x0
32: R_RISCV_PCREL_HI20 .LC0
32: R_RISCV_RELAX *ABS*
36: 00058593 mv a1,a1
36: R_RISCV_PCREL_LO12_I .L0
36: R_RISCV_RELAX *ABS*
3a: 4505 li a0,1
3c: 0141 addi sp,sp,16
3e: 00000317 auipc t1,0x0
3e: R_RISCV_CALL_PLT __printf_chk
3e: R_RISCV_RELAX *ABS*
42: 00030067 jr t1 # 3e <main._omp_fn.0+0x3e>
1.3.2.3.1. GOMP_barrier
GOMP_barrier 需要保证:
- 插入一个 memory fence
- 已经执行到 barrier 的线程需要等待其它尚未执行到 barrier 的线程
void GOMP_barrier(): gomp_team_barrier_wait(&team->barrier); gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); gomp_barrier_state_t gomp_barrier_wait_start (gomp_barrier_t *bar): /* NOTE: __atomic_xx 后的 MEMMODEL 相当于 memory fence */ unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); ret &= -BAR_INCR | BAR_CANCELLED; /* NOTE: 当 awaited 减小到 0 时, 意味着最后一个线程到达 barrier, 这个带有 * BAR_WAS_LAST 标记的线程会负责唤醒前面等待的线程 */ if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0): ret |= BAR_WAS_LAST; return ret; void gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state): unsigned int generation, gen; /* NOTE: if (__builtin_expect (x, 0)) {xxx} 相当于 if (x) {xxx}, 其中的 0 只是给 * compiler 的 hint, 告诉它 x 为 0 的概率较大, 可以让 compiler 产生有利于 branch * prediction 的代码 */ if (__builtin_expect (state & BAR_WAS_LAST, 0)): bar->awaited = bar->total; team->work_share_cancelled = 0; __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); /* NOTE: 如果当前线程是 BAR_WAS_LAST, 则它会 wake 其它线程 */ futex_wake ((int *) &bar->generation, INT_MAX); return; generation = state; state &= ~BAR_CANCELLED; do: /* NOTE: 非 BAR_WAS_LAST 的线程会等待 BAR_WAS_LAST 线程的 wake */ do_wait ((int *) &bar->generation, generation); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); if (__builtin_expect (gen & BAR_TASK_PENDING, 0)): gomp_barrier_handle_tasks (state); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); generation |= gen & BAR_WAITING_FOR_TASK; while (gen != state + BAR_INCR);
1.3.2.4. single
single 要求只有一个线程能执行相应代码, 但并没有要求是哪一个线程
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp single { printf("single %d\n", omp_get_thread_num()); } printf("single %d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1101 addi sp,sp,-32
2: ec06 sd ra,24(sp)
4: e822 sd s0,16(sp)
6: e426 sd s1,8(sp)
8: 00000097 auipc ra,0x0
8: R_RISCV_CALL_PLT GOMP_single_start
~~~~~~~~~~~~~~~~~
8: R_RISCV_RELAX *ABS*
c: 000080e7 jalr ra # 8 <main._omp_fn.0+0x8>
10: 84aa mv s1,a0
12: 00000097 auipc ra,0x0
12: R_RISCV_CALL_PLT omp_get_thread_num
12: R_RISCV_RELAX *ABS*
16: 000080e7 jalr ra # 12 <main._omp_fn.0+0x12>
1a: 842a mv s0,a0
1c: e09d bnez s1,42 <.L2>
1c: R_RISCV_RVC_BRANCH .L2
000000000000001e <.L3>:
1e: 00000097 auipc ra,0x0
1e: R_RISCV_CALL_PLT GOMP_barrier
~~~~~~~~~~~~
single 后有一个隐式的 barrier
1e: R_RISCV_RELAX *ABS*
22: 000080e7 jalr ra # 1e <.L3>
26: 8622 mv a2,s0
28: 6442 ld s0,16(sp)
2a: 60e2 ld ra,24(sp)
2c: 64a2 ld s1,8(sp)
2e: 00000597 auipc a1,0x0
2e: R_RISCV_PCREL_HI20 .LC0
2e: R_RISCV_RELAX *ABS*
32: 00058593 mv a1,a1
32: R_RISCV_PCREL_LO12_I .L0
32: R_RISCV_RELAX *ABS*
36: 4505 li a0,1
38: 6105 addi sp,sp,32
3a: 00000317 auipc t1,0x0
3a: R_RISCV_CALL_PLT __printf_chk
3a: R_RISCV_RELAX *ABS*
3e: 00030067 jr t1 # 3a <.L3+0x1c>
0000000000000042 <.L2>:
42: 862a mv a2,a0
44: 00000597 auipc a1,0x0
44: R_RISCV_PCREL_HI20 .LC0
44: R_RISCV_RELAX *ABS*
48: 00058593 mv a1,a1
48: R_RISCV_PCREL_LO12_I .L0
48: R_RISCV_RELAX *ABS*
4c: 4505 li a0,1
4e: 00000097 auipc ra,0x0
4e: R_RISCV_CALL_PLT __printf_chk
4e: R_RISCV_RELAX *ABS*
52: 000080e7 jalr ra # 4e <.L2+0xc>
56: b7e1 j 1e <.L3>
56: R_RISCV_RVC_JUMP .L3
1.3.2.4.1. GOMP_single_start
bool GOMP_single_start(void): struct gomp_thread *thr = gomp_thread(); struct gomp_team *team = thr->ts.team; unsigned long single_count; if (__builtin_expect(team == NULL, 0)) return true; single_count = thr->ts.single_count++; /* NOTE: __sync_bool_compare_and_swap (a,b,c) 的操作是: 比较 a, b, 若相等, 则 * 把 c 赋给 a, 并返回 true, 且整个过程是原子操作. * * N 个线程竞争, 只有一个会胜出. 但不论是否胜出, 所有线程的 single_count 以及 * team 的 single_count 都会加 1, 以便能共同竞争下一个 single, 所以 single 不 * 会固定给某一个线程执行 */ return __sync_bool_compare_and_swap( &team->single_count, single_count, single_count + 1L);
1.3.2.5. master
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp master { printf("%d\n", omp_get_thread_num()); } printf("%d\n", omp_get_thread_num()); } return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1141 addi sp,sp,-16
2: e022 sd s0,0(sp)
4: e406 sd ra,8(sp)
6: 00000097 auipc ra,0x0
6: R_RISCV_CALL_PLT omp_get_thread_num
6: R_RISCV_RELAX *ABS*
a: 000080e7 jalr ra # 6 <main._omp_fn.0+0x6>
e: 842a mv s0,a0
10: cd11 beqz a0,2c <.L2>
~~~~~~~~~~~~~~~~~~~~
由于 master 只能由主线程执行, 所以这里简单判断 omp_get_thread_num 是否
为 0 即可, 不需要像 single 那样有复杂的同步操作
10: R_RISCV_RVC_BRANCH .L2
12: 8622 mv a2,s0
14: 6402 ld s0,0(sp)
16: 60a2 ld ra,8(sp)
18: 00000597 auipc a1,0x0
18: R_RISCV_PCREL_HI20 .LC0
18: R_RISCV_RELAX *ABS*
1c: 00058593 mv a1,a1
1c: R_RISCV_PCREL_LO12_I .L0
1c: R_RISCV_RELAX *ABS*
20: 4505 li a0,1
22: 0141 addi sp,sp,16
24: 00000317 auipc t1,0x0
24: R_RISCV_CALL_PLT __printf_chk
24: R_RISCV_RELAX *ABS*
28: 00030067 jr t1 # 24 <main._omp_fn.0+0x24>
000000000000002c <.L2>:
2c: 4601 li a2,0
2e: 00000597 auipc a1,0x0
2e: R_RISCV_PCREL_HI20 .LC0
2e: R_RISCV_RELAX *ABS*
32: 00058593 mv a1,a1
32: R_RISCV_PCREL_LO12_I .L0
32: R_RISCV_RELAX *ABS*
36: 4505 li a0,1
38: 00000097 auipc ra,0x0
38: R_RISCV_CALL_PLT __printf_chk
38: R_RISCV_RELAX *ABS*
3c: 000080e7 jalr ra # 38 <.L2+0xc>
40: 8622 mv a2,s0
42: 6402 ld s0,0(sp)
44: 60a2 ld ra,8(sp)
46: 00000597 auipc a1,0x0
46: R_RISCV_PCREL_HI20 .LC0
46: R_RISCV_RELAX *ABS*
4a: 00058593 mv a1,a1
4a: R_RISCV_PCREL_LO12_I .L0
4a: R_RISCV_RELAX *ABS*
4e: 4505 li a0,1
50: 0141 addi sp,sp,16
52: 00000317 auipc t1,0x0
52: R_RISCV_CALL_PLT __printf_chk
52: R_RISCV_RELAX *ABS*
56: 00030067 jr t1 # 52 <.L2+0x26>
1.3.3. for
1.3.3.1. static schedule
// 2022-12-05 14:51 #include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for num_threads(2) for (int i = 0; i < 10; i++) { printf("%d\n", i); } printf("%d\n", sum); }
默认情况下 `omp for` 使用 `schedule (static, N/num_threads)`, work sharing 是静态的, 由编译器直接完成
$> riscv64-linux-gnu-gcc test.c -O2 -fopenmp -c -fno-stack-protector
$> riscv64-linux-gnu-objdump -d -r ./test.o
0000000000000000 <main._omp_fn.0>:
0: 1101 addi sp,sp,-32
2: ec06 sd ra,24(sp)
4: e822 sd s0,16(sp)
6: e426 sd s1,8(sp)
8: e04a sd s2,0(sp)
a: 00000097 auipc ra,0x0
a: R_RISCV_CALL omp_get_num_threads
a: R_RISCV_RELAX *ABS*
e: 000080e7 jalr ra # a <main._omp_fn.0+0xa>
12: 842a mv s0,a0
14: 00000097 auipc ra,0x0
14: R_RISCV_CALL omp_get_thread_num
14: R_RISCV_RELAX *ABS*
18: 000080e7 jalr ra # 14 <main._omp_fn.0+0x14>
1c: 47a9 li a5,10
1e: 0287e73b remw a4,a5,s0
22: 0287c6bb divw a3,a5,s0
~~~~~~~~~~~~~~~~
a3=10/(omp_get_num_threads()), a3 表示 schedule(static, step) 中的 step
26: 02e54c63 blt a0,a4,5e <.L2>
~~~~~~~~~~~~~~~~~~
如果 step 不能整除, 会把多余的部分平摊到前面的 thread, 例如 10%4=2, 则 4 线程分到的任务为 3,3,2,2
26: R_RISCV_BRANCH .L2
000000000000002a <.L5>:
2a: 02a684bb mulw s1,a3,a0
~~~~~~~~~~~~~~~~
a0 是 omp_get_thread_num(), s1 表示 begin=omp_get_thread_num()*step
2e: 00e4843b addw s0,s1,a4
32: 00d404bb addw s1,s0,a3
~~~~~~~~~~~~~~~~
s0 是 begin, s1 是 begin+step, 即 end, 当前 thread 处理的范围是半开区间 [s0, s1)
36: 00945e63 bge s0,s1,52 <.L1>
36: R_RISCV_BRANCH .L1
3a: 00000937 lui s2,0x0
3a: R_RISCV_HI20 .LC0
3a: R_RISCV_RELAX *ABS*
000000000000003e <.L4>:
3e: 85a2 mv a1,s0
40: 00090513 mv a0,s2
40: R_RISCV_LO12_I .LC0
40: R_RISCV_RELAX *ABS*
44: 2405 addiw s0,s0,1
46: 00000097 auipc ra,0x0
46: R_RISCV_CALL printf
46: R_RISCV_RELAX *ABS*
4a: 000080e7 jalr ra # 46 <.L4+0x8>
4e: fe8498e3 bne s1,s0,3e <.L4>
4e: R_RISCV_BRANCH .L4
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.L4 对应的代码是 for (i=start; i<end; i++) {printf("%d\n",i);}
0000000000000052 <.L1>:
52: 60e2 ld ra,24(sp)
54: 6442 ld s0,16(sp)
56: 64a2 ld s1,8(sp)
58: 6902 ld s2,0(sp)
5a: 6105 addi sp,sp,32
5c: 8082 ret
000000000000005e <.L2>:
5e: 2685 addiw a3,a3,1
60: 4701 li a4,0
62: b7e1 j 2a <.L5>
62: R_RISCV_RVC_JUMP .L5
1.3.3.2. dynamic schedule
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for num_threads(2) schedule(dynamic, 1) for (int i = 0; i < 10; i++) { printf("%d\n", i); } }
0000000000000000 <main>:
0: 00000537 lui a0,0x0
0: R_RISCV_HI20 main._omp_fn.0
0: R_RISCV_RELAX *ABS*
4: 1141 addi sp,sp,-16
6: 4881 li a7,0
8: 4805 li a6,1
a: 4785 li a5,1
c: 4729 li a4,10
e: 4681 li a3,0
10: 4609 li a2,2
12: 4581 li a1,0
14: 00050513 mv a0,a0
14: R_RISCV_LO12_I main._omp_fn.0
14: R_RISCV_RELAX *ABS*
18: e406 sd ra,8(sp)
1a: 00000097 auipc ra,0x0
1a: R_RISCV_CALL GOMP_parallel_loop_nonmonotonic_dynamic
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
设置 dynamic schedule 的参数, 例如 chunk size (1)
1a: R_RISCV_RELAX *ABS*
1e: 000080e7 jalr ra # 1a <main+0x1a>
0000000000000000 <main._omp_fn.0>:
0: 7179 addi sp,sp,-48
2: 002c addi a1,sp,8
4: 850a mv a0,sp
6: f406 sd ra,40(sp)
8: f022 sd s0,32(sp)
a: ec26 sd s1,24(sp)
c: e84a sd s2,16(sp)
e: 00000097 auipc ra,0x0
e: R_RISCV_CALL GOMP_loop_nonmonotonic_dynamic_next
e: R_RISCV_RELAX *ABS*
12: 000080e7 jalr ra # e <main._omp_fn.0+0xe>
16: c515 beqz a0,42 <.L2>
16: R_RISCV_RVC_BRANCH .L2
18: 00000937 lui s2,0x0
18: R_RISCV_HI20 .LC0
18: R_RISCV_RELAX *ABS*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.L4, .L3 的代码相当于:
while (GOMP_loop_nonmonotonic_dynamic_next(*start,*end)) {
for (i=start;i<end;i++) {
printf("%d\n",i);
}
}
000000000000001c <.L4>:
1c: 4402 lw s0,0(sp)
1e: 44a2 lw s1,8(sp)
~~~~~~~~~~~~
s0 是 start, s1 是 end, 由于 GOMP_loop_nonmonotonic_dynamic_next 的参数是指针 (long *istart, long *iend),
所以调用方通过栈上变量的地址来传递参数
0000000000000020 <.L3>:
20: 85a2 mv a1,s0
22: 00090513 mv a0,s2
22: R_RISCV_LO12_I .LC0
22: R_RISCV_RELAX *ABS*
26: 2405 addiw s0,s0,1
28: 00000097 auipc ra,0x0
28: R_RISCV_CALL printf
28: R_RISCV_RELAX *ABS*
2c: 000080e7 jalr ra # 28 <.L3+0x8>
30: fe9448e3 blt s0,s1,20 <.L3>
~~~~~~~~~~~~~~~~~~~
.L3 对应内层的 for (i=start;i<end;i++) {xxx} 循环
30: R_RISCV_BRANCH .L3
34: 002c addi a1,sp,8
~~~~~~~~~~~~~~~ end 的地址
36: 850a mv a0,sp
~~~~~~~~~ start 的地址
38: 00000097 auipc ra,0x0
38: R_RISCV_CALL GOMP_loop_nonmonotonic_dynamic_next
38: R_RISCV_RELAX *ABS*
3c: 000080e7 jalr ra # 38 <.L3+0x18>
40: fd71 bnez a0,1c <.L4>
40: R_RISCV_RVC_BRANCH .L4
~~~~~
.L4 对应外层的 while (GOMP_loop_nonmonotonic_dynamic_next(*start,*end)) 循环
可见 dynamic schedule 有额外的运行时开销
1.3.3.2.1. GOMP_loop_nonmonotonic_dynamic_next
bool GOMP_loop_nonmonotonic_dynamic_next(long *istart, long *iend) return gomp_loop_dynamic_next (istart, iend); return gomp_iter_dynamic_next (istart, iend); long tmp = __sync_fetch_and_add (&ws->next, chunk); if (tmp >= end) return false; nend = tmp + chunk; if (nend > end) nend = end; *pstart = tmp; *pend = nend; return true;
1.3.3.3. reduction
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { int sum = 0; #pragma omp parallel for reduction(+ : sum) num_threads(2) for (int i = 0; i < 10; i++) { sum += i; } printf("%d\n", sum); return 0; }
0000000000000000 <main._omp_fn.0>:
0: 1101 addi sp,sp,-32
2: ec06 sd ra,24(sp)
4: e822 sd s0,16(sp)
6: e426 sd s1,8(sp)
8: 842a mv s0,a0
a: 00000097 auipc ra,0x0
a: R_RISCV_CALL_PLT omp_get_num_threads
a: R_RISCV_RELAX *ABS*
e: 000080e7 jalr ra # a <main._omp_fn.0+0xa>
12: 84aa mv s1,a0
14: 00000097 auipc ra,0x0
14: R_RISCV_CALL_PLT omp_get_thread_num
14: R_RISCV_RELAX *ABS*
18: 000080e7 jalr ra # 14 <main._omp_fn.0+0x14>
1c: 46a9 li a3,10
1e: 0296e63b remw a2,a3,s1
22: 87aa mv a5,a0
24: 0296c5bb divw a1,a3,s1
28: 02c54e63 blt a0,a2,64 <.L2>
28: R_RISCV_BRANCH .L2
000000000000002c <.L5>:
2c: 4681 li a3,0
2e: 02f5873b mulw a4,a1,a5
32: 00c707bb addw a5,a4,a2
36: 00b7873b addw a4,a5,a1
~~~~~~~~~~~~~~~~
当前线程要处理的范围是半开区间 [a5, a4)
3a: 00e7c963 blt a5,a4,4c <.L4>
3a: R_RISCV_BRANCH .L4
3e: 00d4202f amoadd.w zero,a3,(s0)
~~~~~~~~~~~~~~~~~~~~~~~~
线程处理完以后把 a3 (当前线程的 partial sum) 累加到 (s0)
42: 60e2 ld ra,24(sp)
44: 6442 ld s0,16(sp)
46: 64a2 ld s1,8(sp)
48: 6105 addi sp,sp,32
4a: 8082 ret
000000000000004c <.L4>:
4c: 9ebd addw a3,a3,a5
4e: 2785 addiw a5,a5,1
~~~~~~~~~~~~~~~~
a3 是 partial sum, 不断的累加 a5 (i)
50: fef71ee3 bne a4,a5,4c <.L4>
50: R_RISCV_BRANCH .L4
54: 2681 sext.w a3,a3
56: 00d4202f amoadd.w zero,a3,(s0)
5a: 60e2 ld ra,24(sp)
5c: 6442 ld s0,16(sp)
5e: 64a2 ld s1,8(sp)
60: 6105 addi sp,sp,32
62: 8082 ret
0000000000000064 <.L2>:
64: 2585 addiw a1,a1,1
66: 4601 li a2,0
68: b7d1 j 2c <.L5>
68: R_RISCV_RVC_JUMP .L5
reduction 的代码是由编译器直接生成的, 不需要调用 libgomp. 生成的代码和手写的不用 reduction 的代码 (例如 hello_openmp 中的 pi_parallel_for) 基本相同
1.3.4. task
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel num_threads(2) { #pragma omp single { #pragma omp task printf("%d\n", omp_get_thread_num()); #pragma omp task printf("%d\n", omp_get_thread_num()); } } return 0; }
0000000000000014 <.L6>:
14: 4881 li a7,0
16: 4801 li a6,0
18: 4785 li a5,1
1a: 4705 li a4,1
1c: 4681 li a3,0
1e: 4601 li a2,0
20: 4581 li a1,0
22: 00000517 auipc a0,0x0
22: R_RISCV_PCREL_HI20 main._omp_fn.1
22: R_RISCV_RELAX *ABS*
26: 00050513 mv a0,a0
26: R_RISCV_PCREL_LO12_I .L0
26: R_RISCV_RELAX *ABS*
2a: e002 sd zero,0(sp)
2c: 00000097 auipc ra,0x0
2c: R_RISCV_CALL_PLT GOMP_task
~~~~~~~~~
GOMP_task(main._omp_fn.1)
2c: R_RISCV_RELAX *ABS*
30: 000080e7 jalr ra # 2c <.L6+0x18>
34: 4881 li a7,0
36: e002 sd zero,0(sp)
38: 4801 li a6,0
3a: 4785 li a5,1
3c: 4705 li a4,1
3e: 4681 li a3,0
40: 4601 li a2,0
42: 4581 li a1,0
44: 00000517 auipc a0,0x0
44: R_RISCV_PCREL_HI20 main._omp_fn.2
44: R_RISCV_RELAX *ABS*
48: 00050513 mv a0,a0
48: R_RISCV_PCREL_LO12_I .L0
48: R_RISCV_RELAX *ABS*
4c: 00000097 auipc ra,0x0
4c: R_RISCV_CALL_PLT GOMP_task
~~~~~~~~~
GOMP_task(main._omp_fn.2)
4c: R_RISCV_RELAX *ABS*
50: 000080e7 jalr ra # 4c <.L6+0x38>
54: 60e2 ld ra,24(sp)
56: 6105 addi sp,sp,32
58: 8082 ret
000000000000005a <main._omp_fn.1>:
5a: 1141 addi sp,sp,-16
5c: e406 sd ra,8(sp)
5e: 00000097 auipc ra,0x0
5e: R_RISCV_CALL_PLT omp_get_thread_num
5e: R_RISCV_RELAX *ABS*
62: 000080e7 jalr ra # 5e <main._omp_fn.1+0x4>
66: 60a2 ld ra,8(sp)
68: 862a mv a2,a0
6a: 00000597 auipc a1,0x0
6a: R_RISCV_PCREL_HI20 .LC0
6a: R_RISCV_RELAX *ABS*
6e: 00058593 mv a1,a1
6e: R_RISCV_PCREL_LO12_I .L0
6e: R_RISCV_RELAX *ABS*
72: 4505 li a0,1
74: 0141 addi sp,sp,16
76: 00000317 auipc t1,0x0
76: R_RISCV_CALL_PLT __printf_chk
76: R_RISCV_RELAX *ABS*
7a: 00030067 jr t1 # 76 <main._omp_fn.1+0x1c>
000000000000007e <main._omp_fn.2>:
7e: 00000317 auipc t1,0x0
7e: R_RISCV_CALL main._omp_fn.1
7e: R_RISCV_RELAX *ABS*
82: 00030067 jr t1 # 7e <main._omp_fn.2>
每个 task 对应一个单独的 main._omp_fn.x, 通过 GOMP_task 放在某个线程里执行
1.3.4.1. GOMP_task
void GOMP_task( void (*fn)(void *), void *data, void (*cpyfn)(void *, void *), long arg_size, long arg_align, bool if_clause, unsigned flags, void **depend, int priority_arg, void *detach): priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority, PRIORITY_INSERT_END, /*adjust_parent_depends_on=*/false, task->parent_depends_on); gomp_team_barrier_set_task_pending (&team->barrier); do_wake = team->task_running_count + !parent->in_tied_task < team->nthreads; if (do_wake) gomp_team_barrier_wake (&team->barrier, 1);
1.3.5. target
#include <omp.h> #include <stdio.h> int main() { int x = 1; #pragma omp target map(tofrom : x) for (int i = 0; i < 10; i++) { x += i; } printf("x = %d\n", x); return 0; }
openmp 从 4.0 开始支持 offload 到不同的 target, 例如 nvptx 和 amd gcn.
offload 过程与 DPC++, TVM BYOC Codegen 以及 ComputeCpp 类似, 以 nvptx 为例, 主要步骤是:
- gcc 编译出 block 对应的 ptx 汇编
- 使用 mkoffload 生成一个头文件, 它会包含 ptx 汇编, 用于注册的初始化函数以及一些 mapping 信息, 确保运行时 gomp 可以找到对应的 ptx 并调用 cuda 去编译和执行它. 这个头文件与 ComputeCpp 生成的 sycl 文件类似
1.3.5.1. GOMP_offload_register_ver
GOMP_offload_register_ver 负责向 runtime 注册 offload 相关的信息
通过 `--save-temps` 可以看到这个生成的 offload 头文件:
$> gcc test.c -O0 -fopenmp -fcf-protection=none -foffload=-misa=sm_35 -fno-stack-protector --save-temps
$> cat ccuL4e4U.i
static const char ptx_code_0[] =
"// BEGIN PREAMBLE\n"
".version 3.1\n"
".target sm_35\n"
".address_size 64\n"
"// BEGIN FUNCTION DECL: main$_omp_fn$0$impl\n"
".func main$_omp_fn$0$impl (.param .u64 %in_ar0);\n"
"// BEGIN GLOBAL FUNCTION DECL: gomp_nvptx_main\n"
".extern .func gomp_nvptx_main (.param .u64 %in_ar1, .param .u64 %in_ar2);\n"
"// BEGIN GLOBAL VAR DECL: __nvptx_stacks\n"
".extern .shared .u64 __nvptx_stacks[32];\n"
"// BEGIN GLOBAL VAR DECL: __nvptx_uni\n"
".extern .shared .u32 __nvptx_uni[32];\n"
"// END PREAMBLE\n"
".visible .entry main$_omp_fn$0 (.param .u64 %arg, .param .u64 %stack, .param .u64 %sz)\n"
"{\n"
".reg .u32 %r<3>;\n"
".reg .u64 %R<4>;\n"
/* ... */
"mov.u32 %r0,%tid.y;\n"
/* ... */
static const char ptx_code_19[] = "" /* ... */
static const struct ptx_obj {
const char *code;
long unsigned int size;
} ptx_objs[] = {
{ptx_code_0, sizeof (ptx_code_0)},
{ptx_code_1, sizeof (ptx_code_1)},
{ptx_code_2, sizeof (ptx_code_2)},
{ptx_code_3, sizeof (ptx_code_3)},
{ptx_code_4, sizeof (ptx_code_4)},
{ptx_code_5, sizeof (ptx_code_5)},
{ptx_code_6, sizeof (ptx_code_6)},
{ptx_code_7, sizeof (ptx_code_7)},
{ptx_code_8, sizeof (ptx_code_8)},
{ptx_code_9, sizeof (ptx_code_9)},
{ptx_code_10, sizeof (ptx_code_10)},
{ptx_code_11, sizeof (ptx_code_11)},
{ptx_code_12, sizeof (ptx_code_12)},
{ptx_code_13, sizeof (ptx_code_13)},
{ptx_code_14, sizeof (ptx_code_14)},
{ptx_code_15, sizeof (ptx_code_15)},
{ptx_code_16, sizeof (ptx_code_16)},
{ptx_code_17, sizeof (ptx_code_17)},
{ptx_code_18, sizeof (ptx_code_18)},
{ptx_code_19, sizeof (ptx_code_19)}
};
static const char *const var_mappings[] = {
};
static const struct nvptx_fn {
const char *name;
unsigned short dim[3];
} func_mappings[] = {
{"main$_omp_fn$0"}
};
static const struct nvptx_tdata {
const struct ptx_obj *ptx_objs;
unsigned ptx_num;
const char *const *var_names;
unsigned var_num;
const struct nvptx_fn *fn_names;
unsigned fn_num;
} target_data = {
ptx_objs, sizeof (ptx_objs) / sizeof (ptx_objs[0]),
var_mappings, sizeof (var_mappings) / sizeof (var_mappings[0]),
func_mappings, sizeof (func_mappings) / sizeof (func_mappings[0])
};
extern void GOMP_offload_register_ver (unsigned, const void *, int, const void *);
extern void GOMP_offload_unregister_ver (unsigned, const void *, int, const void *);
extern const void *const __OFFLOAD_TABLE__[];
static __attribute__((constructor)) void init (void)
{
GOMP_offload_register_ver (0x10001, __OFFLOAD_TABLE__, 5 , &target_data);
};
static __attribute__((destructor)) void fini (void)
{
GOMP_offload_unregister_ver (0x10001, __OFFLOAD_TABLE__, 5 , &target_data);
};
通过注册的 target_data 中的 func_mappings 和 ptx_objs, 运行时可以找到函数 (例如 main._omp_fn.0) 对应的 ptx code (例如 ptx_code_0)
1.3.5.2. GOMP_target_ext
0000000000001169 <main>:
# ...
11a7: 6a 00 pushq $0x0
11a9: 4c 8d 0d 68 ee 06 00 lea 0x6ee68(%rip),%r9 # 70018 <.omp_data_kinds.6.2657>
11b0: 4c 8d 05 59 ee 06 00 lea 0x6ee59(%rip),%r8 # 70010 <.omp_data_sizes.5.2656>
11b7: 48 89 c1 mov %rax,%rcx
11ba: ba 01 00 00 00 mov $0x1,%edx
11bf: 48 8d 35 2b 00 00 00 lea 0x2b(%rip),%rsi # 11f1 <main._omp_fn.0>
~~~~~~~~~~~~~~~~~~~~~~~
11c6: bf ff ff ff ff mov $0xffffffff,%edi
11cb: e8 60 fe ff ff callq 1030 <GOMP_target_ext@plt>
~~~~~~~~~~~~~~~~~~~~
GOMP_target_ext 会负责调用 ptx runtime 相关代码去找 main._omp_fn.0
对应的 ptx code 并执行, 但如果没有成功, 可以 fallback 到 cpu 代码
所以下面有 main._omp_fn.0 完整的 cpu 实现
11d0: 48 83 c4 20 add $0x20,%rsp
11d4: 8b 45 d4 mov -0x2c(%rbp),%eax
11d7: 89 c6 mov %eax,%esi
11d9: 48 8d 3d 24 0e 00 00 lea 0xe24(%rip),%rdi # 2004 <_IO_stdin_used+0x4>
11e0: b8 00 00 00 00 mov $0x0,%eax
11e5: e8 66 fe ff ff callq 1050 <printf@plt>
11ea: b8 00 00 00 00 mov $0x0,%eax
11ef: c9 leaveq
11f0: c3 retq
00000000000011f1 <main._omp_fn.0>:
11f1: 55 push %rbp
11f2: 48 89 e5 mov %rsp,%rbp
11f5: 48 89 7d e8 mov %rdi,-0x18(%rbp)
11f9: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
1200: 83 7d fc 09 cmpl $0x9,-0x4(%rbp)
~~~~~~~~~~~~~~~~~~~~~~ 循环 10 次
1204: 7f 1d jg 1223 <main._omp_fn.0+0x32>
1206: 48 8b 45 e8 mov -0x18(%rbp),%rax
120a: 48 8b 00 mov (%rax),%rax
120d: 8b 10 mov (%rax),%edx
120f: 8b 45 fc mov -0x4(%rbp),%eax
1212: 01 c2 add %eax,%edx
1214: 48 8b 45 e8 mov -0x18(%rbp),%rax
1218: 48 8b 00 mov (%rax),%rax
121b: 89 10 mov %edx,(%rax)
121d: 83 45 fc 01 addl $0x1,-0x4(%rbp)
1221: eb dd jmp 1200 <main._omp_fn.0+0xf>
1223: 5d pop %rbp
1224: c3 retq
1225: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
122c: 00 00 00
122f: 90 nop
1.3.6. simd
openmp 支持通过 `#pragma omp simd` 给编译器一个提示, 以便对循环进行向量化.
1.4. Compiler Directive
gcc 对 openmp 的支持主要在两个 tree pass:
- omplower
- ompexp
1.4.1. Example
以下面的代码为例:
#include <omp.h> #include <stdio.h> int main(int argc, char *argv[]) { #pragma omp parallel for num_threads(2) for (int i = 0; i < 10; i++) { printf("%d\n", omp_get_thread_num()); } return 0; }
$> riscv64-linux-gnu-gcc test.c -O2 -fopenmp -c -fno-stack-protector -fdump-tree-all
test.c.007t.omplower:
;; Function main (main, funcdef_no=23, decl_uid=2425, cgraph_uid=24, symbol_order=23)
Introduced new external node (main._omp_fn.0/24).
main (int argc, char * * argv)
{
int D.2430;
{
{
#pragma omp parallel num_threads(2) [child fn: main._omp_fn.0 (???)]
{
int i;
{
#pragma omp for nowait
for (i = 0; i < 10; i = i + 1)
D.2429 = omp_get_thread_num ();
printf ("%d\n", D.2429);
#pragma omp continue (i, i)
#pragma omp return(nowait)
}
}
#pragma omp return
}
D.2430 = 0;
return D.2430;
}
D.2430 = 0;
return D.2430;
}
test.c.013t.ompexp:
;; Function main._omp_fn.0 (main._omp_fn.0, funcdef_no=24, decl_uid=2432, cgraph_uid=25, symbol_order=24)
main._omp_fn.0 (void * .omp_data_i)
{
int D.2451;
int i;
int D.2449;
int D.2448;
int D.2447;
int tt.2;
int q.1;
int D.2444;
int D.2443;
<bb 11> :
<bb 3> :
D.2443 = __builtin_omp_get_num_threads ();
D.2444 = __builtin_omp_get_thread_num ();
q.1 = 10 / D.2443;
tt.2 = 10 % D.2443;
if (D.2444 < tt.2)
goto <bb 9>; [25.00%]
else
goto <bb 10>; [75.00%]
<bb 10> :
D.2447 = q.1 * D.2444;
D.2448 = D.2447 + tt.2;
D.2449 = D.2448 + q.1;
if (D.2448 >= D.2449)
goto <bb 5>; [INV]
else
goto <bb 8>; [INV]
<bb 8> :
i = D.2448;
<bb 4> :
D.2451 = __builtin_omp_get_thread_num ();
printf ("%d\n", D.2451);
i = i + 1;
if (i < D.2449)
goto <bb 4>; [INV]
else
goto <bb 5>; [INV]
<bb 5> :
<bb 6> :
return;
<bb 9> :
tt.2 = 0;
q.1 = q.1 + 1;
goto <bb 10>; [100.00%]
}
;; Function main (main, funcdef_no=23, decl_uid=2425, cgraph_uid=24, symbol_order=23)
Merging blocks 2 and 12
Merging blocks 2 and 7
main (int argc, char * * argv)
{
int D.2442;
int D.2440;
int D.2441;
int tt.2;
int q.1;
int D.2437;
int D.2436;
int i;
int D.2430;
<bb 2> :
__builtin_GOMP_parallel (main._omp_fn.0, 0B, 2, 0);
D.2430 = 0;
return D.2430;
}
1.4.2. omplower
TBD
1.4.3. ompexp
TBD
1.4.4. omp-builtins
生成代码中对 libgomp 函数的调用是以 builtin 的形式, 例如 `__builtin_GOMP_parallel (main._omp_fn.0, 0B, 2, 0)`, 这些 omp builtins 不需要定义 optab 和 rtl insn, 它们会在 rtl expand 时变成对 libgomp 函数的调用.
所有的 omp builtin 定义在 `omp-builtins.def`, `expand_builtin` 负责把它们变成函数调用.
rtx expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, int ignore) { tree fndecl = get_callee_fndecl (exp); machine_mode target_mode = TYPE_MODE (TREE_TYPE (exp)); int flags; switch (fcode) { CASE_FLT_FN (BUILT_IN_FABS): CASE_FLT_FN_FLOATN_NX (BUILT_IN_FABS): case BUILT_IN_FABSD32: case BUILT_IN_FABSD64: case BUILT_IN_FABSD128: target = expand_builtin_fabs (exp, target, subtarget); if (target) return target; break; ... default: /* just do library call, if unknown builtin */ break; } // NOTE: 这里生成对 libgomp 的调用 return expand_call(exp, target, ignore); }
