網(wǎng)站首頁編程語言正文

OpenMP中For?Construct對dynamic的調(diào)度方式詳解_C 語言

作者：一無是處的研究僧 ? 更新時間： 2023-04-06 編程語言

前言

在本篇文章當中主要給大家介紹 OpenMp for construct 的實現(xiàn)原理，以及與他相關(guān)的動態(tài)庫函數(shù)分析，與 for construct 非常相關(guān)的是循環(huán)的調(diào)度方式，在 OpenMP 當中一共有四種調(diào)調(diào)方式，auto, dynamic, guided, runtime, 在本篇文章當中主要是對 dynamic 的調(diào)度方式進行分析。

前置知識

在介紹 for construct 的實現(xiàn)原理之前，我們首先需要了解一下編譯器是如何處理函數(shù)參數(shù)傳遞的（本文基于 x86_64 ISA），我們來看一下下面的代碼在編譯之后函數(shù)參數(shù)的傳遞情況。

在前面的文章當中我們已經(jīng)談到過了，在 x86 當中參數(shù)傳遞的規(guī)約，具體的內(nèi)容如下所示：

寄存器	含義
rdi	第一個參數(shù)
rsi	第二個參數(shù)
rdx	第三個參數(shù)
rcx	第四個參數(shù)
r8	第五個參數(shù)
r9	第六個參數(shù)

我們現(xiàn)在使用下面的代碼來分析一下具體的情況（因為前面使用寄存器只能夠傳遞 6 個參數(shù)，而在后面我們要分析的動態(tài)庫函數(shù)當中會傳遞 7 個參數(shù)，因此這里我們使用 8 個參數(shù)來測試一下具體的參數(shù)傳遞情況）：

#include "stdio.h"
 
void echo(int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8)
{
  printf("%d %d %d %d %d %d %d %d\n", a8, a7, a1, a2, a3, a4, a5, a6);
}
 
int main()
{
  echo(1, 2, 3, 4 ,5 ,6, 7, 8);
  return 0;
}

上面的程序的反匯編結(jié)果如下所示：

000000000040053d <echo>:
? 40053d: ? ? ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %rbp
? 40053e: ? ? ? 48 89 e5 ? ? ? ? ? ? ? ?mov ? ?%rsp,%rbp
? 400541: ? ? ? 48 83 ec 30 ? ? ? ? ? ? sub ? ?$0x30,%rsp
? 400545: ? ? ? 89 7d fc ? ? ? ? ? ? ? ?mov ? ?%edi,-0x4(%rbp)
? 400548: ? ? ? 89 75 f8 ? ? ? ? ? ? ? ?mov ? ?%esi,-0x8(%rbp)
? 40054b: ? ? ? 89 55 f4 ? ? ? ? ? ? ? ?mov ? ?%edx,-0xc(%rbp)
? 40054e: ? ? ? 89 4d f0 ? ? ? ? ? ? ? ?mov ? ?%ecx,-0x10(%rbp)
? 400551: ? ? ? 44 89 45 ec ? ? ? ? ? ? mov ? ?%r8d,-0x14(%rbp)
? 400555: ? ? ? 44 89 4d e8 ? ? ? ? ? ? mov ? ?%r9d,-0x18(%rbp)
? 400559: ? ? ? 8b 7d f4 ? ? ? ? ? ? ? ?mov ? ?-0xc(%rbp),%edi
? 40055c: ? ? ? 8b 75 f8 ? ? ? ? ? ? ? ?mov ? ?-0x8(%rbp),%esi
? 40055f: ? ? ? 8b 55 fc ? ? ? ? ? ? ? ?mov ? ?-0x4(%rbp),%edx
? 400562: ? ? ? 8b 45 18 ? ? ? ? ? ? ? ?mov ? ?0x18(%rbp),%eax # a8
? 400565: ? ? ? 8b 4d e8 ? ? ? ? ? ? ? ?mov ? ?-0x18(%rbp),%ecx
? 400568: ? ? ? 89 4c 24 10 ? ? ? ? ? ? mov ? ?%ecx,0x10(%rsp)
? 40056c: ? ? ? 8b 4d ec ? ? ? ? ? ? ? ?mov ? ?-0x14(%rbp),%ecx
? 40056f: ? ? ? 89 4c 24 08 ? ? ? ? ? ? mov ? ?%ecx,0x8(%rsp)
? 400573: ? ? ? 8b 4d f0 ? ? ? ? ? ? ? ?mov ? ?-0x10(%rbp),%ecx
? 400576: ? ? ? 89 0c 24 ? ? ? ? ? ? ? ?mov ? ?%ecx,(%rsp)
? 400579: ? ? ? 41 89 f9 ? ? ? ? ? ? ? ?mov ? ?%edi,%r9d
? 40057c: ? ? ? 41 89 f0 ? ? ? ? ? ? ? ?mov ? ?%esi,%r8d
? 40057f: ? ? ? 89 d1 ? ? ? ? ? ? ? ? ? mov ? ?%edx,%ecx
? 400581: ? ? ? 8b 55 10 ? ? ? ? ? ? ? ?mov ? ?0x10(%rbp),%edx # a7
? 400584: ? ? ? 89 c6 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%esi # a8
? 400586: ? ? ? bf 64 06 40 00 ? ? ? ? ?mov ? ?$0x400664,%edi
? 40058b: ? ? ? b8 00 00 00 00 ? ? ? ? ?mov ? ?$0x0,%eax
? 400590: ? ? ? e8 8b fe ff ff ? ? ? ? ?callq ?400420 <printf@plt>
? 400595: ? ? ? c9 ? ? ? ? ? ? ? ? ? ? ?leaveq?
?
0000000000400597 <main>:
? 400597: ? ? ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %rbp
? 400598: ? ? ? 48 89 e5 ? ? ? ? ? ? ? ?mov ? ?%rsp,%rbp
? 40059b: ? ? ? 48 83 ec 10 ? ? ? ? ? ? sub ? ?$0x10,%rsp
? 40059f: ? ? ? c7 44 24 08 08 00 00 ? ?movl ? $0x8,0x8(%rsp) # 保存參數(shù) 8?
? 4005a6: ? ? ? 00?
? 4005a7: ? ? ? c7 04 24 07 00 00 00 ? ?movl ? $0x7,(%rsp) # 保存參數(shù) 7?
? 4005ae: ? ? ? 41 b9 06 00 00 00 ? ? ? mov ? ?$0x6,%r9d # 保存參數(shù) 6?
? 4005b4: ? ? ? 41 b8 05 00 00 00 ? ? ? mov ? ?$0x5,%r8d # 保存參數(shù) 5?
? 4005ba: ? ? ? b9 04 00 00 00 ? ? ? ? ?mov ? ?$0x4,%ecx # 保存參數(shù) 4?
? 4005bf: ? ? ? ba 03 00 00 00 ? ? ? ? ?mov ? ?$0x3,%edx # 保存參數(shù) 3?
? 4005c4: ? ? ? be 02 00 00 00 ? ? ? ? ?mov ? ?$0x2,%esi # 保存參數(shù) 2?
? 4005c9: ? ? ? bf 01 00 00 00 ? ? ? ? ?mov ? ?$0x1,%edi # 保存參數(shù) 1
? 4005ce: ? ? ? e8 6a ff ff ff ? ? ? ? ?callq ?40053d <echo>
? 4005d3: ? ? ? b8 00 00 00 00 ? ? ? ? ?mov ? ?$0x0,%eax
? 4005d8: ? ? ? c9 ? ? ? ? ? ? ? ? ? ? ?leaveq?
? 4005d9: ? ? ? c3 ? ? ? ? ? ? ? ? ? ? ?retq ??
? 4005da: ? ? ? 66 0f 1f 44 00 00 ? ? ? nopw ? 0x0(%rax,%rax,1)

從上面的匯編程序我們可以知道 1 - 6，這幾個參數(shù)確實是通過寄存器傳遞的，對應的寄存器就是上文當中我們提到不同的參數(shù)對應的寄存器。但是參數(shù) 7 和參數(shù) 8 是保存在棧上的。根據(jù)上面的 main 函數(shù)的匯編程序分析，他對應的棧幀的內(nèi)存布局如下所示：

我們在來分析一下 echo 函數(shù)當中 printf 函數(shù)參數(shù)的傳遞情況，第二個參數(shù)和第三個參數(shù)分別是 a8, a7，應該分別保存到寄存器 rsi/esi, rdx/edx 當中，在上面的匯編代碼當中已經(jīng)使用注釋的方式進行標注出來了，從下往上進行分析可以看到 a8 保存在位置 0x18(%rbp)，a7 保存在 0x10(%rbp)，這個地址正是 main 函數(shù)保存 a7（當進入函數(shù) echo 之后，a7，和 a8 的位置分別是 rsp + 0x10）, a8（當進入函數(shù) echo 之后，a7，和 a8 的位置分別是 rsp + 0x10 + 0x8）的位置，具體可以結(jié)合上面的內(nèi)存布局圖進行分析。

dynamic 調(diào)度方式分析

我們使用下面的代碼來分析一下動態(tài)調(diào)度的情況下整個程序的執(zhí)行流程是怎么樣的：

#pragma omp parallel for num_threads(t) schedule(dynamic, size)
for (i = lb; i <= ub; i++)
  body;

編譯器會將上面的程序編譯成下面的形式：

void subfunction (void *data)
{
  long _s0, _e0;
  while (GOMP_loop_dynamic_next (&_s0, &_e0))
  {
    long _e1 = _e0, i;
    for (i = _s0; i < _e1; i++)
      body;
  }
  // GOMP_loop_end_nowait 這個函數(shù)的主要作用就是釋放數(shù)據(jù)的內(nèi)存空間 在后文當中不進行分析
  GOMP_loop_end_nowait ();
}
 
GOMP_parallel_loop_dynamic_start (subfunction, NULL, t, lb, ub+1, 1, size);
subfunction (NULL);
// 這個函數(shù)在前面的很多文章已經(jīng)分析過 本文也不在進行分析
GOMP_parallel_end ();

void
GOMP_parallel_loop_dynamic_start (void (*fn) (void *), void *data,
				  unsigned num_threads, long start, long end,
				  long incr, long chunk_size)
{
  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
			    GFS_DYNAMIC, chunk_size);
}
 
static void
gomp_parallel_loop_start (void (*fn) (void *), void *data,
			  unsigned num_threads, long start, long end,
			  long incr, enum gomp_schedule_type sched,
			  long chunk_size)
{
  struct gomp_team *team;
  // 解析具體創(chuàng)建多少個線程
  num_threads = gomp_resolve_num_threads (num_threads, 0);
  // 創(chuàng)建一個含有 num_threads 個線程的線程組
  team = gomp_new_team (num_threads);
  // 對線程組的數(shù)據(jù)進行初始化操作
  gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
  // 啟動 num_threads 個線程執(zhí)行函數(shù) fn 
  gomp_team_start (fn, data, num_threads, team);
}
 
enum gomp_schedule_type
{
  GFS_RUNTIME, // runtime 調(diào)度方式
  GFS_STATIC,	 // static  調(diào)度方式
  GFS_DYNAMIC, // dynamic 調(diào)度方式
  GFS_GUIDED,	 // guided  調(diào)度方式
  GFS_AUTO     // auto    調(diào)度方式
};

在上面的程序當中 GOMP_parallel_loop_dynamic_start，有 7 個參數(shù)，我們接下來仔細解釋一下這七個參數(shù)的含義：

fn，函數(shù)指針也就是并行域被編譯之后的函數(shù)。
data，指向共享或者私有的數(shù)據(jù)，在并行域當中可能會使用外部的一些變量。
num_threads，并行域當中指定啟動線程的個數(shù)。
start，for 循環(huán)迭代的初始值，比如 for(int i = 0; ?? 這個 start 就是 0 。
end，for 循環(huán)迭代的最終值，比如 for(int i = 0; i < 100; i++) 這個 end 就是 100 。
incr，這個值一般都是 1 或者 -1，如果是 for 循環(huán)是從小到達迭代這個值就是 1，反之就是 -1。
chunk_size，這個就是給一個線程劃分塊的時候一個塊的大小，比如 schedule(dynamic, 1)，這個 chunk_size 就等于 1 。

在函數(shù) GOMP_parallel_loop_dynamic_start 當中會調(diào)用函數(shù) gomp_parallel_loop_start ，這個函數(shù)的主要作用就是將整個循環(huán)的起始位置信息保存到線程組內(nèi)部，那么就能夠在函數(shù) GOMP_loop_dynamic_next 當中直接使用這些信息進行不同線程的分塊劃分。GOMP_loop_dynamic_next 最終會調(diào)用函數(shù) gomp_loop_dynamic_next ，其源代碼如下所示：

static bool
gomp_loop_dynamic_next (long *istart, long *iend)
{
  bool ret;
  ret = gomp_iter_dynamic_next (istart, iend);
  return ret;
}

gomp_loop_dynamic_next 函數(shù)的返回值是一個布爾值：

如果返回值為 true ，則說明還有剩余的分塊需要執(zhí)行。
如果返回值為 false，則說明沒有剩余的分塊需要執(zhí)行了，根據(jù)前面 dynamic 編譯之后的結(jié)果，那么就會退出 while 循環(huán)。

gomp_iter_dynamic_next 是劃分具體的分塊，并且將分塊的起始位置保存到變量 istart 和 iend 當中，因為傳遞的是指針，就能夠使用 s0 和 e0 得到數(shù)據(jù)的值，下面是 gomp_iter_dynamic_next 的源代碼，就是具體的劃分算法了。

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  // 得到當前線程的指針
  struct gomp_thread *thr = gomp_thread ();
  // 得到線程組共享的數(shù)據(jù)
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;
  
  // 保存迭代的最終值
  end = ws->end;
  // 這個值一般都是 1
  incr = ws->incr;
  // 保存分塊的大小 chunk size
  chunk = ws->chunk_size;
  
  // ws->mode 在數(shù)據(jù)分塊比較小的時候就是 1 在數(shù)據(jù)分塊比較大的時候就是 0
  if (__builtin_expect (ws->mode, 1))
    {
    // __sync_fetch_and_add 函數(shù)是一個原子操作 ws->next 的初始值為 for 循環(huán)的起始位置值
    // 這個函數(shù)的返回值是 ws->next 的舊值 然后會將 ws->next 的值加上 chunk
    // 并且整個操作是原子的 是并發(fā)安全的
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
    // 從小到大迭代
      if (incr > 0)
	{
	  if (tmp >= end)
	    return false;
    // 分塊的最終位置
	  nend = tmp + chunk;
    // 溢出保護操作 分塊的值需要小于最終的迭代位置
	  if (nend > end)
	    nend = end;
    // 將分塊的值賦值給 pstart 和 pend 這樣就能夠在并行域當中得到這個分塊的區(qū)間了
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
    // 同樣的原理不過是從大到小達迭代
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }
  
  // 當數(shù)據(jù)分塊比較大的時候執(zhí)行下面的操作
  // 下面的整體的流程相對比較容易理解整個過程就是一個比較并交換的過程
  // 當比較并交換成功之后就返回結(jié)果 返回為 true 或者分塊已經(jīng)分完的話也進行返回
  start = ws->next;
  while (1)
    {
      long left = end - start;
      long tmp;
      // 如果分塊已經(jīng)完全分完 就直接返回 false 
      if (start == end)
	return false;
 
      if (incr < 0)
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;
 
      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;
 
      start = tmp;
    }
 
  *pstart = start;
  *pend = nend;
  return true;
}

gomp_iter_dynamic_next 函數(shù)當中有兩種情況的劃分方式：

當數(shù)據(jù)塊相對比較小的時候，說明劃分的次數(shù)就會相對多一點，在這種情況下如果使用 CAS 的話成功的概率就會相對低，對應的就會降低程序執(zhí)行的效率，因此選擇 __sync_fetch_and_add 以減少多線程的競爭情況，降低 CPU 的消耗。
當數(shù)據(jù)塊比較大的時候，說明劃分的次數(shù)相對比較小，就使用比較并交換的操作（CAS），這樣多個線程在進行競爭的時候開銷就比較小。

在上面的文章當中我們提到了，gomp_loop_init 函數(shù)是對線程共享數(shù)據(jù) work_share 進行初始化操作，如果你對具體 work_share 中的數(shù)據(jù)初始化規(guī)則感興趣，下面是對其初始化的程序：

static inline void
gomp_loop_init (struct gomp_work_share *ws, long start, long end, long incr,
		enum gomp_schedule_type sched, long chunk_size)
{
  ws->sched = sched;
  ws->chunk_size = chunk_size;
  /* Canonicalize loops that have zero iterations to ->next == ->end.  */
  ws->end = ((incr > 0 && start > end) || (incr < 0 && start < end))
	    ? start : end;
  ws->incr = incr;
  ws->next = start;
  if (sched == GFS_DYNAMIC)
    {
      ws->chunk_size *= incr;
 
#ifdef HAVE_SYNC_BUILTINS
      {
	/* For dynamic scheduling prepare things to make each iteration
	   faster.  */
	struct gomp_thread *thr = gomp_thread ();
	struct gomp_team *team = thr->ts.team;
	long nthreads = team ? team->nthreads : 1;
 
	if (__builtin_expect (incr > 0, 1))
	  {
	    /* Cheap overflow protection.  */
	    if (__builtin_expect ((nthreads | ws->chunk_size)
				  >= 1UL << (sizeof (long)
					     * __CHAR_BIT__ / 2 - 1), 0))
	      ws->mode = 0;
	    else
	      ws->mode = ws->end < (LONG_MAX
				    - (nthreads + 1) * ws->chunk_size);
	  }
	/* Cheap overflow protection.  */
	else if (__builtin_expect ((nthreads | -ws->chunk_size)
				   >= 1UL << (sizeof (long)
					      * __CHAR_BIT__ / 2 - 1), 0))
	  ws->mode = 0;
	else
	  ws->mode = ws->end > (nthreads + 1) * -ws->chunk_size - LONG_MAX;
      }
#endif
    }
}

實例分析

在本小節(jié)當中我們將使用一個實際的例子去分析上面我們所談到的整個過程：

#include <stdio.h>
#include <omp.h>
 
int main()
{
#pragma omp parallel for num_threads(4) default(none) schedule(dynamic, 2)
  for(int i = 0; i < 12; ++i)
  {
    printf("i = %d tid = %d\n", i, omp_get_thread_num());
  }
  return 0;
}

上面的程序被編譯之后的結(jié)果如下所示，具體的程序分析和注釋都在下面的匯編程序當中：

000000000040073d <main>:
? 40073d: ? ? ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %rbp
? 40073e: ? ? ? 48 89 e5 ? ? ? ? ? ? ? ?mov ? ?%rsp,%rbp
? 400741: ? ? ? 48 83 ec 20 ? ? ? ? ? ? sub ? ?$0x20,%rsp
? 400745: ? ? ? 48 c7 04 24 02 00 00 ? ?movq ? $0x2,(%rsp) # 這個就是 chunk size 符合上面的代碼當中指定的 2
? 40074c: ? ? ? 00?
? 40074d: ? ? ? 41 b9 01 00 00 00 ? ? ? mov ? ?$0x1,%r9d # 因為是從小到達 incr 這個參數(shù)是 1
? 400753: ? ? ? 41 b8 0c 00 00 00 ? ? ? mov ? ?$0xc,%r8d # 這個參數(shù)是 end 符合上面的程序 12
? 400759: ? ? ? b9 00 00 00 00 ? ? ? ? ?mov ? ?$0x0,%ecx # 這個參數(shù)是 start 符合上面的程序 1
? 40075e: ? ? ? ba 04 00 00 00 ? ? ? ? ?mov ? ?$0x4,%edx # num_threads(4) 線程的個數(shù)是 4
? 400763: ? ? ? be 00 00 00 00 ? ? ? ? ?mov ? ?$0x0,%esi # 因為上面的代碼當中并沒有在并行域當中使用數(shù)據(jù) 因此這個數(shù)據(jù)為 0 也就是 NULL?
? 400768: ? ? ? bf 88 07 40 00 ? ? ? ? ?mov ? ?$0x400788,%edi # 函數(shù)指針 main._omp_fn.0
? 40076d: ? ? ? e8 ce fe ff ff ? ? ? ? ?callq ?400640 <GOMP_parallel_loop_dynamic_start@plt>
? 400772: ? ? ? bf 00 00 00 00 ? ? ? ? ?mov ? ?$0x0,%edi
? 400777: ? ? ? e8 0c 00 00 00 ? ? ? ? ?callq ?400788 <main._omp_fn.0>
? 40077c: ? ? ? e8 5f fe ff ff ? ? ? ? ?callq ?4005e0 <GOMP_parallel_end@plt>
? 400781: ? ? ? b8 00 00 00 00 ? ? ? ? ?mov ? ?$0x0,%eax
? 400786: ? ? ? c9 ? ? ? ? ? ? ? ? ? ? ?leaveq?
? 400787: ? ? ? c3 ? ? ? ? ? ? ? ? ? ? ?retq
??
0000000000400788 <main._omp_fn.0>:
? 400788: ? ? ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %rbp
? 400789: ? ? ? 48 89 e5 ? ? ? ? ? ? ? ?mov ? ?%rsp,%rbp
? 40078c: ? ? ? 53 ? ? ? ? ? ? ? ? ? ? ?push ? %rbx
? 40078d: ? ? ? 48 83 ec 38 ? ? ? ? ? ? sub ? ?$0x38,%rsp
? 400791: ? ? ? 48 89 7d c8 ? ? ? ? ? ? mov ? ?%rdi,-0x38(%rbp)
? 400795: ? ? ? c7 45 ec 00 00 00 00 ? ?movl ? $0x0,-0x14(%rbp)
? 40079c: ? ? ? 48 8d 55 e0 ? ? ? ? ? ? lea ? ?-0x20(%rbp),%rdx
? 4007a0: ? ? ? 48 8d 45 d8 ? ? ? ? ? ? lea ? ?-0x28(%rbp),%rax
? 4007a4: ? ? ? 48 89 d6 ? ? ? ? ? ? ? ?mov ? ?%rdx,%rsi
? 4007a7: ? ? ? 48 89 c7 ? ? ? ? ? ? ? ?mov ? ?%rax,%rdi
? 4007aa: ? ? ? e8 21 fe ff ff ? ? ? ? ?callq ?4005d0 <GOMP_loop_dynamic_next@plt>
? 4007af: ? ? ? 84 c0 ? ? ? ? ? ? ? ? ? test ? %al,%al # 如果 GOMP_loop_dynamic_next 返回值是 0 則跳轉(zhuǎn)到 4007fb 執(zhí)行函數(shù) GOMP_loop_end_nowait
? 4007b1: ? ? ? 74 48 ? ? ? ? ? ? ? ? ? je ? ? 4007fb <main._omp_fn.0+0x73>
? 4007b3: ? ? ? 48 8b 45 d8 ? ? ? ? ? ? mov ? ?-0x28(%rbp),%rax
? 4007b7: ? ? ? 89 45 ec ? ? ? ? ? ? ? ?mov ? ?%eax,-0x14(%rbp)
? 4007ba: ? ? ? 48 8b 45 e0 ? ? ? ? ? ? mov ? ?-0x20(%rbp),%rax
? 4007be: ? ? ? 89 c3 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%ebx
? # ===========================下面的代碼就是執(zhí)行循環(huán)和 body =================
? 4007c0: ? ? ? e8 2b fe ff ff ? ? ? ? ?callq ?4005f0 <omp_get_thread_num@plt>
? 4007c5: ? ? ? 89 c2 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%edx
? 4007c7: ? ? ? 8b 45 ec ? ? ? ? ? ? ? ?mov ? ?-0x14(%rbp),%eax
? 4007ca: ? ? ? 89 c6 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%esi
? 4007cc: ? ? ? bf 94 08 40 00 ? ? ? ? ?mov ? ?$0x400894,%edi
? 4007d1: ? ? ? b8 00 00 00 00 ? ? ? ? ?mov ? ?$0x0,%eax
? 4007d6: ? ? ? e8 25 fe ff ff ? ? ? ? ?callq ?400600 <printf@plt>
? 4007db: ? ? ? 83 45 ec 01 ? ? ? ? ? ? addl ? $0x1,-0x14(%rbp)
? 4007df: ? ? ? 39 5d ec ? ? ? ? ? ? ? ?cmp ? ?%ebx,-0x14(%rbp)
? 4007e2: ? ? ? 7c dc ? ? ? ? ? ? ? ? ? jl ? ? 4007c0 <main._omp_fn.0+0x38>
? # ======================================================================
? # ============下面的代碼主要是進行 while 循環(huán)查看循環(huán)是否執(zhí)行完成==============
? 4007e4: ? ? ? 48 8d 55 e0 ? ? ? ? ? ? lea ? ?-0x20(%rbp),%rdx
? 4007e8: ? ? ? 48 8d 45 d8 ? ? ? ? ? ? lea ? ?-0x28(%rbp),%rax
? 4007ec: ? ? ? 48 89 d6 ? ? ? ? ? ? ? ?mov ? ?%rdx,%rsi
? 4007ef: ? ? ? 48 89 c7 ? ? ? ? ? ? ? ?mov ? ?%rax,%rdi
? 4007f2: ? ? ? e8 d9 fd ff ff ? ? ? ? ?callq ?4005d0 <GOMP_loop_dynamic_next@plt>
? 4007f7: ? ? ? 84 c0 ? ? ? ? ? ? ? ? ? test ? %al,%al
? 4007f9: ? ? ? 75 b8 ? ? ? ? ? ? ? ? ? jne ? ?4007b3 <main._omp_fn.0+0x2b>
? # ======================================================================
? 4007fb: ? ? ? e8 10 fe ff ff ? ? ? ? ?callq ?400610 <GOMP_loop_end_nowait@plt>
? 400800: ? ? ? 48 83 c4 38 ? ? ? ? ? ? add ? ?$0x38,%rsp
? 400804: ? ? ? 5b ? ? ? ? ? ? ? ? ? ? ?pop ? ?%rbx
? 400805: ? ? ? 5d ? ? ? ? ? ? ? ? ? ? ?pop ? ?%rbp
? 400806: ? ? ? c3 ? ? ? ? ? ? ? ? ? ? ?retq ??
? 400807: ? ? ? 66 0f 1f 84 00 00 00 ? ?nopw ? 0x0(%rax,%rax,1)
? 40080e: ? ? ? 00 00

總結(jié)

在本篇文章當中我們主要分析了 OpenMP 當中 for 循環(huán)動態(tài)調(diào)度方式的具體實現(xiàn)原理，以及動態(tài)庫函數(shù)的分析。整個過程主要有兩大部分，一個是編譯角度，編譯器會將 for construct 編譯成什么樣子，以及動態(tài)庫函數(shù)具體是如何劃分迭代分塊的。在迭代分塊當中主要分為兩種方式，當分塊數(shù)目多的時候不使用 CAS 因為這樣線程之間競爭比較激烈，但是當分塊數(shù)目比較小的時候就使用 CAS ，這種做法可以提高程序執(zhí)行的效率。

原文鏈接：https://www.cnblogs.com/Chang-LeHung/p/17089310.html

上一篇：Pytorch中關(guān)于model.eval()的作用及分析_p
下一篇：C++聚合體初始化aggregate?initializat

日本免费高清视频-国产福利视频导航-黄色在线播放国产-天天操天天操天天操天天操|www.shdianci.com

網(wǎng)站首頁編程語言正文

OpenMP中For?Construct對dynamic的調(diào)度方式詳解_C 語言

目錄

前言

前置知識

dynamic 調(diào)度方式分析

實例分析

總結(jié)

相關(guān)推薦

日本免费高清视频-国产福利视频导航-黄色在线播放国产-天天操天天操天天操天天操|www.shdianci.com

網(wǎng)站首頁 編程語言 正文

OpenMP中For?Construct對dynamic的調(diào)度方式詳解_C 語言

目錄

前言

前置知識

dynamic 調(diào)度方式分析

實例分析

總結(jié)

相關(guān)推薦

網(wǎng)站首頁編程語言正文