Data locality: on NUMA systems, memory pages are typically allocated on the NUMA node of the thread that first touches (writes) them, so data should be initialized with the same thread layout that is later used in the compute loops.
Figure 12-6: Initialization with and without first touch
- //Step 1.a Initialization by initial thread only
- for (j = 0; j < VectorSize; j++) {
- a[j] = 1.0; b[j] = 2.0; c[j] = 0.0;}
-
- //Step 1.b Initialization by all threads (first touch)
- omp_set_dynamic(0);
- #pragma omp parallel for schedule(static)
- for (j = 0; j < VectorSize; j++) {
- a[j] = 1.0; b[j] = 2.0; c[j] = 0.0;}
-
- //Step 2 Compute
- #pragma omp parallel for schedule(static)
- for (j = 0; j < VectorSize; j++) {
- a[j] = b[j] + d * c[j];}
place: the hardware resources that OpenMP threads are bound to
Defining places: export OMP_PLACES="{0,1,2,3},{4,5,6,7}"
export OMP_PLACES="{0:4},{4:4}"
export OMP_PLACES=threads (bind OpenMP threads at the granularity of hardware threads), cores, sockets
Controlling processor binding: OMP_PROC_BIND takes true, false, master, close (the threads in the team are placed on places close to the master thread; threads are assigned to consecutive places in a round-robin fashion, starting at the place to the right of the master thread), or spread (distribute the threads as evenly as possible across the places)
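To check how the runtime actually mapped threads to places, the place API can be queried at run time. Below is a minimal sketch (not from the book), assuming OMP_PLACES and OMP_PROC_BIND were exported before running it:
- // Minimal sketch: report the place each thread is bound to.
- // Assumes OMP_PLACES/OMP_PROC_BIND were set in the environment.
- #include <stdio.h>
- #include <omp.h>
- int main()
- {
-    printf("number of places = %d\n", omp_get_num_places());
-    #pragma omp parallel
-    {
-       printf("thread %d runs on place %d\n",
-              omp_get_thread_num(), omp_get_place_num());
-    }
-    return 0;
- }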
Using nested parallel constructs to influence how threads are distributed on a NUMA system:
export OMP_NESTED=true (deprecated in OpenMP 5.0)
export OMP_MAX_ACTIVE_LEVELS=3
- #include <stdio.h>
- #include <omp.h>
- void report_num_threads(int level)
- {
- #pragma omp single
- {
- printf("Level %d: number of threads in the team: %d\n", \
- level, omp_get_num_threads());
- }
- }
- int main()
- {
- omp_set_dynamic(0);
- #pragma omp parallel num_threads(2)
- {
- report_num_threads(1);
- #pragma omp parallel num_threads(2)
- {
- report_num_threads(2);
- #pragma omp parallel num_threads(2)
- {
- report_num_threads(3);
- }
- }
- }
- return(0);
- }
The OMP_NUM_THREADS, OMP_PLACES, and OMP_PROC_BIND environment variables have been extended to support nesting by accepting a list of values, one per nesting level:
export OMP_PLACES=sockets,threads
export OMP_NUM_THREADS=2,4
export OMP_PROC_BIND=spread,close
When the program starts, a single initial thread runs on the first hardware thread of core 0. When the first parallel region is encountered, the first values of OMP_NUM_THREADS and OMP_PROC_BIND (2, spread) are used: two threads are created, one per socket. Note that each thread may run on any core and any hardware thread of the place it is bound to, which in this example is a socket, so it can run anywhere within its own socket. After the parallel region is created, the internal control variables advance to the next values in the lists: 4 for the number of threads and close for the binding policy. When each thread then encounters the nested parallel region, it creates a team of 4 threads placed close together on the same core.
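A minimal sketch (not from the book) that can be used to verify this nested placement by reporting each thread's level, parent thread, and place; it assumes the three environment variables above have been exported:
- #include <stdio.h>
- #include <omp.h>
- int main()
- {
-    omp_set_max_active_levels(2);  // make sure the nested level is active
-    #pragma omp parallel           // level 1: 2 threads, spread over the sockets
-    {
-       #pragma omp parallel        // level 2: 4 threads each, bound close
-       {
-          printf("level %d, outer thrd %d, inner thrd %d, place %d\n",
-                 omp_get_level(), omp_get_ancestor_thread_num(1),
-                 omp_get_thread_num(), omp_get_place_num());
-       }
-    }
-    return 0;
- }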
OMP_DISPLAY_AFFINITY (have the runtime print each thread's affinity information)
OMP_AFFINITY_FORMAT (the format string used for that output)
Figure 12-12: Displaying thread affinity for the STREAM benchmark with OMP_DISPLAY_AFFINITY and OMP_AFFINITY_FORMAT:
- $ icc -qopenmp -DNTIMES=20 -DSTREAM_ARRAY_SIZE=64000000 -c stream.c
- $ icc -qopenmp -o stream stream.o
- $ export OMP_DISPLAY_AFFINITY=true
- $ export OMP_AFFINITY_FORMAT="Thrd Lev=%3L, thrd_num=%5n, thrd_aff=%15A"
- $ export OMP_PLACES=threads
- $ export OMP_NUM_THREADS=8
- $ export OMP_PROC_BIND=spread
-
- $ ./stream | sort -k3
- Thrd Lev=1 , thrd_num=0 , thrd_aff=0
- Thrd Lev=1 , thrd_num=1 , thrd_aff=8
- Thrd Lev=1 , thrd_num=2 , thrd_aff=16
- Thrd Lev=1 , thrd_num=3 , thrd_aff=24
- Thrd Lev=1 , thrd_num=4 , thrd_aff=1
- Thrd Lev=1 , thrd_num=5 , thrd_aff=9
- Thrd Lev=1 , thrd_num=6 , thrd_aff=17
- Thrd Lev=1 , thrd_num=7 , thrd_aff=25
-
- $ export OMP_PROC_BIND=close
- $ ./stream |sort -k3
- Thrd Lev=1 , thrd_num=0 , thrd_aff=0
- Thrd Lev=1 , thrd_num=1 , thrd_aff=32
- Thrd Lev=1 , thrd_num=2 , thrd_aff=2
- Thrd Lev=1 , thrd_num=3 , thrd_aff=34
- Thrd Lev=1 , thrd_num=4 , thrd_aff=4
- Thrd Lev=1 , thrd_num=5 , thrd_aff=36
- Thrd Lev=1 , thrd_num=6 , thrd_aff=6
- Thrd Lev=1 , thrd_num=7 , thrd_aff=38
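The same information can also be printed from inside the program with the OpenMP 5.0 runtime routine omp_display_affinity(). A minimal sketch (not from the book):
- #include <omp.h>
- int main()
- {
-    #pragma omp parallel
-    {
-       // NULL means: use the format from the OMP_AFFINITY_FORMAT ICV
-       omp_display_affinity(NULL);
-    }
-    return 0;
- }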
Thread affinity and data locality:
The general recommendation is to have at least one process (e.g., one MPI rank or one OS process) per NUMA domain, and to use the OpenMP threads for parallelism within that NUMA domain, keeping the data they need inside the same domain. This limits the damage done by any mistakes in first-touch initialization, since accesses no longer have to cross a NUMA domain boundary. Another recommendation is to spread the threads far apart to exploit the aggregate memory bandwidth, and then fork the innermost work in nested parallel regions with close binding to maximize cache locality.
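The proc_bind clause lets a program request this spread-then-close strategy directly on its parallel constructs. Below is a minimal sketch (not from the book); the array, its size, and the loop body are placeholders, and N is assumed to divide evenly among the outer threads:
- #include <omp.h>
- #define N 1048576
- static double a[N];
- int main()
- {
-    omp_set_max_active_levels(2);           // allow the nested level to be active
-    #pragma omp parallel proc_bind(spread)  // outer team spread across NUMA domains
-    {
-       int nth   = omp_get_num_threads();
-       int chunk = N / nth;                 // assumes N divides evenly
-       int lo    = omp_get_thread_num() * chunk;
-
-       #pragma omp parallel for proc_bind(close)  // inner team packed near its parent
-       for (int i = lo; i < lo + chunk; i++)
-          a[i] = 2.0 * a[i];                // placeholder work
-    }
-    return 0;
- }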
Figure 12-17: Vectorizing the Pi program with OpenMP:
- #include <stdio.h>
- #include <omp.h>
- static long num_steps = 100000;
- float step;
- int main ()
- {
- int i;
- float x, pi, sum = 0.0;
-
- step = 1.0f / (float) num_steps;
-
- #pragma omp simd private(x) reduction(+:sum)
- for (i = 0; i < num_steps; i++) {
- x = (i + 0.5f) * step;
- sum += 4.0f / (1.0f + x * x);
- }
-
- pi = step * sum;
- printf("pi=%lf \n", pi);
- }
Figure 12-18: Multithreading and vectorizing the Pi program with OpenMP:
- #include <stdio.h>
- #include <omp.h>
- static long num_steps = 100000000;
- double step;
- int main ()
- {
- int i;
- double x, pi, sum = 0.0;
-
- step = 1.0 / (double) num_steps;
-
- #pragma omp parallel for simd private(x) reduction(+:sum)
- for (i = 0; i < num_steps; i++) {
- x = (i + 0.5) * step;
- sum += 4.0 / (1.0 + x * x);
- }
-
- pi = step * sum;
- printf("pi=%f\n",pi);
- }
The target directive and its associated structured block define a target region that is offloaded for execution on a device. The target directive also causes data to be moved to the device, and when the target region finishes executing, that data is copied back from the device to the host.
Figure 12-19: Offloading a loop with the target directive:
- #include <stdio.h>
- #include <omp.h>
- #define N 1024
- int main()
- {
- float a[N], b[N], c[N];
- int i;
-
- // initialize a, b, and c (code not shown)
-
- #pragma omp target
- #pragma omp teams distribute parallel for simd
- for (i = 0; i < N; i++)
- c[i] += a[i] * b[i];
- }
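A minimal sketch (not from the book) to check that a device is actually available and that a target region really runs on it, using omp_get_num_devices() and omp_is_initial_device():
- #include <stdio.h>
- #include <omp.h>
- int main()
- {
-    int on_host = 1;
-    printf("devices available: %d\n", omp_get_num_devices());
-    #pragma omp target map(tofrom: on_host)
-    on_host = omp_is_initial_device();   // 0 if the region ran on a device
-    printf("target region ran on %s\n", on_host ? "the host" : "a device");
-    return 0;
- }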
For a GPU, the target directive is followed by the directive below and its associated loop:
#pragma omp teams distribute parallel for simd
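The combined construct can also be written with its component directives split out, which makes the levels of parallelism easier to see. A minimal sketch (not from the book), mirroring the loop of Figure 12-19:
- #include <omp.h>
- #define N 1024
- int main()
- {
-    float a[N], b[N], c[N];
-    int i;
-
-    // initialize a, b, and c (code not shown), as in Figure 12-19
-
-    #pragma omp target                        // offload the region to the default device
-    #pragma omp teams                         // create a league of teams (e.g., GPU thread blocks)
-    #pragma omp distribute parallel for simd  // split the loop across teams, threads, and SIMD lanes
-    for (i = 0; i < N; i++)
-       c[i] += a[i] * b[i];
- }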
The map clause: controls data movement between host and device memory; the map-types are to, from, tofrom, and alloc.
Figure 12-21: Moving data explicitly with map clauses on each target construct:
- #include <stdio.h>
- #include <stdlib.h>
- #include <omp.h>
- #define N 1024
- int main()
- {
- float *a, *b, *c, *d;
- int i;
-
- a = (float*) malloc(N * sizeof(float));
- b = (float*) malloc(N * sizeof(float));
- c = (float*) malloc(N * sizeof(float));
- d = (float*) malloc(N * sizeof(float));
-
- // initialize a, b, c, and d (code not shown)
-
- #pragma omp target map(to:a[0:N],b[0:N]) map(tofrom:c[0:N])
- #pragma omp teams distribute parallel for simd
- for (i = 0; i < N; i++)
- c[i] += a[i] * b[i];
-
- #pragma omp target map(to:a[0:N],c[0:N]) map(tofrom:d[0:N])
- #pragma omp teams distribute parallel for simd
- for (i = 0; i < N; i++)
- d[i] += a[i] + c[i];
- }
Figure 12-22: Using a target data region to keep the arrays resident on the device across both target regions:
- #include <stdio.h>
- #include <stdlib.h>
- #include <omp.h>
- #define N 1024
- int main()
- {
- float *a, *b, *c, *d;
- int i;
-
- a = (float*)malloc(N*sizeof(float));
- b = (float*)malloc(N*sizeof(float));
- c = (float*)malloc(N*sizeof(float));
- d = (float*)malloc(N*sizeof(float));
-
- // initialize a, b, c, and d (code not shown)
-
- #pragma omp target data map(to:a[0:N],b[0:N],c[0:N]) map(tofrom:d[0:N])
- {
- #pragma omp target
- #pragma omp teams distribute parallel for simd
- for (i = 0; i < N; i++)
- c[i] += a[i] * b[i];
-
- #pragma omp target
- #pragma omp teams distribute parallel for simd
- for (i = 0; i < N; i++)
- d[i] += a[i] + c[i];
- }
-
- // continue in the program but only using d (not c)
-
- }