• openmp 超越通用核心


    目录

    并行构造子句:

     共享工作循环构造:

    任务构造:

    多线程功能:

    运行时库例程:


    并行构造子句:

    1.if() 如果表达式为真,则创建线程组

    2.num_threads() 明确指定并行区域所使用的线程数

    3.

    图10-1:

    // sample compile command: "gcc -fopenmp -c Fig_10.1_parClaw.c" to generate *.o object file
    // Demonstrates parallel-construct clauses: if(), num_threads(), default(none),
    // shared/private data-sharing, and a + reduction over the matrix diagonal.
    #include <stdio.h>
    #include <stdlib.h>
    #include <omp.h>
    // initialization and transform functions
    // (we will not show the function bodies)
    extern void initMats(int N, float *A, float *T);
    extern void transform(int N, int id, int Nthrds, float *A, float *T);
    int main(int argc, char **argv)
    {
       float trace = 0;
       int i, id, N, Nthrds;
       float *A, *T;
       // set matrix order N from the command line (default 10)
       if (argc == 2)
          N = atoi(argv[1]);
       else
          N = 10;
       // allocate space for two N x N matrices and initialize them
       T = (float *) malloc(N * N * sizeof(float));
       A = (float *) malloc(N * N * sizeof(float));
       if (A == NULL || T == NULL)  // never dereference a failed allocation
          return 1;
       initMats(N, A, T);
       // if(N>100): only fork a thread team when the problem is large enough to
       // amortize the fork/join overhead; otherwise the region runs serially.
       #pragma omp parallel if(N>100) num_threads(4) default(none) \
               shared(A,T,N) private(i,id,Nthrds) reduction(+:trace)
       {
          id = omp_get_thread_num();
          Nthrds = omp_get_num_threads();
          // NOTE(review): transform is declared (N,id,Nthrds,A,T) but called with
          // T and A swapped -- confirm the intended argument order against the source.
          transform(N, id, Nthrds, T, A);
          // compute trace of A matrix
          // i.e., the sum of diagonal elements
          #pragma omp for
          for (i = 0; i < N; i++)
             trace += *(A + i*N + i);
       }
       printf(" transform complete with trace = %f\n", trace);
       free(A);
       free(T);
       return 0;
    }

     共享工作循环构造:

    子句:schedule(runtime)

    bash shell命令行:修改相应环境变量:export OMP_SCHEDULE="dynamic,7"

    配对函数:omp_set_schedule(kind,chunk_size) / omp_get_schedule(&kind,&chunk_size)

    kind:typedef enum omp_sched_t{

            omp_sched_static = 1,

      omp_sched_dynamic = 2,

      omp_sched_guided = 3,

      omp_sched_auto = 4,

            }omp_sched_t;

    图10-3:

    // sample compile command: "gcc -fopenmp -c Fig_10.3_runtimeEx.c" to generate *.o object file
    // Molecular-dynamics force kernel using schedule(runtime): the loop schedule
    // is chosen at run time from OMP_SCHEDULE or a prior omp_set_schedule() call.
    #include <stdio.h>
    #include <omp.h>
    #define DEBUG 1
    // map schedule kind enum values to strings for printing
    static const char *schdKind[] = { "ERR", "static", "dynamic", "guided", "auto" };
    // external function for potential energy term
    extern double pot(double dist);
    void forces(int npart, double x[], double f[], double side, double rcoff)
    {
       #pragma omp parallel for schedule(runtime)
       for (int i = 0; i < npart*3; i += 3) {
          // zero force components on particle i
          double fxi = 0.0; double fyi = 0.0; double fzi = 0.0;
          // loop over all particles with index > i
          for (int j = i + 3; j < npart * 3; j += 3) {
             // compute distance between i and j with wraparound
             double xx = x[i]   - x[j];
             double yy = x[i+1] - x[j+1];
             double zz = x[i+2] - x[j+2];
             if (xx < (-0.5*side)) xx += side; if (xx > (0.5*side)) xx -= side;
             if (yy < (-0.5*side)) yy += side; if (yy > (0.5*side)) yy -= side;
             if (zz < (-0.5*side)) zz += side; if (zz > (0.5*side)) zz -= side;
             double rd = xx * xx + yy * yy + zz * zz;
             // if distance is inside cutoff radius, compute forces
             if (rd <= rcoff*rcoff) {
                double fcomp = pot(rd);
                fxi += xx*fcomp; fyi += yy*fcomp; fzi += zz*fcomp;
                // NOTE(review): iterations on different threads can update the
                // same f[j] concurrently -- a data race unless protected (e.g.
                // atomics or per-thread force arrays); confirm against the source.
                f[j] -= xx*fcomp; f[j+1] -= yy*fcomp; f[j+2] -= zz*fcomp;
             }
          }
          // update forces on particle i
          f[i] += fxi; f[i+1] += fyi; f[i+2] += fzi;
       }
    #ifdef DEBUG
       // report which schedule the runtime actually used for the loop above
       omp_sched_t kind;
       int chunk_size;
       omp_get_schedule(&kind, &chunk_size);
       printf("schedule(%s,%d)\n", schdKind[kind], chunk_size);
    #endif
    }

    子句:collapse(n),对循环进行合并

    如果是三层循环可以collapse(3)。但是要考虑数据竞争。详情建议参考 雷洪 多核异构并行计算

    图10-4:

    // sample compile command: "gcc -fopenmp -c Fig_10.4_loopCollapse.c" to generate *.o object file
    // Apply a function (*MFUNC) to each element of an N by M array stored in
    // row-major order in A. collapse(2) fuses both loops into a single N*M
    // iteration space so all four threads get work even when N < 4, and
    // if(N*M>100) avoids forking a team for tiny arrays.
    // (No OpenMP runtime routines are called, so <omp.h> is not required.)
    void Apply(int N, int M, float *A, void (*MFUNC)(int, int, float *))
    {
       #pragma omp parallel for num_threads(4) collapse(2) if(N*M>100)
       for (int i = 0; i < N; i++)
          for (int j = 0; j < M; j++)
             MFUNC(i, j, (A + i*M + j));
    }

    任务构造:

    子句:

    untied:任务默认是绑定的(任务绑定到开始执行它的线程),可以用untied子句显式地将任务标记为非绑定状态。

    priority():设置任务优先级

    环境变量OMP_MAX_TASK_PRIORITY.设置最大优先级

    depend():定义任务之间的执行顺序

    图10-6:

    // sample compile command: "gcc -fopenmp -c Fig_10.6_taskDep.c" to generate *.o object file
    // Task dependences: depend(out:X) marks a task as the producer of X;
    // depend(in:X) makes a later sibling task wait for X's producer to finish.
    #include <omp.h>
    // functions Awork through Ework not shown; prototyped here so the calls
    // are not implicit declarations (invalid since C99)
    extern void Awork(float *);
    extern void Bwork(float *);
    extern void Cwork(float *);
    extern void Dwork(float *);
    extern void Ework(float *);
    int main()
    {
       float A, B, C, D, E;
       #pragma omp parallel shared(A, B, C, D, E)
       {
          // one thread creates all the tasks; the whole team executes them
          #pragma omp single
          {
             #pragma omp task depend(out:A)
             Awork(&A);
             #pragma omp task depend(out:E)
             Ework(&E);
             // B and C both wait on A's producer, then may run concurrently
             #pragma omp task depend(in:A) depend(out:B)
             Bwork(&B);
             #pragma omp task depend(in:A) depend(out:C)
             Cwork(&C);
             // D runs last, once B, C, and E have all been produced
             // (fixed: the original listing passed &E to Dwork, leaving D unused)
             #pragma omp task depend(in:B,C,E)
             Dwork(&D);
          }
       }
    }

    多线程功能:

    threadprivate:线程私有指令:

    图10-7:

    // sample compile command: "gcc -fopenmp -c Fig_10.7_threadpriv.c" to generate *.o object file
    // threadprivate: each thread keeps its own persistent copy of 'counter',
    // so every thread can count the tasks it executed without any race.
    #include <stdio.h>
    #include <stdlib.h>
    #include <omp.h>
    struct node {
       int data;
       struct node *next;
    };
    // list helpers not shown; prototyped here so the calls below are not
    // implicitly declared (the original listing warned about this)
    extern void init_list(struct node *p);
    extern void processwork(struct node *p);
    extern void freeList(struct node *p);
    int counter = 0;
    #pragma omp threadprivate(counter)
    // increment the calling thread's private copy of counter
    void inc_count()
    {
       counter++;
    }
    int main()
    {
       struct node *p = NULL;
       struct node *head = NULL;
       // NOTE(review): p is passed by value while still NULL, so init_list cannot
       // hand a list back through it -- likely should take &p; confirm against source.
       init_list(p);
       head = p;
       #pragma omp parallel
       {
          // one thread walks the list and spawns one task per node
          #pragma omp single
          {
             p = head;
             while (p) {
                // firstprivate(p): each task captures the node it must process
                #pragma omp task firstprivate(p)
                {
                   inc_count();
                   processwork(p);
                }
                p = p->next;
             }
          }
          // every team member reports its own threadprivate task count
          printf("thread %d ran %d tasks\n", omp_get_thread_num(), counter);
       }
       freeList(p);
       return 0;
    }

    子句copyin(list): 进入并行区域时,把主线程的threadprivate变量的值复制到各线程对应的私有副本中。

    master构造:让主线程处理结构化块:

    atomic构造:保护一个变量为原子操作。 

    环境变量 OMP_STACKSIZE:设置每个线程的栈空间大小。

    运行时库例程:

    omp_get_max_threads() //获取最大线程数

    omp_set_dynamic //允许运行时在进入各并行区域时动态调整线程组大小,称为动态模式。

    omp_in_parallel //判断当前是否处于活跃的并行区域内

    同步和内存模型

    图11-5:使用冲刷和原子性的成对同步:使用原子性更新然后读取flag

    1. int flag = 0; // a flag to communicate when the consumer can start
    2. omp_set_num_threads(2);
    3. #pragma omp parallel shared(A, flag)
    4. {
    5. int id = omp_get_thread_num();
    6. int nthrds = omp_get_num_threads();
    7. // we need two or more threads for this program
    8. if ((id == 0) && (nthrds < 2)) exit(-1);
    9. if (id == 0) {
    10. produce(A);
    11. #pragma omp flush
    12. #pragma omp atomic write
    13. flag = 1;
    14. }
    15. if (id == 1) {
    16. while(1) {
    17. #pragma omp atomic read
    18. flag_temp = flag;
    19. if (flag_temp != 0) break;
    20. }
    21. #pragma omp flush
    22. consume (A);
    23. }
    24. }

     openmp的锁:

    图11-6:

    // sample compile command: "gcc -fopenmp -c Fig_11.6_hist.c" to generate *.o object file
    // Histogram of a pseudorandom sequence with one lock per bin, so concurrent
    // increments of the same bin never race while distinct bins never contend.
    #include <omp.h>
    #include <math.h>
    //#include "random.h" //seed() and drandom()
    extern double drandom();
    extern void seed(double low_in, double hi_in);
    #define num_trials 1000000 // number of x values
    #define num_bins 100 // number of bins in histogram
    static double xlow = 0.0;  // low end of x range (fixed: was long, truncating the double initializer)
    static double xhi = 100.0; // high end of x range
    int main()
    {
       double x;
       long hist[num_bins]; // the histogram
       double bin_width; // the width of each bin in the histogram
       omp_lock_t hist_lcks[num_bins]; // array of locks, one per bucket
       seed(xlow, xhi); // seed random generator over range of x
       bin_width = (xhi - xlow) / (double)num_bins;
       // initialize the histogram and the array of locks
       #pragma omp parallel for schedule(static)
       for (int i = 0; i < num_bins; i++) {
          hist[i] = 0;
          omp_init_lock(&hist_lcks[i]);
       }
       // test uniform pseudorandom sequence by assigning values
       // to the right histogram bin
       #pragma omp parallel for schedule(static) private(x)
       for (int i = 0; i < num_trials; i++) {
          x = drandom();
          // fixed: cast must apply after the division; the original
          // "(long)(x - xlow)/bin_width" truncated x before dividing
          long ival = (long)((x - xlow) / bin_width);
          // clamp in case x lands exactly on the top of the range
          if (ival < 0) ival = 0;
          if (ival >= num_bins) ival = num_bins - 1;
          // protect histogram bins. Low overhead due to uncontended locks
          omp_set_lock(&hist_lcks[ival]);
          hist[ival]++;
          omp_unset_lock(&hist_lcks[ival]);
       }
       double sumh = 0.0, sumhsq = 0.0, ave, std_dev;
       // compute statistics (ave, std_dev) and destroy locks
       #pragma omp parallel for schedule(static) reduction(+:sumh,sumhsq)
       for (int i = 0; i < num_bins; i++) {
          sumh += (double)hist[i];
          sumhsq += (double)hist[i] * hist[i];
          omp_destroy_lock(&hist_lcks[i]);
       }
       ave = sumh / num_bins;
       std_dev = sqrt(sumhsq / ((double)num_bins) - ave * ave);
       (void)ave; (void)std_dev; // computed for illustration; the figure never prints them
       return 0;
    }

    临界区的实现是使用锁实现的,当我们使用 #pragma omp critical 的时候,我们默认是使用的 OpenMP 内部的默认锁实现的,如果你在其他地方也使用 #pragma omp critical 的话使用的也是同一把锁,因此即使你用 #pragma omp critical 创建多个临界区你使用的也是同一把锁,也就是说这多个临界区在同一时刻也只会有一个线程在一个临界区执行,其余的临界区是没有线程在执行的,因为所有的临界区使用同一把锁,而一个时刻只能够有一个线程获得锁。

    为了解决上面所谈到的问题,在 OpenMP 当中使用 critical 构造代码块的时候我们可以指定一个名字,以此用不同的锁在不同的临界区。

    参考链接:https://zhuanlan.zhihu.com/p/600324334
    总结:锁比临界区性能好

    内存模型

    seq_cst顺序一致,release,acquire,acquire_release

    图11-7:

    // Same producer/consumer handshake as Fig 11.5, but using sequentially
    // consistent atomics (seq_cst) instead of explicit flush directives.
    #include <stdlib.h>
    #include <omp.h>
    // producer/consumer bodies are not shown in the figure
    extern void produce(double *A);
    extern void consume(double *A);
    int main()
    {
       // fixed: A was uninitialized; NOTE(review) the figure never allocates
       // storage for A -- presumably produce() supplies it; confirm against source.
       double *A = NULL;
       int flag = 0; // a flag to communicate when the consumer can start
       omp_set_num_threads(2);
       #pragma omp parallel shared(A, flag)
       {
          int id = omp_get_thread_num();
          int nthrds = omp_get_num_threads();
          int flag_temp;
          // we need two or more threads for this program
          if ((id == 0) && (nthrds < 2)) exit(-1);
          if (id == 0) {
             produce(A);
             // seq_cst write acts as a release: the production of A is
             // visible before any thread can observe flag == 1
             #pragma omp atomic write seq_cst
             flag = 1;
          }
          if (id == 1) {
             // spin on seq_cst (acquire) reads until the flag is raised
             while (1) {
                #pragma omp atomic read seq_cst
                flag_temp = flag;
                if (flag_temp != 0) break;
             }
             consume(A);
          }
       }
       return 0;
    }

  • 相关阅读:
    欧科云链携手上海数据交易所,关于未来的“超前实践”正在发生...
    快1倍,我在 M1 Max 上开发 iOS 应用有了这些发现
    怎么修改Jenkins的默认工作路径,最简单高效的方式
    【嵌入式Linux应用开发】设计温湿度采集MCU子系统
    深入浅出Dockerfile实战
    笔试刷题Day—1
    数据降维——因子分析
    用深度强化学习来玩Chrome小恐龙快跑
    《深入浅出.NET框架设计与实现》阅读笔记(一)
    基于Vue+SpringBoot的无代码动态表单系统 开源项目
  • 原文地址:https://blog.csdn.net/qq_52758467/article/details/133812106