• CPU GEMM: register optimization via micro-level data reuse


    I. The triple for-loop version

    Save as hello_gemm_1x1.cpp

    Compile:

    g++ hello_gemm_1x1.cpp -o hello_gemm_1x1.out -O3
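
    Note the storage convention implied by the indexing in the listing below: A and C are addressed column-major, B row-major (hence the A_N_B_T in the kernel names). A minimal sketch of the index mapping, with helper names that are purely illustrative and not part of the original code:

    // Column-major: element (row i, col j) of a matrix with `rows` rows sits at index i + j*rows.
    inline int idx_col_major(int i, int j, int rows){ return i + j*rows; }
    // Row-major: element (row i, col j) of a matrix with `cols` columns sits at index i*cols + j.
    inline int idx_row_major(int i, int j, int cols){ return i*cols + j; }
    // In the kernels: A uses idx_col_major(i, k, M_), B uses idx_row_major(k, j, N_),
    // and C uses idx_col_major(i, j, M_).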

    #include <iostream>
    #include <cstdlib>   // malloc
    #include <cstring>   // memset (added: the kernels accumulate into C with +=)
    using namespace std;

    // Storage convention: A (M x K) and C (M x N) are column-major, B (K x N) is row-major.
    void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
        for(int i=0; i<M_; i++){
            for(int j=0; j<N_; j++){
                for(int k=0; k<K_; k++){
                    C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    // 2x2 register blocking: each k iteration loads 4 values and feeds 4 multiply-adds.
    // Note: 'register' is only a hint (deprecated since C++11, removed in C++17).
    void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
        register float a1, a2, b1, b2;
        for(int i=0; i<M_/2; i++){
            for(int j=0; j<N_/2; j++){
                for(int k=0; k<K_; k++){
                    a1 = A[2*i     + k*M_];       // A(2*i,   k)
                    a2 = A[(2*i+1) + k*M_];       // A(2*i+1, k)
                    b1 = B[k*N_ + 2*j];           // B(k, 2*j)
                    b2 = B[k*N_ + (2*j+1)];       // B(k, 2*j+1)
                    C[2*i     + 2*j*M_]     += a1*b1;
                    C[2*i     + (2*j+1)*M_] += a1*b2;
                    C[(2*i+1) + 2*j*M_]     += a2*b1;
                    C[(2*i+1) + (2*j+1)*M_] += a2*b2;
                    // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
            A[idx] = idx%mod + 1;
        }
    }

    void print_mat(float* A, int M, int N, bool colMajor){
        cout<<endl;
        for(int i=0; i<M; i++){
            for(int j=0; j<N; j++){
                cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
            }
            cout<<endl;
        }
    }

    int main(){
        int M = 1024;
        int N = 1024;
        int K = 1024;
        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;
        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));
        memset(C_1x1, 0, M*N*sizeof(float));   // added: zero the accumulators
        memset(C_2x2, 0, M*N*sizeof(float));
        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);
        // print_mat(A, M, K, true);
        // print_mat(B, K, N, false);
        gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        // gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
        // print_mat(C_1x1, M, N, true);
        // print_mat(C_2x2, M, N, true);
        cout<<"C(M,N) = "<<C_1x1[M*N-1]<<endl;
        // cout<<"C(M,N) = "<<C_2x2[M*N-1]<<endl;
        return 0;
    }
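
    The listing prints only the last element of C and contains no timing code; the seconds reported later in this post can be measured with a small wrapper such as the one below. This is a minimal sketch added here for illustration, assuming it is pasted into the same file as gemm_A_N_B_T_1x1:

    #include <chrono>

    // Illustrative timing helper (not part of the original post): times one call
    // to the 1x1 kernel with a wall-clock and prints the elapsed seconds.
    double time_gemm_1x1(float* A, float* B, float* C, int M, int N, int K){
        auto t0 = std::chrono::steady_clock::now();
        gemm_A_N_B_T_1x1(A, B, C, M, N, K);
        auto t1 = std::chrono::steady_clock::now();
        double sec = std::chrono::duration<double>(t1 - t0).count();
        std::cout << "gemm 1x1: " << sec << " s" << std::endl;
        return sec;
    }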

    II. block2x2 register optimization

    Save as hello_gemm_2x2.cpp

    Compile:

    g++ hello_gemm_2x2.cpp -o hello_gemm_2x2.out -O3

    #include <iostream>
    #include <cstdlib>   // malloc
    #include <cstring>   // memset (added: the kernels accumulate into C with +=)
    using namespace std;

    // Storage convention: A (M x K) and C (M x N) are column-major, B (K x N) is row-major.
    void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
        for(int i=0; i<M_; i++){
            for(int j=0; j<N_; j++){
                for(int k=0; k<K_; k++){
                    C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    // 2x2 register blocking: each k iteration loads 4 values and feeds 4 multiply-adds.
    // Note: 'register' is only a hint (deprecated since C++11, removed in C++17).
    void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
        register float a1, a2, b1, b2;
        for(int i=0; i<M_/2; i++){
            for(int j=0; j<N_/2; j++){
                for(int k=0; k<K_; k++){
                    a1 = A[2*i     + k*M_];       // A(2*i,   k)
                    a2 = A[(2*i+1) + k*M_];       // A(2*i+1, k)
                    b1 = B[k*N_ + 2*j];           // B(k, 2*j)
                    b2 = B[k*N_ + (2*j+1)];       // B(k, 2*j+1)
                    C[2*i     + 2*j*M_]     += a1*b1;
                    C[2*i     + (2*j+1)*M_] += a1*b2;
                    C[(2*i+1) + 2*j*M_]     += a2*b1;
                    C[(2*i+1) + (2*j+1)*M_] += a2*b2;
                    // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
            A[idx] = idx%mod + 1;
        }
    }

    void print_mat(float* A, int M, int N, bool colMajor){
        cout<<endl;
        for(int i=0; i<M; i++){
            for(int j=0; j<N; j++){
                cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
            }
            cout<<endl;
        }
    }

    int main(){
        int M = 1024;
        int N = 1024;
        int K = 1024;
        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;
        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));
        memset(C_1x1, 0, M*N*sizeof(float));   // added: zero the accumulators
        memset(C_2x2, 0, M*N*sizeof(float));
        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);
        // print_mat(A, M, K, true);
        // print_mat(B, K, N, false);
        // gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
        // print_mat(C_1x1, M, N, true);
        // print_mat(C_2x2, M, N, true);
        // cout<<"C(M,N) = "<<C_1x1[M*N-1]<<endl;
        cout<<"C(M,N) = "<<C_2x2[M*N-1]<<endl;
        return 0;
    }

    (m, n, k) = (1024, 1024, 1024)

    The speedup is clear: roughly 2 seconds versus 5 seconds. The gain comes from register reuse: in the 2x2 kernel every k iteration loads four values (a1, a2, b1, b2) and performs four multiply-adds, so the number of A/B loads per multiply-add drops from two to one compared with the 1x1 kernel.
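
    Before comparing timings it is worth checking that the two kernels agree. Below is a minimal sketch of an elementwise comparison; the function name and tolerance are illustrative, and it assumes both kernels were run into C_1x1 and C_2x2 in the same program, with M and N even as the 2x2 kernel requires:

    #include <cmath>

    // Illustrative check (not part of the original post): returns true if the two
    // result buffers agree elementwise within a small absolute tolerance.
    bool results_match(const float* C_1x1, const float* C_2x2, int M, int N, float tol = 1e-3f){
        for(int idx = 0; idx < M*N; idx++){
            if(std::fabs(C_1x1[idx] - C_2x2[idx]) > tol){
                std::cout << "mismatch at " << idx << ": "
                          << C_1x1[idx] << " vs " << C_2x2[idx] << std::endl;
                return false;
            }
        }
        return true;
    }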

    After adding OpenMP, the parallelization makes the effect even more pronounced:

    III. Optimizing the triple for-loop with OpenMP

    Change the test size to (m, n, k) = (1024, 1024, 4096).

    Adding the OpenMP directive to the two versions above gives the code below. Only the outermost i loop is parallelized: each thread works on distinct values of i and therefore writes disjoint elements of C, so no further synchronization is needed.

    Save as hello_gemm_1x1_omp.cpp

    Compile:

     g++ hello_gemm_1x1_omp.cpp -o hello_gemm_1x1_omp.out -O3 -fopenmp

    #include <iostream>
    #include <cstdlib>   // malloc
    #include <cstring>   // memset (added: the kernels accumulate into C with +=)
    #include <omp.h>
    using namespace std;

    // Storage convention: A (M x K) and C (M x N) are column-major, B (K x N) is row-major.
    void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
        // Parallelize the outermost loop: threads own distinct i, hence disjoint C elements.
        #pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_; i++){
            for(int j=0; j<N_; j++){
                for(int k=0; k<K_; k++){
                    C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
        // register float a1, a2, b1, b2;
        #pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_/2; i++){
            for(int j=0; j<N_/2; j++){
                for(int k=0; k<K_; k++){
                    // Declared inside the loop body so every thread has its own copies.
                    register float a1, a2, b1, b2;
                    a1 = A[2*i     + k*M_];       // A(2*i,   k)
                    a2 = A[(2*i+1) + k*M_];       // A(2*i+1, k)
                    b1 = B[k*N_ + 2*j];           // B(k, 2*j)
                    b2 = B[k*N_ + (2*j+1)];       // B(k, 2*j+1)
                    C[2*i     + 2*j*M_]     += a1*b1;
                    C[2*i     + (2*j+1)*M_] += a1*b2;
                    C[(2*i+1) + 2*j*M_]     += a2*b1;
                    C[(2*i+1) + (2*j+1)*M_] += a2*b2;
                    // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
            A[idx] = idx%mod + 1;
        }
    }

    void print_mat(float* A, int M, int N, bool colMajor){
        cout<<endl;
        for(int i=0; i<M; i++){
            for(int j=0; j<N; j++){
                cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
            }
            cout<<endl;
        }
    }

    int main(){
        int M = 1024;
        int N = 1024;
        int K = 4096;
        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;
        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));
        memset(C_1x1, 0, M*N*sizeof(float));   // added: zero the accumulators
        memset(C_2x2, 0, M*N*sizeof(float));
        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);
        // print_mat(A, M, K, true);
        // print_mat(B, K, N, false);
        gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        // gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
        // print_mat(C_1x1, M, N, true);
        // print_mat(C_2x2, M, N, true);
        cout<<"C(M,N) = "<<C_1x1[M*N-1]<<endl;
        // cout<<"C(M,N) = "<<C_2x2[M*N-1]<<endl;
        return 0;
    }

    IV. Adding OpenMP to the block2x2 register version

    Save as hello_gemm_2x2_omp.cpp

    Compile:

    g++ hello_gemm_2x2_omp.cpp -o hello_gemm_2x2_omp.out -O3 -fopenmp

    #include <iostream>
    #include <cstdlib>   // malloc
    #include <cstring>   // memset (added: the kernels accumulate into C with +=)
    #include <omp.h>
    using namespace std;

    // Storage convention: A (M x K) and C (M x N) are column-major, B (K x N) is row-major.
    void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
        // Parallelize the outermost loop: threads own distinct i, hence disjoint C elements.
        #pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_; i++){
            for(int j=0; j<N_; j++){
                for(int k=0; k<K_; k++){
                    C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
        // register float a1, a2, b1, b2;
        #pragma omp parallel for num_threads(omp_get_num_procs())
        for(int i=0; i<M_/2; i++){
            for(int j=0; j<N_/2; j++){
                for(int k=0; k<K_; k++){
                    // Declared inside the loop body so every thread has its own copies.
                    register float a1, a2, b1, b2;
                    a1 = A[2*i     + k*M_];       // A(2*i,   k)
                    a2 = A[(2*i+1) + k*M_];       // A(2*i+1, k)
                    b1 = B[k*N_ + 2*j];           // B(k, 2*j)
                    b2 = B[k*N_ + (2*j+1)];       // B(k, 2*j+1)
                    C[2*i     + 2*j*M_]     += a1*b1;
                    C[2*i     + (2*j+1)*M_] += a1*b2;
                    C[(2*i+1) + 2*j*M_]     += a2*b1;
                    C[(2*i+1) + (2*j+1)*M_] += a2*b2;
                    // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
                }
            }
        }
    }

    void init_mat(float* A, int count, int mod){
        for(int idx=0; idx<count; idx++){
            A[idx] = idx%mod + 1;
        }
    }

    void print_mat(float* A, int M, int N, bool colMajor){
        cout<<endl;
        for(int i=0; i<M; i++){
            for(int j=0; j<N; j++){
                cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
            }
            cout<<endl;
        }
    }

    int main(){
        int M = 1024;//2048;
        int N = 1024;//2048;
        int K = 4096;
        float* A = nullptr;
        float* B = nullptr;
        float* C_1x1 = nullptr;
        float* C_2x2 = nullptr;
        A = (float*)malloc(M*K*sizeof(float));
        B = (float*)malloc(K*N*sizeof(float));
        C_1x1 = (float*)malloc(M*N*sizeof(float));
        C_2x2 = (float*)malloc(M*N*sizeof(float));
        memset(C_1x1, 0, M*N*sizeof(float));   // added: zero the accumulators
        memset(C_2x2, 0, M*N*sizeof(float));
        init_mat(A, M*K, 3);
        init_mat(B, K*N, 4);
        // print_mat(A, M, K, true);
        // print_mat(B, K, N, false);
        // gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
        gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
        // print_mat(C_1x1, M, N, true);
        // print_mat(C_2x2, M, N, true);
        // cout<<"C(M,N) = "<<C_1x1[M*N-1]<<endl;
        cout<<"C(M,N) = "<<C_2x2[M*N-1]<<endl;
        return 0;
    }


    Timing summary:

    Triple for-loop:               43 s
    block2x2 register:             11 s
    OpenMP + triple for-loop:       2 s
    OpenMP + block2x2 register:     1 s

    Relative to the plain triple for-loop, that is roughly a 4x, 21x, and 43x speedup, respectively.
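
    For the OpenMP builds, omp_get_wtime() is a convenient way to reproduce numbers like these. A minimal sketch; the wrapper name is illustrative and it assumes the kernels from the listings above are in the same file:

    #include <omp.h>

    // Illustrative timing helper (not part of the original post): wall-clock time
    // of one call to the OpenMP block2x2 kernel.
    double time_gemm_2x2_omp(float* A, float* B, float* C, int M, int N, int K){
        double t0 = omp_get_wtime();
        gemm_A_N_B_T_2x2(A, B, C, M, N, K);
        double t1 = omp_get_wtime();
        std::cout << "openmp block2x2 register: " << (t1 - t0) << " s" << std::endl;
        return t1 - t0;
    }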

  • Original article: https://blog.csdn.net/eloudy/article/details/126698509