存储为 helloGemm_1x1.cpp
编译:
g++ hello_gemm_1x1.cpp -o hello_gemm_1x1.out -O3
- #include
- #include
-
-
- using namespace std;
-
- void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
-
- for(int i=0; i
- for(int j=0; j
- for(int k=0; k
- C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
- void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
-
- register float a1, a2, b1, b2;
-
- for(int i=0; i
2; i++){ - for(int j=0; j
2; j++){ - for(int k=0; k
- a1 = A[2*i + k*M_ ];//2*i k
- a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
- b1 = B[k*N_ + 2*j ];//k 2*j
- b2 = B[k*N_ + (2*j+1)];//k 2*j+1
-
- C[2*i + 2*j*M_ ] += a1*b1;
- C[2*i + (2*j+1)*M_] += a1*b2;
- C[(2*i+1) + 2*j*M_ ] += a2*b1;
- C[2*i+1 + (2*j+1)*M_] += a2*b2;
- // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
-
-
- void init_mat(float* A, int count, int mod){
- for(int idx=0; idx
- A[idx] = idx%mod + 1;
- }
- }
-
-
- void print_mat(float* A, int M, int N, bool colMajor){
-
- cout<
-
- for(int i=0; i
- for(int j=0; j
- cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
- }
- cout<
- }
- }
-
-
-
- int main(){
-
-
- int M = 1024;
- int N = 1024;
- int K = 1024;
-
- float* A = nullptr;
- float* B = nullptr;
- float* C_1x1 = nullptr;
- float* C_2x2 = nullptr;
-
- A = (float*)malloc(M*K*sizeof(float));
- B = (float*)malloc(K*N*sizeof(float));
- C_1x1 = (float*)malloc(M*N*sizeof(float));
- C_2x2 = (float*)malloc(M*N*sizeof(float));
-
- init_mat(A, M*K, 3);
- init_mat(B, K*N, 4);
-
- // print_mat(A, M, K, true);
- // print_mat(B, K, N, false);
-
- gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
- // gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
-
- //print_mat(C_1x1, M, N, true);
- //print_mat(C_2x2, M, N, true);
-
- cout<
"C(M,N)"<< C_1x1[M*N-1]< - // cout<
-
-
-
-
-
- return 0;
- }
二,block2x2 register 优化
存储为 helloGemm_2x2.cpp
编译:
g++ hello_gemm_2x2.cpp -o hello_gemm_2x2.out -O3
- #include
- #include
-
-
- using namespace std;
-
- void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
-
- for(int i=0; i
- for(int j=0; j
- for(int k=0; k
- C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
- void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
-
- register float a1, a2, b1, b2;
-
- for(int i=0; i
2; i++){ - for(int j=0; j
2; j++){ - for(int k=0; k
- a1 = A[2*i + k*M_ ];//2*i k
- a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
- b1 = B[k*N_ + 2*j ];//k 2*j
- b2 = B[k*N_ + (2*j+1)];//k 2*j+1
-
- C[2*i + 2*j*M_ ] += a1*b1;
- C[2*i + (2*j+1)*M_] += a1*b2;
- C[(2*i+1) + 2*j*M_ ] += a2*b1;
- C[2*i+1 + (2*j+1)*M_] += a2*b2;
- // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
-
-
- void init_mat(float* A, int count, int mod){
- for(int idx=0; idx
- A[idx] = idx%mod + 1;
- }
- }
-
-
- void print_mat(float* A, int M, int N, bool colMajor){
-
- cout<
-
- for(int i=0; i
- for(int j=0; j
- cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
- }
- cout<
- }
- }
-
-
-
- int main(){
-
-
- int M = 1024;
- int N = 1024;
- int K = 1024;
-
- float* A = nullptr;
- float* B = nullptr;
- float* C_1x1 = nullptr;
- float* C_2x2 = nullptr;
-
- A = (float*)malloc(M*K*sizeof(float));
- B = (float*)malloc(K*N*sizeof(float));
- C_1x1 = (float*)malloc(M*N*sizeof(float));
- C_2x2 = (float*)malloc(M*N*sizeof(float));
-
- init_mat(A, M*K, 3);
- init_mat(B, K*N, 4);
-
- // print_mat(A, M, K, true);
- // print_mat(B, K, N, false);
-
- // gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
- gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
-
- //print_mat(C_1x1, M, N, true);
- //print_mat(C_2x2, M, N, true);
-
- // cout<
- cout<
"C(M,N)"<< C_2x2[M*N-1]< -
-
-
-
-
- return 0;
- }
(m, n, k) = (1024, 1024, 1024)
加速效果明显, 效果图: 2 second VS 5 second

加入 openmp后,并行效果更加明显:
三,使用openmp对三重for循环优化
将测试 size 全部改成: ((m, n, k) = (1024, 1024, 4096))
在上面个的两个版本基础上,加入openmp加速指令后的代码为:
存储为 helloGemm_1x1_omp.cpp
编译:
g++ hello_gemm_1x1_omp.cpp -o hello_gemm_1x1_omp.out -O3 -fopenmp
- #include
- #include
- #include
-
- using namespace std;
-
- void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
-
- #pragma omp parallel for num_threads(omp_get_num_procs())
- for(int i=0; i
- for(int j=0; j
- for(int k=0; k
- C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
- void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
-
- // register float a1, a2, b1, b2;
- #pragma omp parallel for num_threads(omp_get_num_procs())
- for(int i=0; i
2; i++){ - for(int j=0; j
2; j++){ - for(int k=0; k
- register float a1, a2, b1, b2;
-
- a1 = A[2*i + k*M_ ];//2*i k
- a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
- b1 = B[k*N_ + 2*j ];//k 2*j
- b2 = B[k*N_ + (2*j+1)];//k 2*j+1
-
- C[2*i + 2*j*M_ ] += a1*b1;
- C[2*i + (2*j+1)*M_] += a1*b2;
- C[(2*i+1) + 2*j*M_ ] += a2*b1;
- C[2*i+1 + (2*j+1)*M_] += a2*b2;
- // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
-
-
- void init_mat(float* A, int count, int mod){
- for(int idx=0; idx
- A[idx] = idx%mod + 1;
- }
- }
-
-
- void print_mat(float* A, int M, int N, bool colMajor){
-
- cout<
-
- for(int i=0; i
- for(int j=0; j
- cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
- }
- cout<
- }
- }
-
-
-
- int main(){
-
-
- int M = 1024;
- int N = 1024;
- int K = 4096;
-
- float* A = nullptr;
- float* B = nullptr;
- float* C_1x1 = nullptr;
- float* C_2x2 = nullptr;
-
- A = (float*)malloc(M*K*sizeof(float));
- B = (float*)malloc(K*N*sizeof(float));
- C_1x1 = (float*)malloc(M*N*sizeof(float));
- C_2x2 = (float*)malloc(M*N*sizeof(float));
-
- init_mat(A, M*K, 3);
- init_mat(B, K*N, 4);
-
- // print_mat(A, M, K, true);
- // print_mat(B, K, N, false);
-
- gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
- // gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
-
- //print_mat(C_1x1, M, N, true);
- //print_mat(C_2x2, M, N, true);
-
- cout<
"C(M,N)"<< C_1x1[M*N-1]< - // cout<
-
-
-
-
-
- return 0;
- }
四,对block2x2 register 优化加入 openmp加速
存储为 helloGemm_2x2_omp.cpp
g++ hello_gemm_2x2_omp.cpp -o hello_gemm_2x2_omp.out -O3 -fopenmp
- #include
- #include
- #include
-
- using namespace std;
-
- void gemm_A_N_B_T_1x1(float* A, float* B, float* C, int M_, int N_, int K_){
-
- #pragma omp parallel for num_threads(omp_get_num_procs())
- for(int i=0; i
- for(int j=0; j
- for(int k=0; k
- C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
- void gemm_A_N_B_T_2x2(float* A, float* B, float* C, int M_, int N_, int K_){
-
- // register float a1, a2, b1, b2;
- #pragma omp parallel for num_threads(omp_get_num_procs())
- for(int i=0; i
2; i++){ - for(int j=0; j
2; j++){ - for(int k=0; k
- register float a1, a2, b1, b2;
-
- a1 = A[2*i + k*M_ ];//2*i k
- a2 = A[(2*i+1) + k*M_ ];//2*i+1 k
- b1 = B[k*N_ + 2*j ];//k 2*j
- b2 = B[k*N_ + (2*j+1)];//k 2*j+1
-
- C[2*i + 2*j*M_ ] += a1*b1;
- C[2*i + (2*j+1)*M_] += a1*b2;
- C[(2*i+1) + 2*j*M_ ] += a2*b1;
- C[2*i+1 + (2*j+1)*M_] += a2*b2;
- // C[i + j*M_] += A[i + k*M_]*B[k*N_ + j];
- }
- }
- }
-
- }
-
-
-
- void init_mat(float* A, int count, int mod){
- for(int idx=0; idx
- A[idx] = idx%mod + 1;
- }
- }
-
-
- void print_mat(float* A, int M, int N, bool colMajor){
-
- cout<
-
- for(int i=0; i
- for(int j=0; j
- cout<<" "<<(colMajor? A[i + j*M]: A[i*N + j]);
- }
- cout<
- }
- }
-
-
-
- int main(){
-
-
- int M = 1024;//2048;
- int N = 1024;//2048;
- int K = 4096;
-
- float* A = nullptr;
- float* B = nullptr;
- float* C_1x1 = nullptr;
- float* C_2x2 = nullptr;
-
- A = (float*)malloc(M*K*sizeof(float));
- B = (float*)malloc(K*N*sizeof(float));
- C_1x1 = (float*)malloc(M*N*sizeof(float));
- C_2x2 = (float*)malloc(M*N*sizeof(float));
-
- init_mat(A, M*K, 3);
- init_mat(B, K*N, 4);
-
- // print_mat(A, M, K, true);
- // print_mat(B, K, N, false);
-
- // gemm_A_N_B_T_1x1(A, B, C_1x1, M, N, K);
- gemm_A_N_B_T_2x2(A, B, C_2x2, M, N, K);
-
- //print_mat(C_1x1, M, N, true);
- //print_mat(C_2x2, M, N, true);
-
- // cout<
- cout<
"C(M,N)"<< C_2x2[M*N-1]< -
-
-
-
-
- return 0;
- }
编译图:

效果图:

优化效果统计:
三重for循环: 43 s
block2x2 register: 11 s
openmp 三重for循环: 2 s
openmp block2x2 register: 1s
-
相关阅读:
智汀家庭云究竟有什么魔力?实现跨品牌联动
[阶段4 企业开发进阶] 3. 消息队列--RabbitMQ
yolov5 tensorrt 精度对齐总结
Transformer为什么如此有效 | 通用建模能力,并行
chmod的用法,及几个权限的宏
网络安全漏洞分析之远程代码执行
23种设计模式之单例模式
CDC实时数据同步
将文字转为UTF-8编码(即HTML可识别的)
基于MetaTown构建数字资产平台
-
原文地址:https://blog.csdn.net/eloudy/article/details/126698509