有一道面试题目:
将两个维度同为M行N列的二维数组,在嵌套循环中逐元素相加,有两种算法。
试问:上述两算法,哪种效率更高?
解析:虽然时间复杂度都是O(M*N),但第一种算法由于是顺序访问数组元素,因此缓存命中率更高。
下面进行一下验证。注意,为了加强效果,矩阵的循环相加计算都重复1000遍。
#include <Windows.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
unsigned int M = 900;
unsigned int N = 1000;
#define LOOPS 1000
time_t getCurrentTime()
{
system_clock::time_point time_point_now = system_clock::now(); // 获取当前时间点
system_clock::duration duration_since_epoch
= time_point_now.time_since_epoch(); // 从1970-01-01 00:00:00到当前时间点的时长
time_t microseconds_since_epoch
= duration_cast<microseconds>(duration_since_epoch).count(); // 将时长转换为微秒数
return microseconds_since_epoch;
}
void mat_init(long* a, long* b) {
long c;
int index;
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
index = i * M + N;
a[index] = j;
b[index] = i;
}
}
return;
}
void matadd_in_order(long* a, long* b) {
long c;
int index;
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
index = j + i * M;
c = a[index] + b[index];
}
}
return;
}
void matadd_not_in_order(long* a, long* b) {
long c;
int index;
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
index = i + j * M;
c = a[index] + b[index];
}
}
return;
}
int main()
{
time_t start, end, time_elapsed_in_order, time_elapsed_not_in_order;
for (int i = 100; i < 2000; i += 100) {
M = i;
N = 1000;
long* a = (long*)malloc(8 * M * N);
long* b = (long*)malloc(8 * M * N);
mat_init(a, b);
start = getCurrentTime();
for (int i = 0; i < LOOPS; i++) {
matadd_in_order(a, b);
}
end = getCurrentTime();
time_elapsed_in_order = end - start;
start = getCurrentTime();
for (int i = 0; i < LOOPS; i++) {
matadd_not_in_order(a, b);
}
end = getCurrentTime();
time_elapsed_not_in_order = end - start;
cout<< M * N<<","<< time_elapsed_in_order<<","<< time_elapsed_not_in_order<<endl;
free((void*)a);
free((void*)b);
}
}
以上循环输出的结果,可以输入到Python程序之中进行绘制。
绘制后的结果如下所示:

放大图如下:

可以看出,一开始,非顺序访存和顺序访存,在计算消耗时间上相差不大。但当数据大小从8001000变到9001000矩阵时,非顺序访问的时间消耗陡增。这是因为什么呢?
我的电脑使用的CPU型号为i5-8250U。