BLISlab中C矩阵初始化速度测试

BLISlab中C矩阵初始化速度测试

假设我们想把C中的所有元素都设为0。

统一编译命令:

1
g++  xxx.cpp -O0 -std=c++11 -o xxx

版本一:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#include<bits/stdc++.h>
using namespace std;
#define C( i, j ) C[ (j)*ldc + (i) ]
/**
 * Version 1 benchmark: zero an m x n matrix of doubles with the row index in
 * the outer loop, then print the elapsed time in four units.
 *
 * The matrix is addressed through the file's C(i,j) macro, which expands to
 * C[(j)*ldc + (i)]; with j in the inner loop, consecutive stores are ldc
 * doubles apart. The loop order is intentionally left as-is because it is the
 * thing being measured.
 *
 * @return 0 on success, 1 if the matrix buffer could not be allocated.
 */
int main()
{
    int i, j, m, n;
    m = 20000;
    n = 20000;
    int ldc = m;  // leading dimension used by the C(i,j) macro

    // Do the size arithmetic in size_t so larger m/n cannot overflow int.
    const size_t bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(double);
    double* C = static_cast<double*>(_mm_malloc(bytes, 32));
    if (C == nullptr) {
        // _mm_malloc returns a null pointer on failure; writing through it would crash.
        std::cerr << "内存分配失败" << std::endl;
        return 1;
    }

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments.
    const std::chrono::steady_clock::time_point tp1 = std::chrono::steady_clock::now();
    for( i = 0; i < m; i++){
        for( j = 0; j < n; j++){
            C(i,j) = 0.0;
        }
    }
    const std::chrono::steady_clock::time_point tp2 = std::chrono::steady_clock::now();

    // Same duration type as before so the printed values keep their format.
    std::chrono::duration<size_t, std::nano> dur = tp2 - tp1;
    std::cout << "1 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::nanoseconds>(dur).count() << " 纳秒" << std::endl;
    std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微妙" << std::endl;
    std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << std::endl;
    std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << std::endl;

    // Memory from _mm_malloc must be released with _mm_free (outside the timed region).
    _mm_free(C);
    return 0;
}

实测结果:

1
2
3
4
1 被计时代码耗时:10776901341 纳秒
2 被计时代码耗时:10776901 微妙
3 被计时代码耗时:10776 毫秒
4 被计时代码耗时:10 秒钟

版本二:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#include<bits/stdc++.h>
using namespace std;
#define C( i, j ) C[ (j)*ldc + (i) ]
/**
 * Version 2 benchmark: zero an m x n matrix of doubles one column at a time by
 * walking a pointer down each column, then print the elapsed time in four
 * units.
 *
 * Column j starts at C[j*ldc], so the inner loop writes m consecutive doubles.
 * The timed loop is intentionally left as-is because it is the thing being
 * measured.
 *
 * @return 0 on success, 1 if the matrix buffer could not be allocated.
 */
int main()
{
    int i, j, m, n;
    m = 20000;
    n = 20000;
    int ldc = m;  // distance, in elements, between the starts of adjacent columns

    // Do the size arithmetic in size_t so larger m/n cannot overflow int.
    const size_t bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(double);
    double* C = static_cast<double*>(_mm_malloc(bytes, 32));
    if (C == nullptr) {
        // _mm_malloc returns a null pointer on failure; writing through it would crash.
        std::cerr << "内存分配失败" << std::endl;
        return 1;
    }
    double *cp;

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments.
    const std::chrono::steady_clock::time_point tp1 = std::chrono::steady_clock::now();
    for( j = 0; j < n; j++){
        cp = &C[ j*ldc ]; // point cp to the top of the jth column
        for( i = 0; i < m; i++){
            *cp++ = 0.0; // zero the element cp points to, then advance the pointer
        }
    }
    const std::chrono::steady_clock::time_point tp2 = std::chrono::steady_clock::now();

    // Same duration type as before so the printed values keep their format.
    std::chrono::duration<size_t, std::nano> dur = tp2 - tp1;
    std::cout << "1 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::nanoseconds>(dur).count() << " 纳秒" << std::endl;
    std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微妙" << std::endl;
    std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << std::endl;
    std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << std::endl;

    // Memory from _mm_malloc must be released with _mm_free (outside the timed region).
    _mm_free(C);
    return 0;
}

实测结果:

1
2
3
4
1 被计时代码耗时:3455968000 纳秒
2 被计时代码耗时:3455968 微妙
3 被计时代码耗时:3455 毫秒
4 被计时代码耗时:3 秒钟

版本三:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#include<bits/stdc++.h>
using namespace std;
#define C( i, j ) C[ (j)*ldc + (i) ]
/**
 * Version 3 benchmark: zero an m x n matrix of doubles one column at a time
 * with the inner loop unrolled by four, then print the elapsed time in four
 * units.
 *
 * The unrolled loop writes four elements per iteration and has no remainder
 * handling, so m must be a multiple of 4 (it is, with m = 20000); otherwise the
 * last iteration of a column would write past the column's end. The timed loop
 * is intentionally left as-is because it is the thing being measured.
 *
 * @return 0 on success, 1 if the matrix buffer could not be allocated.
 */
int main()
{
    int i, j, m, n;
    m = 20000;
    n = 20000;
    int ldc = m;  // distance, in elements, between the starts of adjacent columns

    // Do the size arithmetic in size_t so larger m/n cannot overflow int.
    const size_t bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(double);
    double* C = static_cast<double*>(_mm_malloc(bytes, 32));
    if (C == nullptr) {
        // _mm_malloc returns a null pointer on failure; writing through it would crash.
        std::cerr << "内存分配失败" << std::endl;
        return 1;
    }
    double *cp;

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments.
    const std::chrono::steady_clock::time_point tp1 = std::chrono::steady_clock::now();
    for( j = 0; j < n; j++){
        cp = &C[ j * ldc ];  // top of column j
        for( i = 0; i < m; i+=4){
            // Four stores per iteration: fewer loop-counter updates and branches.
            *(cp+0) = 0.0;
            *(cp+1) = 0.0;
            *(cp+2) = 0.0;
            *(cp+3) = 0.0;
            cp+=4;
        }
    }
    const std::chrono::steady_clock::time_point tp2 = std::chrono::steady_clock::now();

    // Same duration type as before so the printed values keep their format.
    std::chrono::duration<size_t, std::nano> dur = tp2 - tp1;
    std::cout << "1 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::nanoseconds>(dur).count() << " 纳秒" << std::endl;
    std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微妙" << std::endl;
    std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << std::endl;
    std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << std::endl;

    // Memory from _mm_malloc must be released with _mm_free (outside the timed region).
    _mm_free(C);
    return 0;
}

实测结果:

1
2
3
4
1 被计时代码耗时:2581347801 纳秒
2 被计时代码耗时:2581347 微妙
3 被计时代码耗时:2581 毫秒
4 被计时代码耗时:2 秒钟

版本四:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include<bits/stdc++.h>
using namespace std;
#define C( i, j ) C[ (j)*ldc + (i) ]
/**
 * Version 4 benchmark: same four-way unrolled column walk as version 3, but the
 * stored zeros come from four `register`-qualified locals instead of literals.
 * Prints the elapsed time in four units.
 *
 * The unrolled loop has no remainder handling, so m must be a multiple of 4
 * (it is, with m = 20000). The timed region, including the declaration of
 * c0..c3, is intentionally left as-is because it is the thing being measured.
 *
 * @return 0 on success, 1 if the matrix buffer could not be allocated.
 */
int main()
{
    int i, j, m, n;
    m = 20000;
    n = 20000;
    int ldc = m;  // distance, in elements, between the starts of adjacent columns

    // Do the size arithmetic in size_t so larger m/n cannot overflow int.
    const size_t bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(double);
    double* C = static_cast<double*>(_mm_malloc(bytes, 32));
    if (C == nullptr) {
        // _mm_malloc returns a null pointer on failure; writing through it would crash.
        std::cerr << "内存分配失败" << std::endl;
        return 1;
    }
    double *cp;

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments.
    const std::chrono::steady_clock::time_point tp1 = std::chrono::steady_clock::now();
    // `register` is deprecated in C++11 and removed in C++17, so this program
    // has to be built with -std=c++11 or -std=c++14. It is kept on purpose:
    // the keyword is what this version of the benchmark is exercising.
    register double c0 =0.0,c1=0.0,c2=0.0,c3=0.0;
    for( j = 0; j < n; j++){
        cp = &C[ j * ldc ];  // top of column j
        for( i = 0; i < m; i+=4){
            *(cp+0) = c0;
            *(cp+1) = c1;
            *(cp+2) = c2;
            *(cp+3) = c3;
            cp+=4;
        }
    }
    const std::chrono::steady_clock::time_point tp2 = std::chrono::steady_clock::now();

    // Same duration type as before so the printed values keep their format.
    std::chrono::duration<size_t, std::nano> dur = tp2 - tp1;
    std::cout << "1 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::nanoseconds>(dur).count() << " 纳秒" << std::endl;
    std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微妙" << std::endl;
    std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << std::endl;
    std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << std::endl;

    // Memory from _mm_malloc must be released with _mm_free (outside the timed region).
    _mm_free(C);
    return 0;
}

实测结果:

1
2
3
4
1 被计时代码耗时:2269606428 纳秒
2 被计时代码耗时:2269606 微妙
3 被计时代码耗时:2269 毫秒
4 被计时代码耗时:2 秒钟

参考资料:

C++11 chrono 高精度计时方法 | GuKaifeng’s Blog


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!