BLISlab中C矩阵初始化速度测试
假设我们想把C中的所有元素都设为0。
统一编译命令:
 | g++  xxx.cpp -O0 -std=c++11 -o xxx
 
  | 
版本一(外层循环遍历行 i、内层循环遍历列 j;C 按列主序存储,因此内层循环每次访问的步长为 ldc,内存访问不连续):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
   // Version 1: zero a column-major m x n matrix with the row index i in the
   // outer loop and the column index j in the inner loop, so consecutive inner
   // iterations touch addresses ldc doubles apart.
   #include <immintrin.h>  // _mm_malloc / _mm_free

   #include <chrono>
   #include <cstddef>
   #include <iostream>

   // Column-major element access: element (i, j) is stored at C[j * ldc + i].
   #define C( i, j )     C[ (j)*ldc + (i) ]

   int main()
   {
       int i, j, m, n;
       m = 20000;
       n = 20000;
       int ldc = m;

       // Widen to size_t before multiplying so the byte count cannot overflow
       // int if m and n are ever increased.
       const std::size_t bytes = static_cast<std::size_t>(m) * n * sizeof(double);

       // 32-byte aligned buffer; the request is large (about 3.2 GB for the
       // sizes above), so the allocation has to be checked.
       double* C = static_cast<double*>(_mm_malloc(bytes, 32));
       if (C == nullptr) {
           std::cerr << "_mm_malloc failed for " << bytes << " bytes\n";
           return 1;
       }

       // steady_clock is monotonic, which is what an interval measurement needs;
       // high_resolution_clock may alias the adjustable system clock.
       const auto tp1 = std::chrono::steady_clock::now();
       for (i = 0; i < m; i++) {
           for (j = 0; j < n; j++) {
               C(i, j) = 0.0;
           }
       }
       const auto tp2 = std::chrono::steady_clock::now();

       const auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1);

       // Report the same interval in four units (each cast truncates).
       std::cout << "1 被计时代码耗时:" << dur.count() << " 纳秒" << '\n';
       std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微秒" << '\n';
       std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << '\n';
       std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << '\n';

       // Memory obtained from _mm_malloc must be released with _mm_free.
       _mm_free(C);
       return 0;
   }
 
  | 
实测结果:
 | 1 被计时代码耗时:10776901341 纳秒 2 被计时代码耗时:10776901 微秒 3 被计时代码耗时:10776 毫秒 4 被计时代码耗时:10 秒钟
 
  | 
版本二(交换循环顺序:外层遍历列 j、内层遍历行 i,并用指针 cp 逐个顺序写入,内存访问连续):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
   // Version 2: zero a column-major m x n matrix one column at a time, walking
   // each column with a pointer so consecutive stores hit consecutive addresses.
   #include <immintrin.h>  // _mm_malloc / _mm_free

   #include <chrono>
   #include <cstddef>
   #include <iostream>

   int main()
   {
       int i, j, m, n;
       m = 20000;
       n = 20000;
       int ldc = m;  // leading dimension: distance in doubles between columns

       // Widen to size_t before multiplying so the byte count cannot overflow
       // int if m and n are ever increased.
       const std::size_t bytes = static_cast<std::size_t>(m) * n * sizeof(double);

       // 32-byte aligned buffer; the request is large (about 3.2 GB for the
       // sizes above), so the allocation has to be checked.
       double* C = static_cast<double*>(_mm_malloc(bytes, 32));
       if (C == nullptr) {
           std::cerr << "_mm_malloc failed for " << bytes << " bytes\n";
           return 1;
       }

       double* cp;

       // steady_clock is monotonic, which is what an interval measurement needs;
       // high_resolution_clock may alias the adjustable system clock.
       const auto tp1 = std::chrono::steady_clock::now();
       for (j = 0; j < n; j++) {
           cp = &C[j * ldc];  // start of column j
           for (i = 0; i < m; i++) {
               *cp++ = 0.0;
           }
       }
       const auto tp2 = std::chrono::steady_clock::now();

       const auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1);

       // Report the same interval in four units (each cast truncates).
       std::cout << "1 被计时代码耗时:" << dur.count() << " 纳秒" << '\n';
       std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微秒" << '\n';
       std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << '\n';
       std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << '\n';

       // Memory obtained from _mm_malloc must be released with _mm_free.
       _mm_free(C);
       return 0;
   }
 
  | 
实测结果:
 | 1 被计时代码耗时:3455968000 纳秒 2 被计时代码耗时:3455968 微秒 3 被计时代码耗时:3455 毫秒 4 被计时代码耗时:3 秒钟
 
  | 
版本三(在版本二的基础上把内层循环展开 4 次,每次迭代写入 4 个元素;要求 m 是 4 的倍数):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
   // Version 3: zero a column-major m x n matrix one column at a time with the
   // inner loop unrolled by 4 (four stores per iteration).
   #include <immintrin.h>  // _mm_malloc / _mm_free

   #include <chrono>
   #include <cstddef>
   #include <iostream>

   int main()
   {
       int i, j, m, n;
       m = 20000;
       n = 20000;
       int ldc = m;  // leading dimension: distance in doubles between columns

       // The unrolled loop has no remainder handling: with m not a multiple of 4
       // the last iteration of a column would write past its end.
       if (m % 4 != 0) {
           std::cerr << "m must be a multiple of 4\n";
           return 1;
       }

       // Widen to size_t before multiplying so the byte count cannot overflow
       // int if m and n are ever increased.
       const std::size_t bytes = static_cast<std::size_t>(m) * n * sizeof(double);

       // 32-byte aligned buffer; the request is large (about 3.2 GB for the
       // sizes above), so the allocation has to be checked.
       double* C = static_cast<double*>(_mm_malloc(bytes, 32));
       if (C == nullptr) {
           std::cerr << "_mm_malloc failed for " << bytes << " bytes\n";
           return 1;
       }

       double* cp;

       // steady_clock is monotonic, which is what an interval measurement needs;
       // high_resolution_clock may alias the adjustable system clock.
       const auto tp1 = std::chrono::steady_clock::now();
       for (j = 0; j < n; j++) {
           cp = &C[j * ldc];  // start of column j
           for (i = 0; i < m; i += 4) {
               *(cp + 0) = 0.0;
               *(cp + 1) = 0.0;
               *(cp + 2) = 0.0;
               *(cp + 3) = 0.0;
               cp += 4;
           }
       }
       const auto tp2 = std::chrono::steady_clock::now();

       const auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1);

       // Report the same interval in four units (each cast truncates).
       std::cout << "1 被计时代码耗时:" << dur.count() << " 纳秒" << '\n';
       std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微秒" << '\n';
       std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << '\n';
       std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << '\n';

       // Memory obtained from _mm_malloc must be released with _mm_free.
       _mm_free(C);
       return 0;
   }
 
  | 
实测结果:
 | 1 被计时代码耗时:2581347801 纳秒 2 被计时代码耗时:2581347 微秒 3 被计时代码耗时:2581 毫秒 4 被计时代码耗时:2 秒钟
 
  | 
版本四(在版本三的基础上,把要写入的 0.0 预先保存在 register 变量 c0~c3 中,再写入矩阵):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
   // Version 4: same 4-way unrolled column walk as version 3, but the stored
   // zeros come from four `register` variables instead of literals.
   #include <immintrin.h>  // _mm_malloc / _mm_free

   #include <chrono>
   #include <cstddef>
   #include <iostream>

   int main()
   {
       int i, j, m, n;
       m = 20000;
       n = 20000;
       int ldc = m;  // leading dimension: distance in doubles between columns

       // The unrolled loop has no remainder handling: with m not a multiple of 4
       // the last iteration of a column would write past its end.
       if (m % 4 != 0) {
           std::cerr << "m must be a multiple of 4\n";
           return 1;
       }

       // Widen to size_t before multiplying so the byte count cannot overflow
       // int if m and n are ever increased.
       const std::size_t bytes = static_cast<std::size_t>(m) * n * sizeof(double);

       // 32-byte aligned buffer; the request is large (about 3.2 GB for the
       // sizes above), so the allocation has to be checked.
       double* C = static_cast<double*>(_mm_malloc(bytes, 32));
       if (C == nullptr) {
           std::cerr << "_mm_malloc failed for " << bytes << " bytes\n";
           return 1;
       }

       double* cp;

       // steady_clock is monotonic, which is what an interval measurement needs;
       // high_resolution_clock may alias the adjustable system clock.
       const auto tp1 = std::chrono::steady_clock::now();

       // `register` is deprecated in C++11 and removed in C++17. It is kept on
       // purpose: this program is built with -std=c++11 -O0, and the keyword is
       // the thing being measured here.
       // NOTE(review): GCC documents that without optimization only variables
       // declared `register` are kept in registers; confirm for other compilers.
       register double c0 = 0.0, c1 = 0.0, c2 = 0.0, c3 = 0.0;
       for (j = 0; j < n; j++) {
           cp = &C[j * ldc];  // start of column j
           for (i = 0; i < m; i += 4) {
               *(cp + 0) = c0;
               *(cp + 1) = c1;
               *(cp + 2) = c2;
               *(cp + 3) = c3;
               cp += 4;
           }
       }
       const auto tp2 = std::chrono::steady_clock::now();

       const auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1);

       // Report the same interval in four units (each cast truncates).
       std::cout << "1 被计时代码耗时:" << dur.count() << " 纳秒" << '\n';
       std::cout << "2 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::microseconds>(dur).count() << " 微秒" << '\n';
       std::cout << "3 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::milliseconds>(dur).count() << " 毫秒" << '\n';
       std::cout << "4 被计时代码耗时:" << std::chrono::duration_cast<std::chrono::seconds>(dur).count() << " 秒钟" << '\n';

       // Memory obtained from _mm_malloc must be released with _mm_free.
       _mm_free(C);
       return 0;
   }
 
  | 
实测结果:
 | 1 被计时代码耗时:2269606428 纳秒 2 被计时代码耗时:2269606 微秒 3 被计时代码耗时:2269 毫秒 4 被计时代码耗时:2 秒钟
 
  | 
参考资料:
C++11 chrono 高精度计时方法 | GuKaifeng’s Blog